From 26e08a6da54c575c69a113ab0c41c2890c0d613d Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 4 Dec 2020 21:02:42 +0100
Subject: dma-buf: Fix kerneldoc formatting

I wanted to look up something and noticed the hyperlink doesn't work.
While fixing that also noticed a trivial kerneldoc comment typo in the
same section, fix that too.

Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Reviewed-by: Simon Ser <contact@emersion.fr>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20201204200242.2671481-1-daniel.vetter@ffwll.ch
---
 include/linux/dma-buf-map.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h
index 583a3a1f9447..278d489e4bdd 100644
--- a/include/linux/dma-buf-map.h
+++ b/include/linux/dma-buf-map.h
@@ -122,7 +122,7 @@ struct dma_buf_map {
 
 /**
  * DMA_BUF_MAP_INIT_VADDR - Initializes struct dma_buf_map to an address in system memory
- * @vaddr:	A system-memory address
+ * @vaddr_:	A system-memory address
  */
 #define DMA_BUF_MAP_INIT_VADDR(vaddr_) \
 	{ \
-- 
cgit v1.2.3


From de9114ece5dfcc422164a24a687785843ed92c47 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 11 Dec 2020 16:58:40 +0100
Subject: dma-buf: Remove kmap kerneldoc vestiges
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Also try to clarify a bit when dma_buf_begin/end_cpu_access should
be called.

Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: "Christian König" <christian.koenig@amd.com>
Cc: linux-media@vger.kernel.org
Cc: linaro-mm-sig@lists.linaro.org
Link: https://patchwork.freedesktop.org/patch/msgid/20201211155843.3348718-1-daniel.vetter@ffwll.ch
---
 drivers/dma-buf/dma-buf.c | 20 ++++++++++++++------
 include/linux/dma-buf.h   | 25 +++++++++----------------
 2 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index e63684d4cd90..a12fdffa130f 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -1001,15 +1001,15 @@ EXPORT_SYMBOL_GPL(dma_buf_move_notify);
  *   vmalloc space might be limited and result in vmap calls failing.
  *
  *   Interfaces::
+ *
  *      void \*dma_buf_vmap(struct dma_buf \*dmabuf)
  *      void dma_buf_vunmap(struct dma_buf \*dmabuf, void \*vaddr)
  *
  *   The vmap call can fail if there is no vmap support in the exporter, or if
- *   it runs out of vmalloc space. Fallback to kmap should be implemented. Note
- *   that the dma-buf layer keeps a reference count for all vmap access and
- *   calls down into the exporter's vmap function only when no vmapping exists,
- *   and only unmaps it once. Protection against concurrent vmap/vunmap calls is
- *   provided by taking the dma_buf->lock mutex.
+ *   it runs out of vmalloc space. Note that the dma-buf layer keeps a reference
+ *   count for all vmap access and calls down into the exporter's vmap function
+ *   only when no vmapping exists, and only unmaps it once. Protection against
+ *   concurrent vmap/vunmap calls is provided by taking the &dma_buf.lock mutex.
  *
  * - For full compatibility on the importer side with existing userspace
  *   interfaces, which might already support mmap'ing buffers. This is needed in
@@ -1098,6 +1098,11 @@ static int __dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
  * dma_buf_end_cpu_access(). Only when cpu access is braketed by both calls is
  * it guaranteed to be coherent with other DMA access.
  *
+ * This function will also wait for any DMA transactions tracked through
+ * implicit synchronization in &dma_buf.resv. For DMA transactions with explicit
+ * synchronization this function will only ensure cache coherency, callers must
+ * ensure synchronization with such DMA transactions on their own.
+ *
  * Can return negative error values, returns 0 on success.
  */
 int dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
@@ -1199,7 +1204,10 @@ EXPORT_SYMBOL_GPL(dma_buf_mmap);
  * This call may fail due to lack of virtual mapping address space.
  * These calls are optional in drivers. The intended use for them
  * is for mapping objects linear in kernel space for high use objects.
- * Please attempt to use kmap/kunmap before thinking about these interfaces.
+ *
+ * To ensure coherency users must call dma_buf_begin_cpu_access() and
+ * dma_buf_end_cpu_access() around any cpu access performed through this
+ * mapping.
  *
  * Returns 0 on success, or a negative errno code otherwise.
  */
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index cf72699cb2bc..7eca37c8b10c 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -183,24 +183,19 @@ struct dma_buf_ops {
 	 * @begin_cpu_access:
 	 *
 	 * This is called from dma_buf_begin_cpu_access() and allows the
-	 * exporter to ensure that the memory is actually available for cpu
-	 * access - the exporter might need to allocate or swap-in and pin the
-	 * backing storage. The exporter also needs to ensure that cpu access is
-	 * coherent for the access direction. The direction can be used by the
-	 * exporter to optimize the cache flushing, i.e. access with a different
+	 * exporter to ensure that the memory is actually coherent for cpu
+	 * access. The exporter also needs to ensure that cpu access is coherent
+	 * for the access direction. The direction can be used by the exporter
+	 * to optimize the cache flushing, i.e. access with a different
 	 * direction (read instead of write) might return stale or even bogus
 	 * data (e.g. when the exporter needs to copy the data to temporary
 	 * storage).
 	 *
-	 * This callback is optional.
+	 * Note that this is both called through the DMA_BUF_IOCTL_SYNC IOCTL
+	 * command for userspace mappings established through @mmap, and also
+	 * for kernel mappings established with @vmap.
 	 *
-	 * FIXME: This is both called through the DMA_BUF_IOCTL_SYNC command
-	 * from userspace (where storage shouldn't be pinned to avoid handing
-	 * de-factor mlock rights to userspace) and for the kernel-internal
-	 * users of the various kmap interfaces, where the backing storage must
-	 * be pinned to guarantee that the atomic kmap calls can succeed. Since
-	 * there's no in-kernel users of the kmap interfaces yet this isn't a
-	 * real problem.
+	 * This callback is optional.
 	 *
 	 * Returns:
 	 *
@@ -216,9 +211,7 @@ struct dma_buf_ops {
 	 *
 	 * This is called from dma_buf_end_cpu_access() when the importer is
 	 * done accessing the CPU. The exporter can use this to flush caches and
-	 * unpin any resources pinned in @begin_cpu_access.
-	 * The result of any dma_buf kmap calls after end_cpu_access is
-	 * undefined.
+	 * undo anything else done in @begin_cpu_access.
 	 *
 	 * This callback is optional.
 	 *
-- 
cgit v1.2.3


From 85804b70cca68d77bca419b493ab843bf710de68 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 11 Dec 2020 16:58:41 +0100
Subject: dma-buf: some kerneldoc formatting fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Noticed while reviewing the output. Adds a bunch more links and fixes
the function interface quoting.

Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: "Christian König" <christian.koenig@amd.com>
Cc: linux-media@vger.kernel.org
Cc: linaro-mm-sig@lists.linaro.org
Link: https://patchwork.freedesktop.org/patch/msgid/20201211155843.3348718-2-daniel.vetter@ffwll.ch
---
 drivers/dma-buf/dma-buf.c | 31 ++++++++++++++++++-------------
 include/linux/dma-buf.h   |  6 +++---
 2 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index a12fdffa130f..e1fa6c6f02c4 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -480,7 +480,7 @@ err_alloc_file:
  *
  * 4. Once a driver is done with a shared buffer it needs to call
  *    dma_buf_detach() (after cleaning up any mappings) and then release the
- *    reference acquired with dma_buf_get by calling dma_buf_put().
+ *    reference acquired with dma_buf_get() by calling dma_buf_put().
  *
  * For the detailed semantics exporters are expected to implement see
  * &dma_buf_ops.
@@ -496,9 +496,10 @@ err_alloc_file:
  *			by the exporter. see &struct dma_buf_export_info
  *			for further details.
  *
- * Returns, on success, a newly created dma_buf object, which wraps the
- * supplied private data and operations for dma_buf_ops. On either missing
- * ops, or error in allocating struct dma_buf, will return negative error.
+ * Returns, on success, a newly created struct dma_buf object, which wraps the
+ * supplied private data and operations for struct dma_buf_ops. On either
+ * missing ops, or error in allocating struct dma_buf, will return negative
+ * error.
  *
  * For most cases the easiest way to create @exp_info is through the
  * %DEFINE_DMA_BUF_EXPORT_INFO macro.
@@ -584,7 +585,7 @@ err_module:
 EXPORT_SYMBOL_GPL(dma_buf_export);
 
 /**
- * dma_buf_fd - returns a file descriptor for the given dma_buf
+ * dma_buf_fd - returns a file descriptor for the given struct dma_buf
  * @dmabuf:	[in]	pointer to dma_buf for which fd is required.
  * @flags:      [in]    flags to give to fd
  *
@@ -608,10 +609,10 @@ int dma_buf_fd(struct dma_buf *dmabuf, int flags)
 EXPORT_SYMBOL_GPL(dma_buf_fd);
 
 /**
- * dma_buf_get - returns the dma_buf structure related to an fd
- * @fd:	[in]	fd associated with the dma_buf to be returned
+ * dma_buf_get - returns the struct dma_buf related to an fd
+ * @fd:	[in]	fd associated with the struct dma_buf to be returned
  *
- * On success, returns the dma_buf structure associated with an fd; uses
+ * On success, returns the struct dma_buf associated with an fd; uses
  * file's refcounting done by fget to increase refcount. returns ERR_PTR
  * otherwise.
  */
@@ -653,8 +654,7 @@ void dma_buf_put(struct dma_buf *dmabuf)
 EXPORT_SYMBOL_GPL(dma_buf_put);
 
 /**
- * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list; optionally,
- * calls attach() of dma_buf_ops to allow device-specific attach functionality
+ * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list
  * @dmabuf:		[in]	buffer to attach device to.
  * @dev:		[in]	device to be attached.
  * @importer_ops:	[in]	importer operations for the attachment
@@ -663,6 +663,9 @@ EXPORT_SYMBOL_GPL(dma_buf_put);
  * Returns struct dma_buf_attachment pointer for this attachment. Attachments
  * must be cleaned up by calling dma_buf_detach().
  *
+ * Optionally this calls &dma_buf_ops.attach to allow device-specific attach
+ * functionality.
+ *
  * Returns:
  *
  * A pointer to newly created &dma_buf_attachment on success, or a negative
@@ -769,12 +772,13 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf,
 EXPORT_SYMBOL_GPL(dma_buf_attach);
 
 /**
- * dma_buf_detach - Remove the given attachment from dmabuf's attachments list;
- * optionally calls detach() of dma_buf_ops for device-specific detach
+ * dma_buf_detach - Remove the given attachment from dmabuf's attachments list
  * @dmabuf:	[in]	buffer to detach from.
  * @attach:	[in]	attachment to be detached; is free'd after this call.
  *
  * Clean up a device attachment obtained by calling dma_buf_attach().
+ *
+ * Optionally this calls &dma_buf_ops.detach for device-specific detach.
  */
 void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach)
 {
@@ -1061,11 +1065,12 @@ EXPORT_SYMBOL_GPL(dma_buf_move_notify);
  *   shootdowns would increase the complexity quite a bit.
  *
  *   Interface::
+ *
  *      int dma_buf_mmap(struct dma_buf \*, struct vm_area_struct \*,
  *		       unsigned long);
  *
  *   If the importing subsystem simply provides a special-purpose mmap call to
- *   set up a mapping in userspace, calling do_mmap with dma_buf->file will
+ *   set up a mapping in userspace, calling do_mmap with &dma_buf.file will
  *   equally achieve that for a dma-buf object.
  */
 
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 7eca37c8b10c..43802a31b25d 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -85,10 +85,10 @@ struct dma_buf_ops {
 	/**
 	 * @pin:
 	 *
-	 * This is called by dma_buf_pin and lets the exporter know that the
+	 * This is called by dma_buf_pin() and lets the exporter know that the
 	 * DMA-buf can't be moved any more.
 	 *
-	 * This is called with the dmabuf->resv object locked and is mutual
+	 * This is called with the &dmabuf.resv object locked and is mutual
 	 * exclusive with @cache_sgt_mapping.
 	 *
 	 * This callback is optional and should only be used in limited use
@@ -103,7 +103,7 @@ struct dma_buf_ops {
 	/**
 	 * @unpin:
 	 *
-	 * This is called by dma_buf_unpin and lets the exporter know that the
+	 * This is called by dma_buf_unpin() and lets the exporter know that the
 	 * DMA-buf can be moved again.
 	 *
 	 * This is called with the dmabuf->resv object locked and is mutual
-- 
cgit v1.2.3


From c545781e1c55ab680dcc49c37212d5327b9d6812 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 11 Dec 2020 16:58:43 +0100
Subject: dma-buf: doc polish for pin/unpin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Motivated by a discussion with Christian and Thomas: Try to untangle a
bit what's relevant for importers and what's relevant for exporters.

Also add an assert that really only dynamic importers use the api
function, anything else doesn't make sense.

Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: "Christian König" <christian.koenig@amd.com>
Cc: linux-media@vger.kernel.org
Cc: linaro-mm-sig@lists.linaro.org
Link: https://patchwork.freedesktop.org/patch/msgid/20201211155843.3348718-4-daniel.vetter@ffwll.ch
---
 drivers/dma-buf/dma-buf.c | 19 ++++++++++++++++---
 include/linux/dma-buf.h   |  8 +++++---
 2 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index a0a02ef888da..b8465243eca2 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -809,9 +809,15 @@ EXPORT_SYMBOL_GPL(dma_buf_detach);
 
 /**
  * dma_buf_pin - Lock down the DMA-buf
- *
  * @attach:	[in]	attachment which should be pinned
  *
+ * Only dynamic importers (who set up @attach with dma_buf_dynamic_attach()) may
+ * call this, and only for limited use cases like scanout and not for temporary
+ * pin operations. It is not permitted to allow userspace to pin arbitrary
+ * amounts of buffers through this interface.
+ *
+ * Buffers must be unpinned by calling dma_buf_unpin().
+ *
  * Returns:
  * 0 on success, negative error code on failure.
  */
@@ -820,6 +826,8 @@ int dma_buf_pin(struct dma_buf_attachment *attach)
 	struct dma_buf *dmabuf = attach->dmabuf;
 	int ret = 0;
 
+	WARN_ON(!dma_buf_attachment_is_dynamic(attach));
+
 	dma_resv_assert_held(dmabuf->resv);
 
 	if (dmabuf->ops->pin)
@@ -830,14 +838,19 @@ int dma_buf_pin(struct dma_buf_attachment *attach)
 EXPORT_SYMBOL_GPL(dma_buf_pin);
 
 /**
- * dma_buf_unpin - Remove lock from DMA-buf
- *
+ * dma_buf_unpin - Unpin a DMA-buf
  * @attach:	[in]	attachment which should be unpinned
+ *
+ * This unpins a buffer pinned by dma_buf_pin() and allows the exporter to move
+ * any mapping of @attach again and inform the importer through
+ * &dma_buf_attach_ops.move_notify.
  */
 void dma_buf_unpin(struct dma_buf_attachment *attach)
 {
 	struct dma_buf *dmabuf = attach->dmabuf;
 
+	WARN_ON(!dma_buf_attachment_is_dynamic(attach));
+
 	dma_resv_assert_held(dmabuf->resv);
 
 	if (dmabuf->ops->unpin)
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 43802a31b25d..628681bf6c99 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -86,13 +86,15 @@ struct dma_buf_ops {
 	 * @pin:
 	 *
 	 * This is called by dma_buf_pin() and lets the exporter know that the
-	 * DMA-buf can't be moved any more.
+	 * DMA-buf can't be moved any more. The exporter should pin the buffer
+	 * into system memory to make sure it is generally accessible by other
+	 * devices.
 	 *
 	 * This is called with the &dmabuf.resv object locked and is mutual
 	 * exclusive with @cache_sgt_mapping.
 	 *
-	 * This callback is optional and should only be used in limited use
-	 * cases like scanout and not for temporary pin operations.
+	 * This is called automatically for non-dynamic importers from
+	 * dma_buf_attach().
 	 *
 	 * Returns:
 	 *
-- 
cgit v1.2.3


From 2ee5f8f05949735fa2f4c463a5e13fcb3660c719 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 8 Dec 2020 17:41:42 +0100
Subject: units: Add Watt units

As there are the temperature units, let's add the Watt macros definition.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/units.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/units.h b/include/linux/units.h
index aaf716364ec3..92c234e71cab 100644
--- a/include/linux/units.h
+++ b/include/linux/units.h
@@ -4,6 +4,10 @@
 
 #include <linux/kernel.h>
 
+#define MILLIWATT_PER_WATT	1000L
+#define MICROWATT_PER_MILLIWATT	1000L
+#define MICROWATT_PER_WATT	1000000L
+
 #define ABSOLUTE_ZERO_MILLICELSIUS -273150
 
 static inline long milli_kelvin_to_millicelsius(long t)
-- 
cgit v1.2.3


From a20d0ef97abf486a917aff066c457bdb930425af Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 8 Dec 2020 17:41:44 +0100
Subject: powercap/drivers/dtpm: Add API for dynamic thermal power management

On the embedded world, the complexity of the SoC leads to an
increasing number of hotspots which need to be monitored and mitigated
as a whole in order to prevent the temperature to go above the
normative and legally stated 'skin temperature'.

Another aspect is to sustain the performance for a given power budget,
for example virtual reality where the user can feel dizziness if the
GPU performance is capped while a big CPU is processing something
else. Or reduce the battery charging because the dissipated power is
too high compared with the power consumed by other devices.

The userspace is the most adequate place to dynamically act on the
different devices by limiting their power given an application
profile: it has the knowledge of the platform.

These userspace daemons are in charge of the Dynamic Thermal Power
Management (DTPM).

Nowadays, the dtpm daemons are abusing the thermal framework as they
act on the cooling device state to force a specific and arbitrary
state without taking care of the governor decisions. Given the closed
loop of some governors that can confuse the logic or directly enter in
a decision conflict.

As the number of cooling device support is limited today to the CPU
and the GPU, the dtpm daemons have little control on the power
dissipation of the system. The out of tree solutions are hacking
around here and there in the drivers, in the frameworks to have
control on the devices. The common solution is to declare them as
cooling devices.

There is no unification of the power limitation unit, opaque states
are used.

This patch provides a way to create a hierarchy of constraints using
the powercap framework. The devices which are registered as power
limit-able devices are represented in this hierarchy as a tree. They
are linked together with intermediate nodes which are just there to
propagate the constraint to the children.

The leaves of the tree are the real devices, the intermediate nodes
are virtual, aggregating the children constraints and power
characteristics.

Each node have a weight on a 2^10 basis, in order to reflect the
percentage of power distribution of the children's node. This
percentage is used to dispatch the power limit to the children.

The weight is computed against the max power of the siblings.

This simple approach allows to do a fair distribution of the power
limit.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/Kconfig          |   6 +
 drivers/powercap/Makefile         |   1 +
 drivers/powercap/dtpm.c           | 473 ++++++++++++++++++++++++++++++++++++++
 include/asm-generic/vmlinux.lds.h |  11 +
 include/linux/dtpm.h              |  75 ++++++
 5 files changed, 566 insertions(+)
 create mode 100644 drivers/powercap/dtpm.c
 create mode 100644 include/linux/dtpm.h

(limited to 'include/linux')

diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig
index bc228725346b..cc1953bd8bed 100644
--- a/drivers/powercap/Kconfig
+++ b/drivers/powercap/Kconfig
@@ -43,4 +43,10 @@ config IDLE_INJECT
 	  CPUs for power capping. Idle period can be injected
 	  synchronously on a set of specified CPUs or alternatively
 	  on a per CPU basis.
+
+config DTPM
+	bool "Power capping for Dynamic Thermal Power Management"
+	help
+	  This enables support for the power capping for the dynamic
+	  thermal power management userspace engine.
 endif
diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile
index 7255c94ec61c..6482ac52054d 100644
--- a/drivers/powercap/Makefile
+++ b/drivers/powercap/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_DTPM) += dtpm.o
 obj-$(CONFIG_POWERCAP)	+= powercap_sys.o
 obj-$(CONFIG_INTEL_RAPL_CORE) += intel_rapl_common.o
 obj-$(CONFIG_INTEL_RAPL) += intel_rapl_msr.o
diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c
new file mode 100644
index 000000000000..5b6857e9b064
--- /dev/null
+++ b/drivers/powercap/dtpm.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * The powercap based Dynamic Thermal Power Management framework
+ * provides to the userspace a consistent API to set the power limit
+ * on some devices.
+ *
+ * DTPM defines the functions to create a tree of constraints. Each
+ * parent node is a virtual description of the aggregation of the
+ * children. It propagates the constraints set at its level to its
+ * children and collect the children power information. The leaves of
+ * the tree are the real devices which have the ability to get their
+ * current power consumption and set their power limit.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/dtpm.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/powercap.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+
+#define DTPM_POWER_LIMIT_FLAG BIT(0)
+
+static const char *constraint_name[] = {
+	"Instantaneous",
+};
+
+static DEFINE_MUTEX(dtpm_lock);
+static struct powercap_control_type *pct;
+static struct dtpm *root;
+
+static int get_time_window_us(struct powercap_zone *pcz, int cid, u64 *window)
+{
+	return -ENOSYS;
+}
+
+static int set_time_window_us(struct powercap_zone *pcz, int cid, u64 window)
+{
+	return -ENOSYS;
+}
+
+static int get_max_power_range_uw(struct powercap_zone *pcz, u64 *max_power_uw)
+{
+	struct dtpm *dtpm = to_dtpm(pcz);
+
+	mutex_lock(&dtpm_lock);
+	*max_power_uw = dtpm->power_max - dtpm->power_min;
+	mutex_unlock(&dtpm_lock);
+
+	return 0;
+}
+
+static int __get_power_uw(struct dtpm *dtpm, u64 *power_uw)
+{
+	struct dtpm *child;
+	u64 power;
+	int ret = 0;
+
+	if (dtpm->ops) {
+		*power_uw = dtpm->ops->get_power_uw(dtpm);
+		return 0;
+	}
+
+	*power_uw = 0;
+
+	list_for_each_entry(child, &dtpm->children, sibling) {
+		ret = __get_power_uw(child, &power);
+		if (ret)
+			break;
+		*power_uw += power;
+	}
+
+	return ret;
+}
+
+static int get_power_uw(struct powercap_zone *pcz, u64 *power_uw)
+{
+	struct dtpm *dtpm = to_dtpm(pcz);
+	int ret;
+
+	mutex_lock(&dtpm_lock);
+	ret = __get_power_uw(dtpm, power_uw);
+	mutex_unlock(&dtpm_lock);
+
+	return ret;
+}
+
+static void __dtpm_rebalance_weight(struct dtpm *dtpm)
+{
+	struct dtpm *child;
+
+	list_for_each_entry(child, &dtpm->children, sibling) {
+
+		pr_debug("Setting weight '%d' for '%s'\n",
+			 child->weight, child->zone.name);
+
+		child->weight = DIV_ROUND_CLOSEST(child->power_max * 1024,
+						  dtpm->power_max);
+
+		__dtpm_rebalance_weight(child);
+	}
+}
+
+static void __dtpm_sub_power(struct dtpm *dtpm)
+{
+	struct dtpm *parent = dtpm->parent;
+
+	while (parent) {
+		parent->power_min -= dtpm->power_min;
+		parent->power_max -= dtpm->power_max;
+		parent->power_limit -= dtpm->power_limit;
+		parent = parent->parent;
+	}
+
+	__dtpm_rebalance_weight(root);
+}
+
+static void __dtpm_add_power(struct dtpm *dtpm)
+{
+	struct dtpm *parent = dtpm->parent;
+
+	while (parent) {
+		parent->power_min += dtpm->power_min;
+		parent->power_max += dtpm->power_max;
+		parent->power_limit += dtpm->power_limit;
+		parent = parent->parent;
+	}
+
+	__dtpm_rebalance_weight(root);
+}
+
+/**
+ * dtpm_update_power - Update the power on the dtpm
+ * @dtpm: a pointer to a dtpm structure to update
+ * @power_min: a u64 representing the new power_min value
+ * @power_max: a u64 representing the new power_max value
+ *
+ * Function to update the power values of the dtpm node specified in
+ * parameter. These new values will be propagated to the tree.
+ *
+ * Return: zero on success, -EINVAL if the values are inconsistent
+ */
+int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max)
+{
+	mutex_lock(&dtpm_lock);
+
+	if (power_min == dtpm->power_min && power_max == dtpm->power_max)
+		return 0;
+
+	if (power_max < power_min)
+		return -EINVAL;
+
+	__dtpm_sub_power(dtpm);
+
+	dtpm->power_min = power_min;
+	dtpm->power_max = power_max;
+	if (!test_bit(DTPM_POWER_LIMIT_FLAG, &dtpm->flags))
+		dtpm->power_limit = power_max;
+
+	__dtpm_add_power(dtpm);
+
+	mutex_unlock(&dtpm_lock);
+
+	return 0;
+}
+
+/**
+ * dtpm_release_zone - Cleanup when the node is released
+ * @pcz: a pointer to a powercap_zone structure
+ *
+ * Do some housecleaning and update the weight on the tree. The
+ * release will be denied if the node has children. This function must
+ * be called by the specific release callback of the different
+ * backends.
+ *
+ * Return: 0 on success, -EBUSY if there are children
+ */
+int dtpm_release_zone(struct powercap_zone *pcz)
+{
+	struct dtpm *dtpm = to_dtpm(pcz);
+	struct dtpm *parent = dtpm->parent;
+
+	mutex_lock(&dtpm_lock);
+
+	if (!list_empty(&dtpm->children))
+		return -EBUSY;
+
+	if (parent)
+		list_del(&dtpm->sibling);
+
+	__dtpm_sub_power(dtpm);
+
+	mutex_unlock(&dtpm_lock);
+
+	if (dtpm->ops)
+		dtpm->ops->release(dtpm);
+
+	kfree(dtpm);
+
+	return 0;
+}
+
+static int __get_power_limit_uw(struct dtpm *dtpm, int cid, u64 *power_limit)
+{
+	*power_limit = dtpm->power_limit;
+	return 0;
+}
+
+static int get_power_limit_uw(struct powercap_zone *pcz,
+			      int cid, u64 *power_limit)
+{
+	struct dtpm *dtpm = to_dtpm(pcz);
+	int ret;
+
+	mutex_lock(&dtpm_lock);
+	ret = __get_power_limit_uw(dtpm, cid, power_limit);
+	mutex_unlock(&dtpm_lock);
+
+	return ret;
+}
+
+/*
+ * Set the power limit on the nodes, the power limit is distributed
+ * given the weight of the children.
+ *
+ * The dtpm node lock must be held when calling this function.
+ */
+static int __set_power_limit_uw(struct dtpm *dtpm, int cid, u64 power_limit)
+{
+	struct dtpm *child;
+	int ret = 0;
+	u64 power;
+
+	/*
+	 * A max power limitation means we remove the power limit,
+	 * otherwise we set a constraint and flag the dtpm node.
+	 */
+	if (power_limit == dtpm->power_max) {
+		clear_bit(DTPM_POWER_LIMIT_FLAG, &dtpm->flags);
+	} else {
+		set_bit(DTPM_POWER_LIMIT_FLAG, &dtpm->flags);
+	}
+
+	pr_debug("Setting power limit for '%s': %llu uW\n",
+		 dtpm->zone.name, power_limit);
+
+	/*
+	 * Only leaves of the dtpm tree has ops to get/set the power
+	 */
+	if (dtpm->ops) {
+		dtpm->power_limit = dtpm->ops->set_power_uw(dtpm, power_limit);
+	} else {
+		dtpm->power_limit = 0;
+
+		list_for_each_entry(child, &dtpm->children, sibling) {
+
+			/*
+			 * Integer division rounding will inevitably
+			 * lead to a different min or max value when
+			 * set several times. In order to restore the
+			 * initial value, we force the child's min or
+			 * max power every time if the constraint is
+			 * at the boundaries.
+			 */
+			if (power_limit == dtpm->power_max) {
+				power = child->power_max;
+			} else if (power_limit == dtpm->power_min) {
+				power = child->power_min;
+			} else {
+				power = DIV_ROUND_CLOSEST(
+					power_limit * child->weight, 1024);
+			}
+
+			pr_debug("Setting power limit for '%s': %llu uW\n",
+				 child->zone.name, power);
+
+			ret = __set_power_limit_uw(child, cid, power);
+			if (!ret)
+				ret = __get_power_limit_uw(child, cid, &power);
+
+			if (ret)
+				break;
+
+			dtpm->power_limit += power;
+		}
+	}
+
+	return ret;
+}
+
+static int set_power_limit_uw(struct powercap_zone *pcz,
+			      int cid, u64 power_limit)
+{
+	struct dtpm *dtpm = to_dtpm(pcz);
+	int ret;
+
+	mutex_lock(&dtpm_lock);
+
+	/*
+	 * Don't allow values outside of the power range previously
+	 * set when initializing the power numbers.
+	 */
+	power_limit = clamp_val(power_limit, dtpm->power_min, dtpm->power_max);
+
+	ret = __set_power_limit_uw(dtpm, cid, power_limit);
+
+	pr_debug("%s: power limit: %llu uW, power max: %llu uW\n",
+		 dtpm->zone.name, dtpm->power_limit, dtpm->power_max);
+
+	mutex_unlock(&dtpm_lock);
+
+	return ret;
+}
+
+static const char *get_constraint_name(struct powercap_zone *pcz, int cid)
+{
+	return constraint_name[cid];
+}
+
+static int get_max_power_uw(struct powercap_zone *pcz, int id, u64 *max_power)
+{
+	struct dtpm *dtpm = to_dtpm(pcz);
+
+	mutex_lock(&dtpm_lock);
+	*max_power = dtpm->power_max;
+	mutex_unlock(&dtpm_lock);
+
+	return 0;
+}
+
+static struct powercap_zone_constraint_ops constraint_ops = {
+	.set_power_limit_uw = set_power_limit_uw,
+	.get_power_limit_uw = get_power_limit_uw,
+	.set_time_window_us = set_time_window_us,
+	.get_time_window_us = get_time_window_us,
+	.get_max_power_uw = get_max_power_uw,
+	.get_name = get_constraint_name,
+};
+
+static struct powercap_zone_ops zone_ops = {
+	.get_max_power_range_uw = get_max_power_range_uw,
+	.get_power_uw = get_power_uw,
+	.release = dtpm_release_zone,
+};
+
+/**
+ * dtpm_alloc - Allocate and initialize a dtpm struct
+ * @name: a string specifying the name of the node
+ *
+ * Return: a struct dtpm pointer, NULL in case of error
+ */
+struct dtpm *dtpm_alloc(struct dtpm_ops *ops)
+{
+	struct dtpm *dtpm;
+
+	dtpm = kzalloc(sizeof(*dtpm), GFP_KERNEL);
+	if (dtpm) {
+		INIT_LIST_HEAD(&dtpm->children);
+		INIT_LIST_HEAD(&dtpm->sibling);
+		dtpm->weight = 1024;
+		dtpm->ops = ops;
+	}
+
+	return dtpm;
+}
+
+/**
+ * dtpm_unregister - Unregister a dtpm node from the hierarchy tree
+ * @dtpm: a pointer to a dtpm structure corresponding to the node to be removed
+ *
+ * Call the underlying powercap unregister function. That will call
+ * the release callback of the powercap zone.
+ */
+void dtpm_unregister(struct dtpm *dtpm)
+{
+	powercap_unregister_zone(pct, &dtpm->zone);
+
+	pr_info("Unregistered dtpm node '%s'\n", dtpm->zone.name);
+}
+
+/**
+ * dtpm_register - Register a dtpm node in the hierarchy tree
+ * @name: a string specifying the name of the node
+ * @dtpm: a pointer to a dtpm structure corresponding to the new node
+ * @parent: a pointer to a dtpm structure corresponding to the parent node
+ *
+ * Create a dtpm node in the tree. If no parent is specified, the node
+ * is the root node of the hierarchy. If the root node already exists,
+ * then the registration will fail. The powercap controller must be
+ * initialized before calling this function.
+ *
+ * The dtpm structure must be initialized with the power numbers
+ * before calling this function.
+ *
+ * Return: zero on success, a negative value in case of error:
+ *  -EAGAIN: the function is called before the framework is initialized.
+ *  -EBUSY: the root node is already inserted
+ *  -EINVAL: * there is no root node yet and @parent is specified
+ *           * no all ops are defined
+ *           * parent have ops which are reserved for leaves
+ *   Other negative values are reported back from the powercap framework
+ */
+int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent)
+{
+	struct powercap_zone *pcz;
+
+	if (!pct)
+		return -EAGAIN;
+
+	if (root && !parent)
+		return -EBUSY;
+
+	if (!root && parent)
+		return -EINVAL;
+
+	if (parent && parent->ops)
+		return -EINVAL;
+
+	if (!dtpm)
+		return -EINVAL;
+
+	if (dtpm->ops && !(dtpm->ops->set_power_uw &&
+			   dtpm->ops->get_power_uw &&
+			   dtpm->ops->release))
+		return -EINVAL;
+
+	pcz = powercap_register_zone(&dtpm->zone, pct, name,
+				     parent ? &parent->zone : NULL,
+				     &zone_ops, MAX_DTPM_CONSTRAINTS,
+				     &constraint_ops);
+	if (IS_ERR(pcz))
+		return PTR_ERR(pcz);
+
+	mutex_lock(&dtpm_lock);
+
+	if (parent) {
+		list_add_tail(&dtpm->sibling, &parent->children);
+		dtpm->parent = parent;
+	} else {
+		root = dtpm;
+	}
+
+	__dtpm_add_power(dtpm);
+
+	pr_info("Registered dtpm node '%s' / %llu-%llu uW, \n",
+		dtpm->zone.name, dtpm->power_min, dtpm->power_max);
+
+	mutex_unlock(&dtpm_lock);
+
+	return 0;
+}
+
+static int __init dtpm_init(void)
+{
+	struct dtpm_descr **dtpm_descr;
+
+	pct = powercap_register_control_type(NULL, "dtpm", NULL);
+	if (!pct) {
+		pr_err("Failed to register control type\n");
+		return -EINVAL;
+	}
+
+	for_each_dtpm_table(dtpm_descr)
+		(*dtpm_descr)->init(*dtpm_descr);
+
+	return 0;
+}
+late_initcall(dtpm_init);
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b2b3d81b1535..b3e4e0740089 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -316,6 +316,16 @@
 #define THERMAL_TABLE(name)
 #endif
 
+#ifdef CONFIG_DTPM
+#define DTPM_TABLE()							\
+	. = ALIGN(8);							\
+	__dtpm_table = .;						\
+	KEEP(*(__dtpm_table))						\
+	__dtpm_table_end = .;
+#else
+#define DTPM_TABLE()
+#endif
+
 #define KERNEL_DTB()							\
 	STRUCT_ALIGN();							\
 	__dtb_start = .;						\
@@ -733,6 +743,7 @@
 	ACPI_PROBE_TABLE(irqchip)					\
 	ACPI_PROBE_TABLE(timer)						\
 	THERMAL_TABLE(governor)						\
+	DTPM_TABLE()							\
 	EARLYCON_TABLE()						\
 	LSM_TABLE()							\
 	EARLY_LSM_TABLE()						\
diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h
new file mode 100644
index 000000000000..7a1d0b50e334
--- /dev/null
+++ b/include/linux/dtpm.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Linaro Ltd
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ */
+#ifndef ___DTPM_H__
+#define ___DTPM_H__
+
+#include <linux/powercap.h>
+
+#define MAX_DTPM_DESCR 8
+#define MAX_DTPM_CONSTRAINTS 1
+
+struct dtpm {
+	struct powercap_zone zone;
+	struct dtpm *parent;
+	struct list_head sibling;
+	struct list_head children;
+	struct dtpm_ops *ops;
+	unsigned long flags;
+	u64 power_limit;
+	u64 power_max;
+	u64 power_min;
+	int weight;
+	void *private;
+};
+
+struct dtpm_ops {
+	u64 (*set_power_uw)(struct dtpm *, u64);
+	u64 (*get_power_uw)(struct dtpm *);
+	void (*release)(struct dtpm *);
+};
+
+struct dtpm_descr;
+
+typedef int (*dtpm_init_t)(struct dtpm_descr *);
+
+struct dtpm_descr {
+	struct dtpm *parent;
+	const char *name;
+	dtpm_init_t init;
+};
+
+/* Init section thermal table */
+extern struct dtpm_descr *__dtpm_table[];
+extern struct dtpm_descr *__dtpm_table_end[];
+
+#define DTPM_TABLE_ENTRY(name)			\
+	static typeof(name) *__dtpm_table_entry_##name	\
+	__used __section("__dtpm_table") = &name
+
+#define DTPM_DECLARE(name)	DTPM_TABLE_ENTRY(name)
+
+#define for_each_dtpm_table(__dtpm)	\
+	for (__dtpm = __dtpm_table;	\
+	     __dtpm < __dtpm_table_end;	\
+	     __dtpm++)
+
+static inline struct dtpm *to_dtpm(struct powercap_zone *zone)
+{
+	return container_of(zone, struct dtpm, zone);
+}
+
+int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max);
+
+int dtpm_release_zone(struct powercap_zone *pcz);
+
+struct dtpm *dtpm_alloc(struct dtpm_ops *ops);
+
+void dtpm_unregister(struct dtpm *dtpm);
+
+int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent);
+
+#endif
-- 
cgit v1.2.3


From 0e8f68d7f04856a9e2ad4817b477fa35124888bd Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 8 Dec 2020 17:41:45 +0100
Subject: powercap/drivers/dtpm: Add CPU energy model based support

With the powercap dtpm controller, we are able to plug devices with
power limitation features in the tree.

The following patch introduces the CPU power limitation based on the
energy model and the performance states.

The power limitation is done at the performance domain level. If some
CPUs are unplugged, the corresponding power will be subtracted from
the performance domain total power.

It is up to the platform to initialize the dtpm tree and add the CPU.

Here is an example to create a simple tree with one root node called
"pkg" and the CPU's performance domains.

static int dtpm_register_pkg(struct dtpm_descr *descr)
{
	struct dtpm *pkg;
	int ret;

	pkg = dtpm_alloc(NULL);
	if (!pkg)
		return -ENOMEM;

	ret = dtpm_register(descr->name, pkg, descr->parent);
	if (ret)
		return ret;

	return dtpm_register_cpu(pkg);
}

static struct dtpm_descr descr = {
	.name = "pkg",
	.init = dtpm_register_pkg,
};
DTPM_DECLARE(descr);

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/Kconfig    |   7 ++
 drivers/powercap/Makefile   |   1 +
 drivers/powercap/dtpm_cpu.c | 257 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h  |   1 +
 include/linux/dtpm.h        |   2 +
 5 files changed, 268 insertions(+)
 create mode 100644 drivers/powercap/dtpm_cpu.c

(limited to 'include/linux')

diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig
index cc1953bd8bed..20b4325c6161 100644
--- a/drivers/powercap/Kconfig
+++ b/drivers/powercap/Kconfig
@@ -49,4 +49,11 @@ config DTPM
 	help
 	  This enables support for the power capping for the dynamic
 	  thermal power management userspace engine.
+
+config DTPM_CPU
+	bool "Add CPU power capping based on the energy model"
+	depends on DTPM && ENERGY_MODEL
+	help
+	  This enables support for CPU power limitation based on
+	  energy model.
 endif
diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile
index 6482ac52054d..fabcf388a8d3 100644
--- a/drivers/powercap/Makefile
+++ b/drivers/powercap/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_DTPM) += dtpm.o
+obj-$(CONFIG_DTPM_CPU) += dtpm_cpu.o
 obj-$(CONFIG_POWERCAP)	+= powercap_sys.o
 obj-$(CONFIG_INTEL_RAPL_CORE) += intel_rapl_common.o
 obj-$(CONFIG_INTEL_RAPL) += intel_rapl_msr.o
diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c
new file mode 100644
index 000000000000..6933c783c6b4
--- /dev/null
+++ b/drivers/powercap/dtpm_cpu.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * The DTPM CPU is based on the energy model. It hooks the CPU in the
+ * DTPM tree which in turns update the power number by propagating the
+ * power number from the CPU energy model information to the parents.
+ *
+ * The association between the power and the performance state, allows
+ * to set the power of the CPU at the OPP granularity.
+ *
+ * The CPU hotplug is supported and the power numbers will be updated
+ * if a CPU is hot plugged / unplugged.
+ */
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/cpuhotplug.h>
+#include <linux/dtpm.h>
+#include <linux/energy_model.h>
+#include <linux/pm_qos.h>
+#include <linux/slab.h>
+#include <linux/units.h>
+
+static struct dtpm *__parent;
+
+static DEFINE_PER_CPU(struct dtpm *, dtpm_per_cpu);
+
+struct dtpm_cpu {
+	struct freq_qos_request qos_req;
+	int cpu;
+};
+
+/*
+ * When a new CPU is inserted at hotplug or boot time, add the power
+ * contribution and update the dtpm tree.
+ */
+static int power_add(struct dtpm *dtpm, struct em_perf_domain *em)
+{
+	u64 power_min, power_max;
+
+	power_min = em->table[0].power;
+	power_min *= MICROWATT_PER_MILLIWATT;
+	power_min += dtpm->power_min;
+
+	power_max = em->table[em->nr_perf_states - 1].power;
+	power_max *= MICROWATT_PER_MILLIWATT;
+	power_max += dtpm->power_max;
+
+	return dtpm_update_power(dtpm, power_min, power_max);
+}
+
+/*
+ * When a CPU is unplugged, remove its power contribution from the
+ * dtpm tree.
+ */
+static int power_sub(struct dtpm *dtpm, struct em_perf_domain *em)
+{
+	u64 power_min, power_max;
+
+	power_min = em->table[0].power;
+	power_min *= MICROWATT_PER_MILLIWATT;
+	power_min = dtpm->power_min - power_min;
+
+	power_max = em->table[em->nr_perf_states - 1].power;
+	power_max *= MICROWATT_PER_MILLIWATT;
+	power_max = dtpm->power_max - power_max;
+
+	return dtpm_update_power(dtpm, power_min, power_max);
+}
+
+static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
+{
+	struct dtpm_cpu *dtpm_cpu = dtpm->private;
+	struct em_perf_domain *pd;
+	struct cpumask cpus;
+	unsigned long freq;
+	u64 power;
+	int i, nr_cpus;
+
+	pd = em_cpu_get(dtpm_cpu->cpu);
+
+	cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus));
+
+	nr_cpus = cpumask_weight(&cpus);
+
+	for (i = 0; i < pd->nr_perf_states; i++) {
+
+		power = pd->table[i].power * MICROWATT_PER_MILLIWATT * nr_cpus;
+
+		if (power > power_limit)
+			break;
+	}
+
+	freq = pd->table[i - 1].frequency;
+
+	freq_qos_update_request(&dtpm_cpu->qos_req, freq);
+
+	power_limit = pd->table[i - 1].power *
+		MICROWATT_PER_MILLIWATT * nr_cpus;
+
+	return power_limit;
+}
+
+static u64 get_pd_power_uw(struct dtpm *dtpm)
+{
+	struct dtpm_cpu *dtpm_cpu = dtpm->private;
+	struct em_perf_domain *pd;
+	struct cpumask cpus;
+	unsigned long freq;
+	int i, nr_cpus;
+
+	pd = em_cpu_get(dtpm_cpu->cpu);
+	freq = cpufreq_quick_get(dtpm_cpu->cpu);
+	cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus));
+	nr_cpus = cpumask_weight(&cpus);
+
+	for (i = 0; i < pd->nr_perf_states; i++) {
+
+		if (pd->table[i].frequency < freq)
+			continue;
+
+		return pd->table[i].power *
+			MICROWATT_PER_MILLIWATT * nr_cpus;
+	}
+
+	return 0;
+}
+
+static void pd_release(struct dtpm *dtpm)
+{
+	struct dtpm_cpu *dtpm_cpu = dtpm->private;
+
+	if (freq_qos_request_active(&dtpm_cpu->qos_req))
+		freq_qos_remove_request(&dtpm_cpu->qos_req);
+
+	kfree(dtpm_cpu);
+}
+
+static struct dtpm_ops dtpm_ops = {
+	.set_power_uw = set_pd_power_limit,
+	.get_power_uw = get_pd_power_uw,
+	.release = pd_release,
+};
+
+static int cpuhp_dtpm_cpu_offline(unsigned int cpu)
+{
+	struct cpufreq_policy *policy;
+	struct em_perf_domain *pd;
+	struct dtpm *dtpm;
+
+	policy = cpufreq_cpu_get(cpu);
+
+	if (!policy)
+		return 0;
+
+	pd = em_cpu_get(cpu);
+	if (!pd)
+		return -EINVAL;
+
+	dtpm = per_cpu(dtpm_per_cpu, cpu);
+
+	power_sub(dtpm, pd);
+
+	if (cpumask_weight(policy->cpus) != 1)
+		return 0;
+
+	for_each_cpu(cpu, policy->related_cpus)
+		per_cpu(dtpm_per_cpu, cpu) = NULL;
+
+	dtpm_unregister(dtpm);
+
+	return 0;
+}
+
+static int cpuhp_dtpm_cpu_online(unsigned int cpu)
+{
+	struct dtpm *dtpm;
+	struct dtpm_cpu *dtpm_cpu;
+	struct cpufreq_policy *policy;
+	struct em_perf_domain *pd;
+	char name[CPUFREQ_NAME_LEN];
+	int ret = -ENOMEM;
+
+	policy = cpufreq_cpu_get(cpu);
+
+	if (!policy)
+		return 0;
+
+	pd = em_cpu_get(cpu);
+	if (!pd)
+		return -EINVAL;
+
+	dtpm = per_cpu(dtpm_per_cpu, cpu);
+	if (dtpm)
+		return power_add(dtpm, pd);
+
+	dtpm = dtpm_alloc(&dtpm_ops);
+	if (!dtpm)
+		return -EINVAL;
+
+	dtpm_cpu = kzalloc(sizeof(dtpm_cpu), GFP_KERNEL);
+	if (!dtpm_cpu)
+		goto out_kfree_dtpm;
+
+	dtpm->private = dtpm_cpu;
+	dtpm_cpu->cpu = cpu;
+
+	for_each_cpu(cpu, policy->related_cpus)
+		per_cpu(dtpm_per_cpu, cpu) = dtpm;
+
+	sprintf(name, "cpu%d", dtpm_cpu->cpu);
+
+	ret = dtpm_register(name, dtpm, __parent);
+	if (ret)
+		goto out_kfree_dtpm_cpu;
+
+	ret = power_add(dtpm, pd);
+	if (ret)
+		goto out_dtpm_unregister;
+
+	ret = freq_qos_add_request(&policy->constraints,
+				   &dtpm_cpu->qos_req, FREQ_QOS_MAX,
+				   pd->table[pd->nr_perf_states - 1].frequency);
+	if (ret)
+		goto out_power_sub;
+
+	return 0;
+
+out_power_sub:
+	power_sub(dtpm, pd);
+
+out_dtpm_unregister:
+	dtpm_unregister(dtpm);
+	dtpm_cpu = NULL;
+	dtpm = NULL;
+
+out_kfree_dtpm_cpu:
+	for_each_cpu(cpu, policy->related_cpus)
+		per_cpu(dtpm_per_cpu, cpu) = NULL;
+	kfree(dtpm_cpu);
+
+out_kfree_dtpm:
+	kfree(dtpm);
+	return ret;
+}
+
+int dtpm_register_cpu(struct dtpm *parent)
+{
+	__parent = parent;
+
+	return cpuhp_setup_state(CPUHP_AP_DTPM_CPU_ONLINE,
+				 "dtpm_cpu:online",
+				 cpuhp_dtpm_cpu_online,
+				 cpuhp_dtpm_cpu_offline);
+}
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0042ef362511..ee09a39627d6 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -193,6 +193,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
 	CPUHP_AP_X86_HPET_ONLINE,
 	CPUHP_AP_X86_KVM_CLK_ONLINE,
+	CPUHP_AP_DTPM_CPU_ONLINE,
 	CPUHP_AP_ACTIVE,
 	CPUHP_ONLINE,
 };
diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h
index 7a1d0b50e334..e80a332e3d8a 100644
--- a/include/linux/dtpm.h
+++ b/include/linux/dtpm.h
@@ -72,4 +72,6 @@ void dtpm_unregister(struct dtpm *dtpm);
 
 int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent);
 
+int dtpm_register_cpu(struct dtpm *parent);
+
 #endif
-- 
cgit v1.2.3


From f7005142dacea1769fba0152c493aaa61b33205c Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 21 Dec 2020 17:29:34 +0200
Subject: spi: uapi: unify SPI modes into a single spi.h header

This change moves all the SPI mode bits into a separate 'spi.h' header in
uAPI. This is meant to re-use these definitions inside the kernel as well
as export them to userspace (via uAPI).

The SPI mode definitions have usually been duplicated between between
'include/linux/spi/spi.h' and 'include/uapi/linux/spi/spidev.h', so
whenever adding a new entry, this would need to be put in both headers.

They've been moved from 'include/linux/spi/spi.h', since that seems a bit
more complete; the bits have descriptions and there is the SPI_MODE_X_MASK.

This change also does a conversion of these bitfields to _BITUL() macro.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20201221152936.53873-1-alexandru.ardelean@analog.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h         | 23 ++---------------------
 include/uapi/linux/spi/spi.h    | 31 +++++++++++++++++++++++++++++++
 include/uapi/linux/spi/spidev.h | 30 +-----------------------------
 3 files changed, 34 insertions(+), 50 deletions(-)
 create mode 100644 include/uapi/linux/spi/spi.h

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index aa09fdc8042d..a08c3f37e202 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -15,6 +15,8 @@
 #include <linux/gpio/consumer.h>
 #include <linux/ptp_clock_kernel.h>
 
+#include <uapi/linux/spi/spi.h>
+
 struct dma_chan;
 struct property_entry;
 struct spi_controller;
@@ -165,27 +167,6 @@ struct spi_device {
 	u8			bits_per_word;
 	bool			rt;
 	u32			mode;
-#define	SPI_CPHA	0x01			/* clock phase */
-#define	SPI_CPOL	0x02			/* clock polarity */
-#define	SPI_MODE_0	(0|0)			/* (original MicroWire) */
-#define	SPI_MODE_1	(0|SPI_CPHA)
-#define	SPI_MODE_2	(SPI_CPOL|0)
-#define	SPI_MODE_3	(SPI_CPOL|SPI_CPHA)
-#define	SPI_MODE_X_MASK	(SPI_CPOL|SPI_CPHA)
-#define	SPI_CS_HIGH	0x04			/* chipselect active high? */
-#define	SPI_LSB_FIRST	0x08			/* per-word bits-on-wire */
-#define	SPI_3WIRE	0x10			/* SI/SO signals shared */
-#define	SPI_LOOP	0x20			/* loopback mode */
-#define	SPI_NO_CS	0x40			/* 1 dev/bus, no chipselect */
-#define	SPI_READY	0x80			/* slave pulls low to pause */
-#define	SPI_TX_DUAL	0x100			/* transmit with 2 wires */
-#define	SPI_TX_QUAD	0x200			/* transmit with 4 wires */
-#define	SPI_RX_DUAL	0x400			/* receive with 2 wires */
-#define	SPI_RX_QUAD	0x800			/* receive with 4 wires */
-#define	SPI_CS_WORD	0x1000			/* toggle cs after each word */
-#define	SPI_TX_OCTAL	0x2000			/* transmit with 8 wires */
-#define	SPI_RX_OCTAL	0x4000			/* receive with 8 wires */
-#define	SPI_3WIRE_HIZ	0x8000			/* high impedance turnaround */
 	int			irq;
 	void			*controller_state;
 	void			*controller_data;
diff --git a/include/uapi/linux/spi/spi.h b/include/uapi/linux/spi/spi.h
new file mode 100644
index 000000000000..703b586f35df
--- /dev/null
+++ b/include/uapi/linux/spi/spi.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_SPI_H
+#define _UAPI_SPI_H
+
+#include <linux/const.h>
+
+#define	SPI_CPHA		_BITUL(0)	/* clock phase */
+#define	SPI_CPOL		_BITUL(1)	/* clock polarity */
+
+#define	SPI_MODE_0		(0|0)		/* (original MicroWire) */
+#define	SPI_MODE_1		(0|SPI_CPHA)
+#define	SPI_MODE_2		(SPI_CPOL|0)
+#define	SPI_MODE_3		(SPI_CPOL|SPI_CPHA)
+#define	SPI_MODE_X_MASK		(SPI_CPOL|SPI_CPHA)
+
+#define	SPI_CS_HIGH		_BITUL(2)	/* chipselect active high? */
+#define	SPI_LSB_FIRST		_BITUL(3)	/* per-word bits-on-wire */
+#define	SPI_3WIRE		_BITUL(4)	/* SI/SO signals shared */
+#define	SPI_LOOP		_BITUL(5)	/* loopback mode */
+#define	SPI_NO_CS		_BITUL(6)	/* 1 dev/bus, no chipselect */
+#define	SPI_READY		_BITUL(7)	/* slave pulls low to pause */
+#define	SPI_TX_DUAL		_BITUL(8)	/* transmit with 2 wires */
+#define	SPI_TX_QUAD		_BITUL(9)	/* transmit with 4 wires */
+#define	SPI_RX_DUAL		_BITUL(10)	/* receive with 2 wires */
+#define	SPI_RX_QUAD		_BITUL(11)	/* receive with 4 wires */
+#define	SPI_CS_WORD		_BITUL(12)	/* toggle cs after each word */
+#define	SPI_TX_OCTAL		_BITUL(13)	/* transmit with 8 wires */
+#define	SPI_RX_OCTAL		_BITUL(14)	/* receive with 8 wires */
+#define	SPI_3WIRE_HIZ		_BITUL(15)	/* high impedance turnaround */
+
+#endif /* _UAPI_SPI_H */
diff --git a/include/uapi/linux/spi/spidev.h b/include/uapi/linux/spi/spidev.h
index d56427c0b3e0..0c3da08f2aff 100644
--- a/include/uapi/linux/spi/spidev.h
+++ b/include/uapi/linux/spi/spidev.h
@@ -25,35 +25,7 @@
 
 #include <linux/types.h>
 #include <linux/ioctl.h>
-
-/* User space versions of kernel symbols for SPI clocking modes,
- * matching <linux/spi/spi.h>
- */
-
-#define SPI_CPHA		0x01
-#define SPI_CPOL		0x02
-
-#define SPI_MODE_0		(0|0)
-#define SPI_MODE_1		(0|SPI_CPHA)
-#define SPI_MODE_2		(SPI_CPOL|0)
-#define SPI_MODE_3		(SPI_CPOL|SPI_CPHA)
-
-#define SPI_CS_HIGH		0x04
-#define SPI_LSB_FIRST		0x08
-#define SPI_3WIRE		0x10
-#define SPI_LOOP		0x20
-#define SPI_NO_CS		0x40
-#define SPI_READY		0x80
-#define SPI_TX_DUAL		0x100
-#define SPI_TX_QUAD		0x200
-#define SPI_RX_DUAL		0x400
-#define SPI_RX_QUAD		0x800
-#define SPI_CS_WORD		0x1000
-#define SPI_TX_OCTAL		0x2000
-#define SPI_RX_OCTAL		0x4000
-#define SPI_3WIRE_HIZ		0x8000
-
-/*---------------------------------------------------------------------------*/
+#include <linux/spi/spi.h>
 
 /* IOCTL commands */
 
-- 
cgit v1.2.3


From d962608ce2188a1d46ec9d356d6fad5cd6fc0341 Mon Sep 17 00:00:00 2001
From: Dragos Bogdan <dragos.bogdan@analog.com>
Date: Mon, 21 Dec 2020 17:29:35 +0200
Subject: spi: Add SPI_NO_TX/RX support

Transmit/receive only is a valid SPI mode. For example, the MOSI/TX line
might be missing from an ADC while for a DAC the MISO/RX line may be
optional. This patch adds these two new modes: SPI_NO_TX and
SPI_NO_RX. This way, the drivers will be able to identify if any of
these two lines is missing and to adjust the transfers accordingly.

Signed-off-by: Dragos Bogdan <dragos.bogdan@analog.com>
Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20201221152936.53873-2-alexandru.ardelean@analog.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c            | 25 ++++++++++++++++++++-----
 include/linux/spi/spi.h      | 17 +++++++++++++++++
 include/uapi/linux/spi/spi.h | 10 ++++++++++
 3 files changed, 47 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 51d7c004fbab..ca75f4036eda 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1941,6 +1941,9 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi,
 	/* Device DUAL/QUAD mode */
 	if (!of_property_read_u32(nc, "spi-tx-bus-width", &value)) {
 		switch (value) {
+		case 0:
+			spi->mode |= SPI_NO_TX;
+			break;
 		case 1:
 			break;
 		case 2:
@@ -1962,6 +1965,9 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi,
 
 	if (!of_property_read_u32(nc, "spi-rx-bus-width", &value)) {
 		switch (value) {
+		case 0:
+			spi->mode |= SPI_NO_RX;
+			break;
 		case 1:
 			break;
 		case 2:
@@ -3329,12 +3335,16 @@ int spi_setup(struct spi_device *spi)
 	unsigned	bad_bits, ugly_bits;
 	int		status;
 
-	/* check mode to prevent that DUAL and QUAD set at the same time
+	/*
+	 * check mode to prevent that any two of DUAL, QUAD and NO_MOSI/MISO
+	 * are set at the same time
 	 */
-	if (((spi->mode & SPI_TX_DUAL) && (spi->mode & SPI_TX_QUAD)) ||
-		((spi->mode & SPI_RX_DUAL) && (spi->mode & SPI_RX_QUAD))) {
+	if ((hweight_long(spi->mode &
+		(SPI_TX_DUAL | SPI_TX_QUAD | SPI_NO_TX)) > 1) ||
+	    (hweight_long(spi->mode &
+		(SPI_RX_DUAL | SPI_RX_QUAD | SPI_NO_RX)) > 1)) {
 		dev_err(&spi->dev,
-		"setup: can not select dual and quad at the same time\n");
+		"setup: can not select any two of dual, quad and no-rx/tx at the same time\n");
 		return -EINVAL;
 	}
 	/* if it is SPI_3WIRE mode, DUAL and QUAD should be forbidden
@@ -3348,7 +3358,8 @@ int spi_setup(struct spi_device *spi)
 	 * SPI_CS_WORD has a fallback software implementation,
 	 * so it is ignored here.
 	 */
-	bad_bits = spi->mode & ~(spi->controller->mode_bits | SPI_CS_WORD);
+	bad_bits = spi->mode & ~(spi->controller->mode_bits | SPI_CS_WORD |
+				 SPI_NO_TX | SPI_NO_RX);
 	/* nothing prevents from working with active-high CS in case if it
 	 * is driven by GPIO.
 	 */
@@ -3610,6 +3621,8 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message)
 		 * 2. check tx/rx_nbits match the mode in spi_device
 		 */
 		if (xfer->tx_buf) {
+			if (spi->mode & SPI_NO_TX)
+				return -EINVAL;
 			if (xfer->tx_nbits != SPI_NBITS_SINGLE &&
 				xfer->tx_nbits != SPI_NBITS_DUAL &&
 				xfer->tx_nbits != SPI_NBITS_QUAD)
@@ -3623,6 +3636,8 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message)
 		}
 		/* check transfer rx_nbits */
 		if (xfer->rx_buf) {
+			if (spi->mode & SPI_NO_RX)
+				return -EINVAL;
 			if (xfer->rx_nbits != SPI_NBITS_SINGLE &&
 				xfer->rx_nbits != SPI_NBITS_DUAL &&
 				xfer->rx_nbits != SPI_NBITS_QUAD)
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index a08c3f37e202..9bfdfaf286eb 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -6,6 +6,7 @@
 #ifndef __LINUX_SPI_H
 #define __LINUX_SPI_H
 
+#include <linux/bits.h>
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/slab.h>
@@ -166,6 +167,18 @@ struct spi_device {
 	u8			chip_select;
 	u8			bits_per_word;
 	bool			rt;
+#define SPI_NO_TX	BIT(31)		/* no transmit wire */
+#define SPI_NO_RX	BIT(30)		/* no receive wire */
+	/*
+	 * All bits defined above should be covered by SPI_MODE_KERNEL_MASK.
+	 * The SPI_MODE_KERNEL_MASK has the SPI_MODE_USER_MASK counterpart,
+	 * which is defined in 'include/uapi/linux/spi/spi.h'.
+	 * The bits defined here are from bit 31 downwards, while in
+	 * SPI_MODE_USER_MASK are from 0 upwards.
+	 * These bits must not overlap. A static assert check should make sure of that.
+	 * If adding extra bits, make sure to decrease the bit index below as well.
+	 */
+#define SPI_MODE_KERNEL_MASK	(~(BIT(30) - 1))
 	u32			mode;
 	int			irq;
 	void			*controller_state;
@@ -189,6 +202,10 @@ struct spi_device {
 	 */
 };
 
+/* Make sure that SPI_MODE_KERNEL_MASK & SPI_MODE_USER_MASK don't overlap */
+static_assert((SPI_MODE_KERNEL_MASK & SPI_MODE_USER_MASK) == 0,
+	      "SPI_MODE_USER_MASK & SPI_MODE_KERNEL_MASK must not overlap");
+
 static inline struct spi_device *to_spi_device(struct device *dev)
 {
 	return dev ? container_of(dev, struct spi_device, dev) : NULL;
diff --git a/include/uapi/linux/spi/spi.h b/include/uapi/linux/spi/spi.h
index 703b586f35df..236a85f08ded 100644
--- a/include/uapi/linux/spi/spi.h
+++ b/include/uapi/linux/spi/spi.h
@@ -28,4 +28,14 @@
 #define	SPI_RX_OCTAL		_BITUL(14)	/* receive with 8 wires */
 #define	SPI_3WIRE_HIZ		_BITUL(15)	/* high impedance turnaround */
 
+/*
+ * All the bits defined above should be covered by SPI_MODE_USER_MASK.
+ * The SPI_MODE_USER_MASK has the SPI_MODE_KERNEL_MASK counterpart in
+ * 'include/linux/spi/spi.h'. The bits defined here are from bit 0 upwards
+ * while in SPI_MODE_KERNEL_MASK they are from the other end downwards.
+ * These bits must not overlap. A static assert check should make sure of that.
+ * If adding extra bits, make sure to increase the bit index below as well.
+ */
+#define SPI_MODE_USER_MASK	(_BITUL(16) - 1)
+
 #endif /* _UAPI_SPI_H */
-- 
cgit v1.2.3


From 4ea3cd65e0d47c4d3fc0c86d2d93d97457dc86fb Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 18 Dec 2020 11:42:44 +0100
Subject: tty: rename tty_kopen() and add new function tty_kopen_shared()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a new function tty_kopen_shared() that yields a struct
tty_struct. The semantic difference to tty_kopen() is that the tty is
expected to be used already. So rename tty_kopen() to
tty_kopen_exclusive() for clearness, adapt the single user and put the
common code in a new static helper function.

tty_kopen_shared is to be used to implement an LED trigger for tty
devices in one of the next patches.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20201218104246.591315-2-u.kleine-koenig@pengutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/accessibility/speakup/spk_ttyio.c |  2 +-
 drivers/tty/tty_io.c                      | 56 +++++++++++++++++++++----------
 include/linux/tty.h                       |  5 +--
 3 files changed, 42 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/accessibility/speakup/spk_ttyio.c b/drivers/accessibility/speakup/spk_ttyio.c
index 6284aff434a1..835d17455fcd 100644
--- a/drivers/accessibility/speakup/spk_ttyio.c
+++ b/drivers/accessibility/speakup/spk_ttyio.c
@@ -152,7 +152,7 @@ static int spk_ttyio_initialise_ldisc(struct spk_synth *synth)
 	if (ret)
 		return ret;
 
-	tty = tty_kopen(dev);
+	tty = tty_kopen_exclusive(dev);
 	if (IS_ERR(tty))
 		return PTR_ERR(tty);
 
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 8034489337d7..62ccd021102d 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1875,22 +1875,7 @@ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp,
 	return driver;
 }
 
-/**
- *	tty_kopen	-	open a tty device for kernel
- *	@device: dev_t of device to open
- *
- *	Opens tty exclusively for kernel. Performs the driver lookup,
- *	makes sure it's not already opened and performs the first-time
- *	tty initialization.
- *
- *	Returns the locked initialized &tty_struct
- *
- *	Claims the global tty_mutex to serialize:
- *	  - concurrent first-time tty initialization
- *	  - concurrent tty driver removal w/ lookup
- *	  - concurrent tty removal from driver table
- */
-struct tty_struct *tty_kopen(dev_t device)
+static struct tty_struct *tty_kopen(dev_t device, int shared)
 {
 	struct tty_struct *tty;
 	struct tty_driver *driver;
@@ -1905,7 +1890,7 @@ struct tty_struct *tty_kopen(dev_t device)
 
 	/* check whether we're reopening an existing tty */
 	tty = tty_driver_lookup_tty(driver, NULL, index);
-	if (IS_ERR(tty))
+	if (IS_ERR(tty) || shared)
 		goto out;
 
 	if (tty) {
@@ -1923,7 +1908,42 @@ out:
 	tty_driver_kref_put(driver);
 	return tty;
 }
-EXPORT_SYMBOL_GPL(tty_kopen);
+
+/**
+ *	tty_kopen_exclusive	-	open a tty device for kernel
+ *	@device: dev_t of device to open
+ *
+ *	Opens tty exclusively for kernel. Performs the driver lookup,
+ *	makes sure it's not already opened and performs the first-time
+ *	tty initialization.
+ *
+ *	Returns the locked initialized &tty_struct
+ *
+ *	Claims the global tty_mutex to serialize:
+ *	  - concurrent first-time tty initialization
+ *	  - concurrent tty driver removal w/ lookup
+ *	  - concurrent tty removal from driver table
+ */
+struct tty_struct *tty_kopen_exclusive(dev_t device)
+{
+	return tty_kopen(device, 0);
+}
+EXPORT_SYMBOL_GPL(tty_kopen_exclusive);
+
+/**
+ *	tty_kopen_shared	-	open a tty device for shared in-kernel use
+ *	@device: dev_t of device to open
+ *
+ *	Opens an already existing tty for in-kernel use. Compared to
+ *	tty_kopen_exclusive() above it doesn't ensure to be the only user.
+ *
+ *	Locking is identical to tty_kopen() above.
+ */
+struct tty_struct *tty_kopen_shared(dev_t device)
+{
+	return tty_kopen(device, 1);
+}
+EXPORT_SYMBOL_GPL(tty_kopen_shared);
 
 /**
  *	tty_open_by_driver	-	open a tty device
diff --git a/include/linux/tty.h b/include/linux/tty.h
index c873f475f0a7..df707a5abfac 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -416,7 +416,8 @@ extern struct tty_struct *get_current_tty(void);
 /* tty_io.c */
 extern int __init tty_init(void);
 extern const char *tty_name(const struct tty_struct *tty);
-extern struct tty_struct *tty_kopen(dev_t device);
+extern struct tty_struct *tty_kopen_exclusive(dev_t device);
+extern struct tty_struct *tty_kopen_shared(dev_t device);
 extern void tty_kclose(struct tty_struct *tty);
 extern int tty_dev_name_to_number(const char *name, dev_t *number);
 extern int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout);
@@ -441,7 +442,7 @@ static inline int __init tty_init(void)
 { return 0; }
 static inline const char *tty_name(const struct tty_struct *tty)
 { return "(none)"; }
-static inline struct tty_struct *tty_kopen(dev_t device)
+static inline struct tty_struct *tty_kopen_exclusive(dev_t device)
 { return ERR_PTR(-ENODEV); }
 static inline void tty_kclose(struct tty_struct *tty)
 { }
-- 
cgit v1.2.3


From d20c219c7317843cec7f03eba15bfeae54f29654 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 18 Dec 2020 11:42:45 +0100
Subject: tty: new helper function tty_get_icount()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For a given struct tty_struct this yields the corresponding statistics
about sent and received characters (and some more) which is needed to
implement an LED trigger for tty devices.

The new function is then used to simplify tty_tiocgicount().

Reviewed-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20201218104246.591315-3-u.kleine-koenig@pengutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_io.c | 29 +++++++++++++++++++++++++----
 include/linux/tty.h  |  2 ++
 2 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 62ccd021102d..95ba028ef668 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -2488,15 +2488,36 @@ static int tty_tiocmset(struct tty_struct *tty, unsigned int cmd,
 	return tty->ops->tiocmset(tty, set, clear);
 }
 
+/**
+ *	tty_get_icount		-	get tty statistics
+ *	@tty: tty device
+ *	@icount: output parameter
+ *
+ *	Gets a copy of the tty's icount statistics.
+ *
+ *	Locking: none (up to the driver)
+ */
+int tty_get_icount(struct tty_struct *tty,
+		   struct serial_icounter_struct *icount)
+{
+	memset(icount, 0, sizeof(*icount));
+
+	if (tty->ops->get_icount)
+		return tty->ops->get_icount(tty, icount);
+	else
+		return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(tty_get_icount);
+
 static int tty_tiocgicount(struct tty_struct *tty, void __user *arg)
 {
-	int retval = -EINVAL;
 	struct serial_icounter_struct icount;
-	memset(&icount, 0, sizeof(icount));
-	if (tty->ops->get_icount)
-		retval = tty->ops->get_icount(tty, &icount);
+	int retval;
+
+	retval = tty_get_icount(tty, &icount);
 	if (retval != 0)
 		return retval;
+
 	if (copy_to_user(arg, &icount, sizeof(icount)))
 		return -EFAULT;
 	return 0;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index df707a5abfac..12be8b16cdef 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -500,6 +500,8 @@ extern void tty_unthrottle(struct tty_struct *tty);
 extern int tty_throttle_safe(struct tty_struct *tty);
 extern int tty_unthrottle_safe(struct tty_struct *tty);
 extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws);
+extern int tty_get_icount(struct tty_struct *tty,
+			  struct serial_icounter_struct *icount);
 extern int is_current_pgrp_orphaned(void);
 extern void tty_hangup(struct tty_struct *tty);
 extern void tty_vhangup(struct tty_struct *tty);
-- 
cgit v1.2.3


From 916c0c05521a52f13283ad3600793fc79516ff31 Mon Sep 17 00:00:00 2001
From: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
Date: Mon, 30 Nov 2020 15:09:23 +0530
Subject: soc: qcom: llcc-qcom: Extract major hardware version

The major hardware version of the LLCC IP is encoded in its
LLCC_COMMON_HW_INFO register. Extract the version and cache it in the
driver data so that it can be used to implement version specific
functionality like enabling Write sub cache for given SCID.

Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
[mani: splitted the version extract as a single patch and few cleanups]
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/20201130093924.45057-4-manivannan.sadhasivam@linaro.org
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/soc/qcom/llcc-qcom.c       | 12 ++++++++++++
 include/linux/soc/qcom/llcc-qcom.h |  2 ++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c
index 16b421608e9c..a559617ea7c0 100644
--- a/drivers/soc/qcom/llcc-qcom.c
+++ b/drivers/soc/qcom/llcc-qcom.c
@@ -4,6 +4,7 @@
  *
  */
 
+#include <linux/bitfield.h>
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
 #include <linux/device.h>
@@ -35,6 +36,9 @@
 
 #define CACHE_LINE_SIZE_SHIFT         6
 
+#define LLCC_COMMON_HW_INFO           0x00030000
+#define LLCC_MAJOR_VERSION_MASK       GENMASK(31, 24)
+
 #define LLCC_COMMON_STATUS0           0x0003000c
 #define LLCC_LB_CNT_MASK              GENMASK(31, 28)
 #define LLCC_LB_CNT_SHIFT             28
@@ -476,6 +480,7 @@ static int qcom_llcc_probe(struct platform_device *pdev)
 	const struct qcom_llcc_config *cfg;
 	const struct llcc_slice_config *llcc_cfg;
 	u32 sz;
+	u32 version;
 
 	drv_data = devm_kzalloc(dev, sizeof(*drv_data), GFP_KERNEL);
 	if (!drv_data) {
@@ -496,6 +501,13 @@ static int qcom_llcc_probe(struct platform_device *pdev)
 		goto err;
 	}
 
+	/* Extract major version of the IP */
+	ret = regmap_read(drv_data->bcast_regmap, LLCC_COMMON_HW_INFO, &version);
+	if (ret)
+		goto err;
+
+	drv_data->major_version = FIELD_GET(LLCC_MAJOR_VERSION_MASK, version);
+
 	ret = regmap_read(drv_data->regmap, LLCC_COMMON_STATUS0,
 						&num_banks);
 	if (ret)
diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index 3db6797ba6ff..d17a3de80510 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -79,6 +79,7 @@ struct llcc_edac_reg_data {
  * @bitmap: Bit map to track the active slice ids
  * @offsets: Pointer to the bank offsets array
  * @ecc_irq: interrupt for llcc cache error detection and reporting
+ * @major_version: Indicates the LLCC major version
  */
 struct llcc_drv_data {
 	struct regmap *regmap;
@@ -91,6 +92,7 @@ struct llcc_drv_data {
 	unsigned long *bitmap;
 	u32 *offsets;
 	int ecc_irq;
+	u32 major_version;
 };
 
 #if IS_ENABLED(CONFIG_QCOM_LLCC)
-- 
cgit v1.2.3


From c4df37fe186de4df8895a7a4793f5221eda6e5ae Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Mon, 30 Nov 2020 15:09:24 +0530
Subject: soc: qcom: llcc-qcom: Add support for SM8250 SoC

SM8250 SoC uses LLCC IP version 2. In this version, the WRSC_EN register
needs to be written to enable the Write Sub Cache for each SCID. Hence,
use a dedicated "write_scid_en" member with predefined values and write
them for LLCC IP version 2.

Reviewed-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/20201130093924.45057-5-manivannan.sadhasivam@linaro.org
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/soc/qcom/llcc-qcom.c       | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/soc/qcom/llcc-qcom.h |  1 +
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c
index a559617ea7c0..8403a77b59fe 100644
--- a/drivers/soc/qcom/llcc-qcom.c
+++ b/drivers/soc/qcom/llcc-qcom.c
@@ -51,6 +51,7 @@
 
 #define LLCC_TRP_SCID_DIS_CAP_ALLOC   0x21f00
 #define LLCC_TRP_PCB_ACT              0x21f04
+#define LLCC_TRP_WRSC_EN              0x21f20
 
 #define BANK_OFFSET_STRIDE	      0x80000
 
@@ -77,6 +78,7 @@
  *               then the ways assigned to this client are not flushed on power
  *               collapse.
  * @activate_on_init: Activate the slice immediately after it is programmed
+ * @write_scid_en: Bit enables write cache support for a given scid.
  */
 struct llcc_slice_config {
 	u32 usecase_id;
@@ -91,6 +93,7 @@ struct llcc_slice_config {
 	bool dis_cap_alloc;
 	bool retain_on_pc;
 	bool activate_on_init;
+	bool write_scid_en;
 };
 
 struct qcom_llcc_config {
@@ -151,6 +154,25 @@ static const struct llcc_slice_config sm8150_data[] =  {
 	{  LLCC_WRCACHE, 31, 128,  1, 1, 0xFFF, 0x0,   0, 0, 0, 0, 0 },
 };
 
+static const struct llcc_slice_config sm8250_data[] =  {
+	{ LLCC_CPUSS,    1, 3072, 1, 1, 0xfff, 0x0, 0, 0, 0, 1, 1, 0 },
+	{ LLCC_VIDSC0,   2, 512,  3, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
+	{ LLCC_AUDIO,    6, 1024, 1, 0, 0xfff, 0x0, 0, 0, 0, 0, 0, 0 },
+	{ LLCC_CMPT,    10, 1024, 1, 0, 0xfff, 0x0, 0, 0, 0, 0, 0, 0 },
+	{ LLCC_GPUHTW,  11, 1024, 1, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
+	{ LLCC_GPU,     12, 1024, 1, 0, 0xfff, 0x0, 0, 0, 0, 1, 0, 1 },
+	{ LLCC_MMUHWT,  13, 1024, 1, 1, 0xfff, 0x0, 0, 0, 0, 0, 1, 0 },
+	{ LLCC_CMPTDMA, 15, 1024, 1, 0, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
+	{ LLCC_DISP,    16, 3072, 1, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
+	{ LLCC_VIDFW,   17, 512,  1, 0, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
+	{ LLCC_AUDHW,   22, 1024, 1, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
+	{ LLCC_NPU,     23, 3072, 1, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
+	{ LLCC_WLHW,    24, 1024, 1, 0, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
+	{ LLCC_CVP,     28, 256,  3, 1, 0xfff, 0x0, 0, 0, 0, 1, 0, 0 },
+	{ LLCC_APTCM,   30, 128,  3, 0, 0x0,   0x3, 1, 0, 0, 1, 0, 0 },
+	{ LLCC_WRCACHE, 31, 256,  1, 1, 0xfff, 0x0, 0, 0, 0, 0, 1, 0 },
+};
+
 static const struct qcom_llcc_config sc7180_cfg = {
 	.sct_data	= sc7180_data,
 	.size		= ARRAY_SIZE(sc7180_data),
@@ -168,6 +190,11 @@ static const struct qcom_llcc_config sm8150_cfg = {
 	.size           = ARRAY_SIZE(sm8150_data),
 };
 
+static const struct qcom_llcc_config sm8250_cfg = {
+	.sct_data       = sm8250_data,
+	.size           = ARRAY_SIZE(sm8250_data),
+};
+
 static struct llcc_drv_data *drv_data = (void *) -EPROBE_DEFER;
 
 /**
@@ -417,6 +444,16 @@ static int _qcom_llcc_cfg_program(const struct llcc_slice_config *config,
 			return ret;
 	}
 
+	if (drv_data->major_version == 2) {
+		u32 wren;
+
+		wren = config->write_scid_en << config->slice_id;
+		ret = regmap_update_bits(drv_data->bcast_regmap, LLCC_TRP_WRSC_EN,
+					 BIT(config->slice_id), wren);
+		if (ret)
+			return ret;
+	}
+
 	if (config->activate_on_init) {
 		desc.slice_id = config->slice_id;
 		ret = llcc_slice_activate(&desc);
@@ -571,6 +608,7 @@ static const struct of_device_id qcom_llcc_of_match[] = {
 	{ .compatible = "qcom,sc7180-llcc", .data = &sc7180_cfg },
 	{ .compatible = "qcom,sdm845-llcc", .data = &sdm845_cfg },
 	{ .compatible = "qcom,sm8150-llcc", .data = &sm8150_cfg },
+	{ .compatible = "qcom,sm8250-llcc", .data = &sm8250_cfg },
 	{ }
 };
 
diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index d17a3de80510..64fc582ae415 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -29,6 +29,7 @@
 #define LLCC_AUDHW       22
 #define LLCC_NPU         23
 #define LLCC_WLHW        24
+#define LLCC_CVP         28
 #define LLCC_MODPE       29
 #define LLCC_APTCM       30
 #define LLCC_WRCACHE     31
-- 
cgit v1.2.3


From a2ff95e018f1d2bc816f3078d5110a655e355f18 Mon Sep 17 00:00:00 2001
From: Mark Pearson <markpearson@lenovo.com>
Date: Tue, 29 Dec 2020 19:18:26 -0500
Subject: ACPI: platform: Add platform profile support

This is the initial implementation of the platform-profile feature.
It provides the details discussed and outlined in the
sysfs-platform_profile document.

Many modern systems have the ability to modify the operating profile to
control aspects like fan speed, temperature and power levels. This
module provides a common sysfs interface that platform modules can register
against to control their individual profile options.

Signed-off-by: Mark Pearson <markpearson@lenovo.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
[ rjw: Use full words in enum values names ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/Kconfig             |  17 ++++
 drivers/acpi/Makefile            |   1 +
 drivers/acpi/platform_profile.c  | 181 +++++++++++++++++++++++++++++++++++++++
 include/linux/platform_profile.h |  39 +++++++++
 4 files changed, 238 insertions(+)
 create mode 100644 drivers/acpi/platform_profile.c
 create mode 100644 include/linux/platform_profile.h

(limited to 'include/linux')

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index edf1558c1105..5ddff93e38c2 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -326,6 +326,23 @@ config ACPI_THERMAL
 	  To compile this driver as a module, choose M here:
 	  the module will be called thermal.
 
+config ACPI_PLATFORM_PROFILE
+	tristate "ACPI Platform Profile Driver"
+	default m
+	help
+	  This driver adds support for platform-profiles on platforms that
+	  support it.
+
+	  Platform-profiles can be used to control the platform behaviour. For
+	  example whether to operate in a lower power mode, in a higher
+	  power performance mode or between the two.
+
+	  This driver provides the sysfs interface and is used as the registration
+	  point for platform specific drivers.
+
+	  Which profiles are supported is determined on a per-platform basis and
+	  should be obtained from the platform specific driver.
+
 config ACPI_CUSTOM_DSDT_FILE
 	string "Custom DSDT Table file to include"
 	default ""
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 076894a3330f..52b627c7f977 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -79,6 +79,7 @@ obj-$(CONFIG_ACPI_PCI_SLOT)	+= pci_slot.o
 obj-$(CONFIG_ACPI_PROCESSOR)	+= processor.o
 obj-$(CONFIG_ACPI)		+= container.o
 obj-$(CONFIG_ACPI_THERMAL)	+= thermal.o
+obj-$(CONFIG_ACPI_PLATFORM_PROFILE) 	+= platform_profile.o
 obj-$(CONFIG_ACPI_NFIT)		+= nfit/
 obj-$(CONFIG_ACPI_NUMA)		+= numa/
 obj-$(CONFIG_ACPI)		+= acpi_memhotplug.o
diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
new file mode 100644
index 000000000000..91be50a32cc8
--- /dev/null
+++ b/drivers/acpi/platform_profile.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* Platform profile sysfs interface */
+
+#include <linux/acpi.h>
+#include <linux/bits.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/platform_profile.h>
+#include <linux/sysfs.h>
+
+static const struct platform_profile_handler *cur_profile;
+static DEFINE_MUTEX(profile_lock);
+
+static const char * const profile_names[] = {
+	[PLATFORM_PROFILE_LOW_POWER] = "low-power",
+	[PLATFORM_PROFILE_COOL] = "cool",
+	[PLATFORM_PROFILE_QUIET] = "quiet",
+	[PLATFORM_PROFILE_BALANCED] = "balanced",
+	[PLATFORM_PROFILE_PERFORMANCE] = "performance",
+};
+static_assert(ARRAY_SIZE(profile_names) == PLATFORM_PROFILE_LAST);
+
+static ssize_t platform_profile_choices_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	int len = 0;
+	int err, i;
+
+	err = mutex_lock_interruptible(&profile_lock);
+	if (err)
+		return err;
+
+	if (!cur_profile) {
+		mutex_unlock(&profile_lock);
+		return -ENODEV;
+	}
+
+	for_each_set_bit(i, cur_profile->choices, PLATFORM_PROFILE_LAST) {
+		if (len == 0)
+			len += sysfs_emit_at(buf, len, "%s", profile_names[i]);
+		else
+			len += sysfs_emit_at(buf, len, " %s", profile_names[i]);
+	}
+	len += sysfs_emit_at(buf, len, "\n");
+	mutex_unlock(&profile_lock);
+	return len;
+}
+
+static ssize_t platform_profile_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	enum platform_profile_option profile = PLATFORM_PROFILE_BALANCED;
+	int err;
+
+	err = mutex_lock_interruptible(&profile_lock);
+	if (err)
+		return err;
+
+	if (!cur_profile) {
+		mutex_unlock(&profile_lock);
+		return -ENODEV;
+	}
+
+	err = cur_profile->profile_get(&profile);
+	mutex_unlock(&profile_lock);
+	if (err)
+		return err;
+
+	/* Check that profile is valid index */
+	if (WARN_ON((profile < 0) || (profile >= ARRAY_SIZE(profile_names))))
+		return -EIO;
+
+	return sysfs_emit(buf, "%s\n", profile_names[profile]);
+}
+
+static ssize_t platform_profile_store(struct device *dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	int err, i;
+
+	err = mutex_lock_interruptible(&profile_lock);
+	if (err)
+		return err;
+
+	if (!cur_profile) {
+		mutex_unlock(&profile_lock);
+		return -ENODEV;
+	}
+
+	/* Scan for a matching profile */
+	i = sysfs_match_string(profile_names, buf);
+	if (i < 0) {
+		mutex_unlock(&profile_lock);
+		return -EINVAL;
+	}
+
+	/* Check that platform supports this profile choice */
+	if (!test_bit(i, cur_profile->choices)) {
+		mutex_unlock(&profile_lock);
+		return -EOPNOTSUPP;
+	}
+
+	err = cur_profile->profile_set(i);
+	mutex_unlock(&profile_lock);
+	if (err)
+		return err;
+	return count;
+}
+
+static DEVICE_ATTR_RO(platform_profile_choices);
+static DEVICE_ATTR_RW(platform_profile);
+
+static struct attribute *platform_profile_attrs[] = {
+	&dev_attr_platform_profile_choices.attr,
+	&dev_attr_platform_profile.attr,
+	NULL
+};
+
+static const struct attribute_group platform_profile_group = {
+	.attrs = platform_profile_attrs
+};
+
+void platform_profile_notify(void)
+{
+	if (!cur_profile)
+		return;
+	sysfs_notify(acpi_kobj, NULL, "platform_profile");
+}
+EXPORT_SYMBOL_GPL(platform_profile_notify);
+
+int platform_profile_register(const struct platform_profile_handler *pprof)
+{
+	int err;
+
+	mutex_lock(&profile_lock);
+	/* We can only have one active profile */
+	if (cur_profile) {
+		mutex_unlock(&profile_lock);
+		return -EEXIST;
+	}
+
+	/* Sanity check the profile handler field are set */
+	if (!pprof || bitmap_empty(pprof->choices, PLATFORM_PROFILE_LAST) ||
+		!pprof->profile_set || !pprof->profile_get) {
+		mutex_unlock(&profile_lock);
+		return -EINVAL;
+	}
+
+	err = sysfs_create_group(acpi_kobj, &platform_profile_group);
+	if (err) {
+		mutex_unlock(&profile_lock);
+		return err;
+	}
+
+	cur_profile = pprof;
+	mutex_unlock(&profile_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(platform_profile_register);
+
+int platform_profile_remove(void)
+{
+	mutex_lock(&profile_lock);
+	if (!cur_profile) {
+		mutex_unlock(&profile_lock);
+		return -ENODEV;
+	}
+
+	sysfs_remove_group(acpi_kobj, &platform_profile_group);
+	cur_profile = NULL;
+	mutex_unlock(&profile_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(platform_profile_remove);
+
+MODULE_AUTHOR("Mark Pearson <markpearson@lenovo.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
new file mode 100644
index 000000000000..3623d7108421
--- /dev/null
+++ b/include/linux/platform_profile.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Platform profile sysfs interface
+ *
+ * See Documentation/ABI/testing/sysfs-platform_profile.rst for more
+ * information.
+ */
+
+#ifndef _PLATFORM_PROFILE_H_
+#define _PLATFORM_PROFILE_H_
+
+#include <linux/bitops.h>
+
+/*
+ * If more options are added please update profile_names
+ * array in platform-profile.c and sysfs-platform-profile.rst
+ * documentation.
+ */
+
+enum platform_profile_option {
+	PLATFORM_PROFILE_LOW_POWER,
+	PLATFORM_PROFILE_COOL,
+	PLATFORM_PROFILE_QUIET,
+	PLATFORM_PROFILE_BALANCED,
+	PLATFORM_PROFILE_PERFORMANCE,
+	PLATFORM_PROFILE_LAST, /*must always be last */
+};
+
+struct platform_profile_handler {
+	unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
+	int (*profile_get)(enum platform_profile_option *profile);
+	int (*profile_set)(enum platform_profile_option profile);
+};
+
+int platform_profile_register(const struct platform_profile_handler *pprof);
+int platform_profile_remove(void);
+void platform_profile_notify(void);
+
+#endif  /*_PLATFORM_PROFILE_H_*/
-- 
cgit v1.2.3


From 0eb76ba29d16df2951d37c54ca279c4e5630b071 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 11 Dec 2020 13:27:15 +0100
Subject: crypto: remove cipher routines from public crypto API

The cipher routines in the crypto API are mostly intended for templates
implementing skcipher modes generically in software, and shouldn't be
used outside of the crypto subsystem. So move the prototypes and all
related definitions to a new header file under include/crypto/internal.
Also, let's use the new module namespace feature to move the symbol
exports into a new namespace CRYPTO_INTERNAL.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/crypto/api-skcipher.rst        |   4 +-
 arch/arm/crypto/aes-neonbs-glue.c            |   3 +
 arch/s390/crypto/aes_s390.c                  |   2 +
 crypto/adiantum.c                            |   2 +
 crypto/ansi_cprng.c                          |   2 +
 crypto/cbc.c                                 |   1 +
 crypto/ccm.c                                 |   2 +
 crypto/cfb.c                                 |   2 +
 crypto/cipher.c                              |   7 +-
 crypto/cmac.c                                |   2 +
 crypto/ctr.c                                 |   2 +
 crypto/drbg.c                                |   2 +
 crypto/ecb.c                                 |   1 +
 crypto/essiv.c                               |   2 +
 crypto/keywrap.c                             |   2 +
 crypto/ofb.c                                 |   2 +
 crypto/pcbc.c                                |   2 +
 crypto/skcipher.c                            |   2 +
 crypto/testmgr.c                             |   3 +
 crypto/vmac.c                                |   2 +
 crypto/xcbc.c                                |   2 +
 crypto/xts.c                                 |   2 +
 drivers/crypto/geode-aes.c                   |   2 +
 drivers/crypto/inside-secure/safexcel.c      |   1 +
 drivers/crypto/inside-secure/safexcel_hash.c |   1 +
 drivers/crypto/qat/qat_common/adf_ctl_drv.c  |   1 +
 drivers/crypto/qat/qat_common/qat_algs.c     |   1 +
 drivers/crypto/vmx/aes.c                     |   1 +
 drivers/crypto/vmx/vmx.c                     |   1 +
 include/crypto/algapi.h                      |  39 -----
 include/crypto/internal/cipher.h             | 218 +++++++++++++++++++++++++++
 include/crypto/internal/skcipher.h           |   1 +
 include/linux/crypto.h                       | 163 --------------------
 33 files changed, 273 insertions(+), 207 deletions(-)
 create mode 100644 include/crypto/internal/cipher.h

(limited to 'include/linux')

diff --git a/Documentation/crypto/api-skcipher.rst b/Documentation/crypto/api-skcipher.rst
index 1aaf8985894b..04d6cc5357c8 100644
--- a/Documentation/crypto/api-skcipher.rst
+++ b/Documentation/crypto/api-skcipher.rst
@@ -28,8 +28,8 @@ Symmetric Key Cipher Request Handle
 Single Block Cipher API
 -----------------------
 
-.. kernel-doc:: include/linux/crypto.h
+.. kernel-doc:: include/crypto/internal/cipher.h
    :doc: Single Block Cipher API
 
-.. kernel-doc:: include/linux/crypto.h
+.. kernel-doc:: include/crypto/internal/cipher.h
    :functions: crypto_alloc_cipher crypto_free_cipher crypto_has_cipher crypto_cipher_blocksize crypto_cipher_setkey crypto_cipher_encrypt_one crypto_cipher_decrypt_one
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index f70af1d0514b..5c6cd3c63cbc 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -9,6 +9,7 @@
 #include <asm/simd.h>
 #include <crypto/aes.h>
 #include <crypto/ctr.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
@@ -23,6 +24,8 @@ MODULE_ALIAS_CRYPTO("cbc(aes)-all");
 MODULE_ALIAS_CRYPTO("ctr(aes)");
 MODULE_ALIAS_CRYPTO("xts(aes)");
 
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
+
 asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
 
 asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index 73044634d342..54c7536f2482 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -21,6 +21,7 @@
 #include <crypto/algapi.h>
 #include <crypto/ghash.h>
 #include <crypto/internal/aead.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <linux/err.h>
@@ -1055,3 +1056,4 @@ MODULE_ALIAS_CRYPTO("aes-all");
 
 MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
 MODULE_LICENSE("GPL");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/adiantum.c b/crypto/adiantum.c
index ce4d5725342c..84450130cb6b 100644
--- a/crypto/adiantum.c
+++ b/crypto/adiantum.c
@@ -32,6 +32,7 @@
 
 #include <crypto/b128ops.h>
 #include <crypto/chacha.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/hash.h>
 #include <crypto/internal/poly1305.h>
 #include <crypto/internal/skcipher.h>
@@ -616,3 +617,4 @@ MODULE_DESCRIPTION("Adiantum length-preserving encryption mode");
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
 MODULE_ALIAS_CRYPTO("adiantum");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/ansi_cprng.c b/crypto/ansi_cprng.c
index c475c1129ff2..3f512efaba3a 100644
--- a/crypto/ansi_cprng.c
+++ b/crypto/ansi_cprng.c
@@ -7,6 +7,7 @@
  *  (C) Neil Horman <nhorman@tuxdriver.com>
  */
 
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/rng.h>
 #include <linux/err.h>
 #include <linux/init.h>
@@ -470,3 +471,4 @@ subsys_initcall(prng_mod_init);
 module_exit(prng_mod_fini);
 MODULE_ALIAS_CRYPTO("stdrng");
 MODULE_ALIAS_CRYPTO("ansi_cprng");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/cbc.c b/crypto/cbc.c
index 0d9509dff891..6c03e96b945f 100644
--- a/crypto/cbc.c
+++ b/crypto/cbc.c
@@ -6,6 +6,7 @@
  */
 
 #include <crypto/algapi.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/err.h>
 #include <linux/init.h>
diff --git a/crypto/ccm.c b/crypto/ccm.c
index 494d70901186..6b815ece51c6 100644
--- a/crypto/ccm.c
+++ b/crypto/ccm.c
@@ -6,6 +6,7 @@
  */
 
 #include <crypto/internal/aead.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/hash.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
@@ -954,3 +955,4 @@ MODULE_ALIAS_CRYPTO("ccm_base");
 MODULE_ALIAS_CRYPTO("rfc4309");
 MODULE_ALIAS_CRYPTO("ccm");
 MODULE_ALIAS_CRYPTO("cbcmac");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/cfb.c b/crypto/cfb.c
index 4e5219bbcd19..0d664dfb47bc 100644
--- a/crypto/cfb.c
+++ b/crypto/cfb.c
@@ -20,6 +20,7 @@
  */
 
 #include <crypto/algapi.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/err.h>
 #include <linux/init.h>
@@ -250,3 +251,4 @@ module_exit(crypto_cfb_module_exit);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("CFB block cipher mode of operation");
 MODULE_ALIAS_CRYPTO("cfb");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/cipher.c b/crypto/cipher.c
index fd78150deb1c..b47141ed4a9f 100644
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -9,6 +9,7 @@
  */
 
 #include <crypto/algapi.h>
+#include <crypto/internal/cipher.h>
 #include <linux/kernel.h>
 #include <linux/crypto.h>
 #include <linux/errno.h>
@@ -53,7 +54,7 @@ int crypto_cipher_setkey(struct crypto_cipher *tfm,
 
 	return cia->cia_setkey(crypto_cipher_tfm(tfm), key, keylen);
 }
-EXPORT_SYMBOL_GPL(crypto_cipher_setkey);
+EXPORT_SYMBOL_NS_GPL(crypto_cipher_setkey, CRYPTO_INTERNAL);
 
 static inline void cipher_crypt_one(struct crypto_cipher *tfm,
 				    u8 *dst, const u8 *src, bool enc)
@@ -81,11 +82,11 @@ void crypto_cipher_encrypt_one(struct crypto_cipher *tfm,
 {
 	cipher_crypt_one(tfm, dst, src, true);
 }
-EXPORT_SYMBOL_GPL(crypto_cipher_encrypt_one);
+EXPORT_SYMBOL_NS_GPL(crypto_cipher_encrypt_one, CRYPTO_INTERNAL);
 
 void crypto_cipher_decrypt_one(struct crypto_cipher *tfm,
 			       u8 *dst, const u8 *src)
 {
 	cipher_crypt_one(tfm, dst, src, false);
 }
-EXPORT_SYMBOL_GPL(crypto_cipher_decrypt_one);
+EXPORT_SYMBOL_NS_GPL(crypto_cipher_decrypt_one, CRYPTO_INTERNAL);
diff --git a/crypto/cmac.c b/crypto/cmac.c
index df36be1efb81..f4a5d3bfb376 100644
--- a/crypto/cmac.c
+++ b/crypto/cmac.c
@@ -11,6 +11,7 @@
  *   Author: Kazunori Miyazawa <miyazawa@linux-ipv6.org>
  */
 
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/hash.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
@@ -313,3 +314,4 @@ module_exit(crypto_cmac_module_exit);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("CMAC keyed hash algorithm");
 MODULE_ALIAS_CRYPTO("cmac");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/ctr.c b/crypto/ctr.c
index c39fcffba27f..23c698b22013 100644
--- a/crypto/ctr.c
+++ b/crypto/ctr.c
@@ -7,6 +7,7 @@
 
 #include <crypto/algapi.h>
 #include <crypto/ctr.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/err.h>
 #include <linux/init.h>
@@ -358,3 +359,4 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("CTR block cipher mode of operation");
 MODULE_ALIAS_CRYPTO("rfc3686");
 MODULE_ALIAS_CRYPTO("ctr");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/drbg.c b/crypto/drbg.c
index 3132967a1749..1b4587e0ddad 100644
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
@@ -98,6 +98,7 @@
  */
 
 #include <crypto/drbg.h>
+#include <crypto/internal/cipher.h>
 #include <linux/kernel.h>
 
 /***************************************************************
@@ -2161,3 +2162,4 @@ MODULE_DESCRIPTION("NIST SP800-90A Deterministic Random Bit Generator (DRBG) "
 		   CRYPTO_DRBG_HMAC_STRING
 		   CRYPTO_DRBG_CTR_STRING);
 MODULE_ALIAS_CRYPTO("stdrng");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/ecb.c b/crypto/ecb.c
index 69a687cbdf21..71fbb0543d64 100644
--- a/crypto/ecb.c
+++ b/crypto/ecb.c
@@ -6,6 +6,7 @@
  */
 
 #include <crypto/algapi.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/err.h>
 #include <linux/init.h>
diff --git a/crypto/essiv.c b/crypto/essiv.c
index d012be23d496..8bcc5bdcb2a9 100644
--- a/crypto/essiv.c
+++ b/crypto/essiv.c
@@ -30,6 +30,7 @@
 
 #include <crypto/authenc.h>
 #include <crypto/internal/aead.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/hash.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
@@ -643,3 +644,4 @@ module_exit(essiv_module_exit);
 MODULE_DESCRIPTION("ESSIV skcipher/aead wrapper for block encryption");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS_CRYPTO("essiv");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/keywrap.c b/crypto/keywrap.c
index 0355cce21b1e..3517773bc7f7 100644
--- a/crypto/keywrap.c
+++ b/crypto/keywrap.c
@@ -85,6 +85,7 @@
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
 #include <crypto/scatterwalk.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 
 struct crypto_kw_block {
@@ -316,3 +317,4 @@ MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>");
 MODULE_DESCRIPTION("Key Wrapping (RFC3394 / NIST SP800-38F)");
 MODULE_ALIAS_CRYPTO("kw");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/ofb.c b/crypto/ofb.c
index 2ec68e3f2c55..b630fdecceee 100644
--- a/crypto/ofb.c
+++ b/crypto/ofb.c
@@ -8,6 +8,7 @@
  */
 
 #include <crypto/algapi.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/err.h>
 #include <linux/init.h>
@@ -102,3 +103,4 @@ module_exit(crypto_ofb_module_exit);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("OFB block cipher mode of operation");
 MODULE_ALIAS_CRYPTO("ofb");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/pcbc.c b/crypto/pcbc.c
index ae921fb74dc9..7030f59e46b6 100644
--- a/crypto/pcbc.c
+++ b/crypto/pcbc.c
@@ -10,6 +10,7 @@
  */
 
 #include <crypto/algapi.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/err.h>
 #include <linux/init.h>
@@ -191,3 +192,4 @@ module_exit(crypto_pcbc_module_exit);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("PCBC block cipher mode of operation");
 MODULE_ALIAS_CRYPTO("pcbc");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index b4dae640de9f..ff16d05644c7 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -10,6 +10,7 @@
  */
 
 #include <crypto/internal/aead.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <linux/bug.h>
@@ -986,3 +987,4 @@ EXPORT_SYMBOL_GPL(skcipher_alloc_instance_simple);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Symmetric key cipher type");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 321e38eef51b..a896d77e9611 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -33,10 +33,13 @@
 #include <crypto/akcipher.h>
 #include <crypto/kpp.h>
 #include <crypto/acompress.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/simd.h>
 
 #include "internal.h"
 
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
+
 static bool notests;
 module_param(notests, bool, 0644);
 MODULE_PARM_DESC(notests, "disable crypto self-tests");
diff --git a/crypto/vmac.c b/crypto/vmac.c
index 9b565d1040d6..4633b2dda1e0 100644
--- a/crypto/vmac.c
+++ b/crypto/vmac.c
@@ -36,6 +36,7 @@
 #include <linux/scatterlist.h>
 #include <asm/byteorder.h>
 #include <crypto/scatterwalk.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/hash.h>
 
 /*
@@ -693,3 +694,4 @@ module_exit(vmac_module_exit);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("VMAC hash algorithm");
 MODULE_ALIAS_CRYPTO("vmac64");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/xcbc.c b/crypto/xcbc.c
index af3b7eb5d7c7..6074c5c1da49 100644
--- a/crypto/xcbc.c
+++ b/crypto/xcbc.c
@@ -6,6 +6,7 @@
  * 	Kazunori Miyazawa <miyazawa@linux-ipv6.org>
  */
 
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/hash.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
@@ -272,3 +273,4 @@ module_exit(crypto_xcbc_module_exit);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("XCBC keyed hash algorithm");
 MODULE_ALIAS_CRYPTO("xcbc");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/crypto/xts.c b/crypto/xts.c
index ad45b009774b..6c12f30dbdd6 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -7,6 +7,7 @@
  * Based on ecb.c
  * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
  */
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <linux/err.h>
@@ -464,3 +465,4 @@ module_exit(xts_module_exit);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("XTS block cipher mode");
 MODULE_ALIAS_CRYPTO("xts");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/drivers/crypto/geode-aes.c b/drivers/crypto/geode-aes.c
index f4f18bfc2247..4ee010f39912 100644
--- a/drivers/crypto/geode-aes.c
+++ b/drivers/crypto/geode-aes.c
@@ -10,6 +10,7 @@
 #include <linux/spinlock.h>
 #include <crypto/algapi.h>
 #include <crypto/aes.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 
 #include <linux/io.h>
@@ -434,3 +435,4 @@ module_pci_driver(geode_aes_driver);
 MODULE_AUTHOR("Advanced Micro Devices, Inc.");
 MODULE_DESCRIPTION("Geode LX Hardware AES driver");
 MODULE_LICENSE("GPL");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c
index 2e1562108a85..30aedfcfee7c 100644
--- a/drivers/crypto/inside-secure/safexcel.c
+++ b/drivers/crypto/inside-secure/safexcel.c
@@ -1999,3 +1999,4 @@ MODULE_AUTHOR("Ofer Heifetz <oferh@marvell.com>");
 MODULE_AUTHOR("Igal Liberman <igall@marvell.com>");
 MODULE_DESCRIPTION("Support for SafeXcel cryptographic engines: EIP97 & EIP197");
 MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c
index 50fb6d90a2e0..bc60b5802256 100644
--- a/drivers/crypto/inside-secure/safexcel_hash.c
+++ b/drivers/crypto/inside-secure/safexcel_hash.c
@@ -13,6 +13,7 @@
 #include <crypto/sha3.h>
 #include <crypto/skcipher.h>
 #include <crypto/sm3.h>
+#include <crypto/internal/cipher.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
 #include <linux/dmapool.h>
diff --git a/drivers/crypto/qat/qat_common/adf_ctl_drv.c b/drivers/crypto/qat/qat_common/adf_ctl_drv.c
index eb9b3be9d8eb..96b437bfe3de 100644
--- a/drivers/crypto/qat/qat_common/adf_ctl_drv.c
+++ b/drivers/crypto/qat/qat_common/adf_ctl_drv.c
@@ -464,3 +464,4 @@ MODULE_AUTHOR("Intel");
 MODULE_DESCRIPTION("Intel(R) QuickAssist Technology");
 MODULE_ALIAS_CRYPTO("intel_qat");
 MODULE_VERSION(ADF_DRV_VERSION);
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c
index 31c7a206a629..ff78c73c47e3 100644
--- a/drivers/crypto/qat/qat_common/qat_algs.c
+++ b/drivers/crypto/qat/qat_common/qat_algs.c
@@ -4,6 +4,7 @@
 #include <linux/slab.h>
 #include <linux/crypto.h>
 #include <crypto/internal/aead.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/aes.h>
 #include <crypto/sha1.h>
diff --git a/drivers/crypto/vmx/aes.c b/drivers/crypto/vmx/aes.c
index 2bc5d4e1adf4..d05c02baebcf 100644
--- a/drivers/crypto/vmx/aes.c
+++ b/drivers/crypto/vmx/aes.c
@@ -14,6 +14,7 @@
 #include <asm/simd.h>
 #include <asm/switch_to.h>
 #include <crypto/aes.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/internal/simd.h>
 
 #include "aesp8-ppc.h"
diff --git a/drivers/crypto/vmx/vmx.c b/drivers/crypto/vmx/vmx.c
index 3e0335fb406c..87a194455d6a 100644
--- a/drivers/crypto/vmx/vmx.c
+++ b/drivers/crypto/vmx/vmx.c
@@ -78,3 +78,4 @@ MODULE_DESCRIPTION("IBM VMX cryptographic acceleration instructions "
 		   "support on Power 8");
 MODULE_LICENSE("GPL");
 MODULE_VERSION("1.0.0");
+MODULE_IMPORT_NS(CRYPTO_INTERNAL);
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 18dd7a4aaf7d..86f0748009af 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -189,45 +189,6 @@ static inline void *crypto_instance_ctx(struct crypto_instance *inst)
 	return inst->__ctx;
 }
 
-struct crypto_cipher_spawn {
-	struct crypto_spawn base;
-};
-
-static inline int crypto_grab_cipher(struct crypto_cipher_spawn *spawn,
-				     struct crypto_instance *inst,
-				     const char *name, u32 type, u32 mask)
-{
-	type &= ~CRYPTO_ALG_TYPE_MASK;
-	type |= CRYPTO_ALG_TYPE_CIPHER;
-	mask |= CRYPTO_ALG_TYPE_MASK;
-	return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
-}
-
-static inline void crypto_drop_cipher(struct crypto_cipher_spawn *spawn)
-{
-	crypto_drop_spawn(&spawn->base);
-}
-
-static inline struct crypto_alg *crypto_spawn_cipher_alg(
-	struct crypto_cipher_spawn *spawn)
-{
-	return spawn->base.alg;
-}
-
-static inline struct crypto_cipher *crypto_spawn_cipher(
-	struct crypto_cipher_spawn *spawn)
-{
-	u32 type = CRYPTO_ALG_TYPE_CIPHER;
-	u32 mask = CRYPTO_ALG_TYPE_MASK;
-
-	return __crypto_cipher_cast(crypto_spawn_tfm(&spawn->base, type, mask));
-}
-
-static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm)
-{
-	return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher;
-}
-
 static inline struct crypto_async_request *crypto_get_backlog(
 	struct crypto_queue *queue)
 {
diff --git a/include/crypto/internal/cipher.h b/include/crypto/internal/cipher.h
new file mode 100644
index 000000000000..a9174ba90250
--- /dev/null
+++ b/include/crypto/internal/cipher.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
+ * and Nettle, by Niels Möller.
+ */
+
+#ifndef _CRYPTO_INTERNAL_CIPHER_H
+#define _CRYPTO_INTERNAL_CIPHER_H
+
+#include <crypto/algapi.h>
+
+struct crypto_cipher {
+	struct crypto_tfm base;
+};
+
+/**
+ * DOC: Single Block Cipher API
+ *
+ * The single block cipher API is used with the ciphers of type
+ * CRYPTO_ALG_TYPE_CIPHER (listed as type "cipher" in /proc/crypto).
+ *
+ * Using the single block cipher API calls, operations with the basic cipher
+ * primitive can be implemented. These cipher primitives exclude any block
+ * chaining operations including IV handling.
+ *
+ * The purpose of this single block cipher API is to support the implementation
+ * of templates or other concepts that only need to perform the cipher operation
+ * on one block at a time. Templates invoke the underlying cipher primitive
+ * block-wise and process either the input or the output data of these cipher
+ * operations.
+ */
+
+static inline struct crypto_cipher *__crypto_cipher_cast(struct crypto_tfm *tfm)
+{
+	return (struct crypto_cipher *)tfm;
+}
+
+/**
+ * crypto_alloc_cipher() - allocate single block cipher handle
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
+ *	     single block cipher
+ * @type: specifies the type of the cipher
+ * @mask: specifies the mask for the cipher
+ *
+ * Allocate a cipher handle for a single block cipher. The returned struct
+ * crypto_cipher is the cipher handle that is required for any subsequent API
+ * invocation for that single block cipher.
+ *
+ * Return: allocated cipher handle in case of success; IS_ERR() is true in case
+ *	   of an error, PTR_ERR() returns the error code.
+ */
+static inline struct crypto_cipher *crypto_alloc_cipher(const char *alg_name,
+							u32 type, u32 mask)
+{
+	type &= ~CRYPTO_ALG_TYPE_MASK;
+	type |= CRYPTO_ALG_TYPE_CIPHER;
+	mask |= CRYPTO_ALG_TYPE_MASK;
+
+	return __crypto_cipher_cast(crypto_alloc_base(alg_name, type, mask));
+}
+
+static inline struct crypto_tfm *crypto_cipher_tfm(struct crypto_cipher *tfm)
+{
+	return &tfm->base;
+}
+
+/**
+ * crypto_free_cipher() - zeroize and free the single block cipher handle
+ * @tfm: cipher handle to be freed
+ */
+static inline void crypto_free_cipher(struct crypto_cipher *tfm)
+{
+	crypto_free_tfm(crypto_cipher_tfm(tfm));
+}
+
+/**
+ * crypto_has_cipher() - Search for the availability of a single block cipher
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
+ *	     single block cipher
+ * @type: specifies the type of the cipher
+ * @mask: specifies the mask for the cipher
+ *
+ * Return: true when the single block cipher is known to the kernel crypto API;
+ *	   false otherwise
+ */
+static inline int crypto_has_cipher(const char *alg_name, u32 type, u32 mask)
+{
+	type &= ~CRYPTO_ALG_TYPE_MASK;
+	type |= CRYPTO_ALG_TYPE_CIPHER;
+	mask |= CRYPTO_ALG_TYPE_MASK;
+
+	return crypto_has_alg(alg_name, type, mask);
+}
+
+/**
+ * crypto_cipher_blocksize() - obtain block size for cipher
+ * @tfm: cipher handle
+ *
+ * The block size for the single block cipher referenced with the cipher handle
+ * tfm is returned. The caller may use that information to allocate appropriate
+ * memory for the data returned by the encryption or decryption operation
+ *
+ * Return: block size of cipher
+ */
+static inline unsigned int crypto_cipher_blocksize(struct crypto_cipher *tfm)
+{
+	return crypto_tfm_alg_blocksize(crypto_cipher_tfm(tfm));
+}
+
+static inline unsigned int crypto_cipher_alignmask(struct crypto_cipher *tfm)
+{
+	return crypto_tfm_alg_alignmask(crypto_cipher_tfm(tfm));
+}
+
+static inline u32 crypto_cipher_get_flags(struct crypto_cipher *tfm)
+{
+	return crypto_tfm_get_flags(crypto_cipher_tfm(tfm));
+}
+
+static inline void crypto_cipher_set_flags(struct crypto_cipher *tfm,
+					   u32 flags)
+{
+	crypto_tfm_set_flags(crypto_cipher_tfm(tfm), flags);
+}
+
+static inline void crypto_cipher_clear_flags(struct crypto_cipher *tfm,
+					     u32 flags)
+{
+	crypto_tfm_clear_flags(crypto_cipher_tfm(tfm), flags);
+}
+
+/**
+ * crypto_cipher_setkey() - set key for cipher
+ * @tfm: cipher handle
+ * @key: buffer holding the key
+ * @keylen: length of the key in bytes
+ *
+ * The caller provided key is set for the single block cipher referenced by the
+ * cipher handle.
+ *
+ * Note, the key length determines the cipher type. Many block ciphers implement
+ * different cipher modes depending on the key size, such as AES-128 vs AES-192
+ * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
+ * is performed.
+ *
+ * Return: 0 if the setting of the key was successful; < 0 if an error occurred
+ */
+int crypto_cipher_setkey(struct crypto_cipher *tfm,
+			 const u8 *key, unsigned int keylen);
+
+/**
+ * crypto_cipher_encrypt_one() - encrypt one block of plaintext
+ * @tfm: cipher handle
+ * @dst: points to the buffer that will be filled with the ciphertext
+ * @src: buffer holding the plaintext to be encrypted
+ *
+ * Invoke the encryption operation of one block. The caller must ensure that
+ * the plaintext and ciphertext buffers are at least one block in size.
+ */
+void crypto_cipher_encrypt_one(struct crypto_cipher *tfm,
+			       u8 *dst, const u8 *src);
+
+/**
+ * crypto_cipher_decrypt_one() - decrypt one block of ciphertext
+ * @tfm: cipher handle
+ * @dst: points to the buffer that will be filled with the plaintext
+ * @src: buffer holding the ciphertext to be decrypted
+ *
+ * Invoke the decryption operation of one block. The caller must ensure that
+ * the plaintext and ciphertext buffers are at least one block in size.
+ */
+void crypto_cipher_decrypt_one(struct crypto_cipher *tfm,
+			       u8 *dst, const u8 *src);
+
+struct crypto_cipher_spawn {
+	struct crypto_spawn base;
+};
+
+static inline int crypto_grab_cipher(struct crypto_cipher_spawn *spawn,
+				     struct crypto_instance *inst,
+				     const char *name, u32 type, u32 mask)
+{
+	type &= ~CRYPTO_ALG_TYPE_MASK;
+	type |= CRYPTO_ALG_TYPE_CIPHER;
+	mask |= CRYPTO_ALG_TYPE_MASK;
+	return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
+}
+
+static inline void crypto_drop_cipher(struct crypto_cipher_spawn *spawn)
+{
+	crypto_drop_spawn(&spawn->base);
+}
+
+static inline struct crypto_alg *crypto_spawn_cipher_alg(
+       struct crypto_cipher_spawn *spawn)
+{
+	return spawn->base.alg;
+}
+
+static inline struct crypto_cipher *crypto_spawn_cipher(
+	struct crypto_cipher_spawn *spawn)
+{
+	u32 type = CRYPTO_ALG_TYPE_CIPHER;
+	u32 mask = CRYPTO_ALG_TYPE_MASK;
+
+	return __crypto_cipher_cast(crypto_spawn_tfm(&spawn->base, type, mask));
+}
+
+static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm)
+{
+	return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher;
+}
+
+#endif
diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h
index 10226c12c5df..9dd6c0c17eb8 100644
--- a/include/crypto/internal/skcipher.h
+++ b/include/crypto/internal/skcipher.h
@@ -9,6 +9,7 @@
 #define _CRYPTO_INTERNAL_SKCIPHER_H
 
 #include <crypto/algapi.h>
+#include <crypto/internal/cipher.h>
 #include <crypto/skcipher.h>
 #include <linux/list.h>
 #include <linux/types.h>
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index ef90e07c9635..9b55cd6b1f1b 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -636,10 +636,6 @@ struct crypto_tfm {
 	void *__crt_ctx[] CRYPTO_MINALIGN_ATTR;
 };
 
-struct crypto_cipher {
-	struct crypto_tfm base;
-};
-
 struct crypto_comp {
 	struct crypto_tfm base;
 };
@@ -743,165 +739,6 @@ static inline unsigned int crypto_tfm_ctx_alignment(void)
 	return __alignof__(tfm->__crt_ctx);
 }
 
-/**
- * DOC: Single Block Cipher API
- *
- * The single block cipher API is used with the ciphers of type
- * CRYPTO_ALG_TYPE_CIPHER (listed as type "cipher" in /proc/crypto).
- *
- * Using the single block cipher API calls, operations with the basic cipher
- * primitive can be implemented. These cipher primitives exclude any block
- * chaining operations including IV handling.
- *
- * The purpose of this single block cipher API is to support the implementation
- * of templates or other concepts that only need to perform the cipher operation
- * on one block at a time. Templates invoke the underlying cipher primitive
- * block-wise and process either the input or the output data of these cipher
- * operations.
- */
-
-static inline struct crypto_cipher *__crypto_cipher_cast(struct crypto_tfm *tfm)
-{
-	return (struct crypto_cipher *)tfm;
-}
-
-/**
- * crypto_alloc_cipher() - allocate single block cipher handle
- * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
- *	     single block cipher
- * @type: specifies the type of the cipher
- * @mask: specifies the mask for the cipher
- *
- * Allocate a cipher handle for a single block cipher. The returned struct
- * crypto_cipher is the cipher handle that is required for any subsequent API
- * invocation for that single block cipher.
- *
- * Return: allocated cipher handle in case of success; IS_ERR() is true in case
- *	   of an error, PTR_ERR() returns the error code.
- */
-static inline struct crypto_cipher *crypto_alloc_cipher(const char *alg_name,
-							u32 type, u32 mask)
-{
-	type &= ~CRYPTO_ALG_TYPE_MASK;
-	type |= CRYPTO_ALG_TYPE_CIPHER;
-	mask |= CRYPTO_ALG_TYPE_MASK;
-
-	return __crypto_cipher_cast(crypto_alloc_base(alg_name, type, mask));
-}
-
-static inline struct crypto_tfm *crypto_cipher_tfm(struct crypto_cipher *tfm)
-{
-	return &tfm->base;
-}
-
-/**
- * crypto_free_cipher() - zeroize and free the single block cipher handle
- * @tfm: cipher handle to be freed
- */
-static inline void crypto_free_cipher(struct crypto_cipher *tfm)
-{
-	crypto_free_tfm(crypto_cipher_tfm(tfm));
-}
-
-/**
- * crypto_has_cipher() - Search for the availability of a single block cipher
- * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
- *	     single block cipher
- * @type: specifies the type of the cipher
- * @mask: specifies the mask for the cipher
- *
- * Return: true when the single block cipher is known to the kernel crypto API;
- *	   false otherwise
- */
-static inline int crypto_has_cipher(const char *alg_name, u32 type, u32 mask)
-{
-	type &= ~CRYPTO_ALG_TYPE_MASK;
-	type |= CRYPTO_ALG_TYPE_CIPHER;
-	mask |= CRYPTO_ALG_TYPE_MASK;
-
-	return crypto_has_alg(alg_name, type, mask);
-}
-
-/**
- * crypto_cipher_blocksize() - obtain block size for cipher
- * @tfm: cipher handle
- *
- * The block size for the single block cipher referenced with the cipher handle
- * tfm is returned. The caller may use that information to allocate appropriate
- * memory for the data returned by the encryption or decryption operation
- *
- * Return: block size of cipher
- */
-static inline unsigned int crypto_cipher_blocksize(struct crypto_cipher *tfm)
-{
-	return crypto_tfm_alg_blocksize(crypto_cipher_tfm(tfm));
-}
-
-static inline unsigned int crypto_cipher_alignmask(struct crypto_cipher *tfm)
-{
-	return crypto_tfm_alg_alignmask(crypto_cipher_tfm(tfm));
-}
-
-static inline u32 crypto_cipher_get_flags(struct crypto_cipher *tfm)
-{
-	return crypto_tfm_get_flags(crypto_cipher_tfm(tfm));
-}
-
-static inline void crypto_cipher_set_flags(struct crypto_cipher *tfm,
-					   u32 flags)
-{
-	crypto_tfm_set_flags(crypto_cipher_tfm(tfm), flags);
-}
-
-static inline void crypto_cipher_clear_flags(struct crypto_cipher *tfm,
-					     u32 flags)
-{
-	crypto_tfm_clear_flags(crypto_cipher_tfm(tfm), flags);
-}
-
-/**
- * crypto_cipher_setkey() - set key for cipher
- * @tfm: cipher handle
- * @key: buffer holding the key
- * @keylen: length of the key in bytes
- *
- * The caller provided key is set for the single block cipher referenced by the
- * cipher handle.
- *
- * Note, the key length determines the cipher type. Many block ciphers implement
- * different cipher modes depending on the key size, such as AES-128 vs AES-192
- * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
- * is performed.
- *
- * Return: 0 if the setting of the key was successful; < 0 if an error occurred
- */
-int crypto_cipher_setkey(struct crypto_cipher *tfm,
-			 const u8 *key, unsigned int keylen);
-
-/**
- * crypto_cipher_encrypt_one() - encrypt one block of plaintext
- * @tfm: cipher handle
- * @dst: points to the buffer that will be filled with the ciphertext
- * @src: buffer holding the plaintext to be encrypted
- *
- * Invoke the encryption operation of one block. The caller must ensure that
- * the plaintext and ciphertext buffers are at least one block in size.
- */
-void crypto_cipher_encrypt_one(struct crypto_cipher *tfm,
-			       u8 *dst, const u8 *src);
-
-/**
- * crypto_cipher_decrypt_one() - decrypt one block of ciphertext
- * @tfm: cipher handle
- * @dst: points to the buffer that will be filled with the plaintext
- * @src: buffer holding the ciphertext to be decrypted
- *
- * Invoke the decryption operation of one block. The caller must ensure that
- * the plaintext and ciphertext buffers are at least one block in size.
- */
-void crypto_cipher_decrypt_one(struct crypto_cipher *tfm,
-			       u8 *dst, const u8 *src);
-
 static inline struct crypto_comp *__crypto_comp_cast(struct crypto_tfm *tfm)
 {
 	return (struct crypto_comp *)tfm;
-- 
cgit v1.2.3


From 8280de6ab07b4c63eb607662754f151e539031a1 Mon Sep 17 00:00:00 2001
From: Wesley Cheng <wcheng@codeaurora.org>
Date: Tue, 29 Dec 2020 15:03:30 -0800
Subject: usb: gadget: composite: Split composite reset and disconnect

Add a specific composite reset API to differentiate between disconnect and
reset events.  This is needed for adjusting the current draw accordingly
based on the USB battery charging specification.  The device is only allowed
to draw the 500/900 mA (HS/SS) while in the CONFIGURED state, and only 100 mA
in the connected and UNCONFIGURED state.

Reviewed-by: Peter Chen <peter.chen@nxp.com>
Signed-off-by: Wesley Cheng <wcheng@codeaurora.org>
Link: https://lore.kernel.org/r/1609283011-21997-3-git-send-email-wcheng@codeaurora.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/gadget/composite.c | 21 +++++++++++++++++++--
 include/linux/usb/composite.h  |  2 ++
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index c6d455f2bb92..86e86a005e25 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -2036,7 +2036,7 @@ done:
 	return value;
 }
 
-void composite_disconnect(struct usb_gadget *gadget)
+static void __composite_disconnect(struct usb_gadget *gadget)
 {
 	struct usb_composite_dev	*cdev = get_gadget_data(gadget);
 	unsigned long			flags;
@@ -2053,6 +2053,23 @@ void composite_disconnect(struct usb_gadget *gadget)
 	spin_unlock_irqrestore(&cdev->lock, flags);
 }
 
+void composite_disconnect(struct usb_gadget *gadget)
+{
+	usb_gadget_vbus_draw(gadget, 0);
+	__composite_disconnect(gadget);
+}
+
+void composite_reset(struct usb_gadget *gadget)
+{
+	/*
+	 * Section 1.4.13 Standard Downstream Port of the USB battery charging
+	 * specification v1.2 states that a device connected on a SDP shall only
+	 * draw at max 100mA while in a connected, but unconfigured state.
+	 */
+	usb_gadget_vbus_draw(gadget, 100);
+	__composite_disconnect(gadget);
+}
+
 /*-------------------------------------------------------------------------*/
 
 static ssize_t suspended_show(struct device *dev, struct device_attribute *attr,
@@ -2373,7 +2390,7 @@ static const struct usb_gadget_driver composite_driver_template = {
 	.unbind		= composite_unbind,
 
 	.setup		= composite_setup,
-	.reset		= composite_disconnect,
+	.reset		= composite_reset,
 	.disconnect	= composite_disconnect,
 
 	.suspend	= composite_suspend,
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index a2d229ab63ba..5646dad886e6 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -525,6 +525,8 @@ extern struct usb_string *usb_gstrings_attach(struct usb_composite_dev *cdev,
 extern int usb_string_ids_n(struct usb_composite_dev *c, unsigned n);
 
 extern void composite_disconnect(struct usb_gadget *gadget);
+extern void composite_reset(struct usb_gadget *gadget);
+
 extern int composite_setup(struct usb_gadget *gadget,
 		const struct usb_ctrlrequest *ctrl);
 extern void composite_suspend(struct usb_gadget *gadget);
-- 
cgit v1.2.3


From b5a8d233a588b3acf2a7a3a8da30f8f68f376626 Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Mon, 4 Jan 2021 17:14:50 +0100
Subject: bus: mhi: core: Add device hardware reset support

The MHI specification allows to perform a hard reset of the device
when writing to the SOC_RESET register. It can be used to completely
restart the device (e.g. in case of unrecoverable MHI error).

This is up to the MHI controller driver to determine when this hard
reset should be used, and in case of MHI errors, should be used as
a reset of last resort (after standard MHI stack reset).

This function is a stateless function, the MHI layer do nothing except
triggering the reset by writing into the right register(s), this is up
to the caller to ensure right mhi_controller state (e.g. unregister the
controller if necessary).

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/core/main.c | 13 +++++++++++++
 include/linux/mhi.h         |  9 +++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c
index d34d7e90e38d..9b425402767c 100644
--- a/drivers/bus/mhi/core/main.c
+++ b/drivers/bus/mhi/core/main.c
@@ -135,6 +135,19 @@ enum mhi_state mhi_get_mhi_state(struct mhi_controller *mhi_cntrl)
 }
 EXPORT_SYMBOL_GPL(mhi_get_mhi_state);
 
+void mhi_soc_reset(struct mhi_controller *mhi_cntrl)
+{
+	if (mhi_cntrl->reset) {
+		mhi_cntrl->reset(mhi_cntrl);
+		return;
+	}
+
+	/* Generic MHI SoC reset */
+	mhi_write_reg(mhi_cntrl, mhi_cntrl->regs, MHI_SOC_RESET_REQ_OFFSET,
+		      MHI_SOC_RESET_REQ);
+}
+EXPORT_SYMBOL_GPL(mhi_soc_reset);
+
 int mhi_map_single_no_bb(struct mhi_controller *mhi_cntrl,
 			 struct mhi_buf_info *buf_info)
 {
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 562862ff819c..54afcae1709a 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -347,6 +347,7 @@ struct mhi_controller_config {
  * @unmap_single: CB function to destroy TRE buffer
  * @read_reg: Read a MHI register via the physical link (required)
  * @write_reg: Write a MHI register via the physical link (required)
+ * @reset: Controller specific reset function (optional)
  * @buffer_len: Bounce buffer length
  * @index: Index of the MHI controller instance
  * @bounce_buf: Use of bounce buffer
@@ -437,6 +438,7 @@ struct mhi_controller {
 			u32 *out);
 	void (*write_reg)(struct mhi_controller *mhi_cntrl, void __iomem *addr,
 			  u32 val);
+	void (*reset)(struct mhi_controller *mhi_cntrl);
 
 	size_t buffer_len;
 	int index;
@@ -672,6 +674,13 @@ enum mhi_ee_type mhi_get_exec_env(struct mhi_controller *mhi_cntrl);
  */
 enum mhi_state mhi_get_mhi_state(struct mhi_controller *mhi_cntrl);
 
+/**
+ * mhi_soc_reset - Trigger a device reset. This can be used as a last resort
+ *		   to reset and recover a device.
+ * @mhi_cntrl: MHI controller
+ */
+void mhi_soc_reset(struct mhi_controller *mhi_cntrl);
+
 /**
  * mhi_device_get - Disable device low power mode
  * @mhi_dev: Device associated with the channel
-- 
cgit v1.2.3


From 6c6ec2b0a3e0381d886d531bd1471dfdb1509237 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 17 Dec 2020 09:19:09 -0700
Subject: fs: add support for LOOKUP_CACHED

io_uring always punts opens to async context, since there's no control
over whether the lookup blocks or not. Add LOOKUP_CACHED to support
just doing the fast RCU based lookups, which we know will not block. If
we can do a cached path resolution of the filename, then we don't have
to always punt lookups for a worker.

During path resolution, we always do LOOKUP_RCU first. If that fails and
we terminate LOOKUP_RCU, then fail a LOOKUP_CACHED attempt as well.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 9 +++++++++
 include/linux/namei.h | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 2ee219497460..4cae88733a5c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -686,6 +686,8 @@ static bool try_to_unlazy(struct nameidata *nd)
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 
 	nd->flags &= ~LOOKUP_RCU;
+	if (nd->flags & LOOKUP_CACHED)
+		goto out1;
 	if (unlikely(!legitimize_links(nd)))
 		goto out1;
 	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
@@ -722,6 +724,8 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 
 	nd->flags &= ~LOOKUP_RCU;
+	if (nd->flags & LOOKUP_CACHED)
+		goto out2;
 	if (unlikely(!legitimize_links(nd)))
 		goto out2;
 	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
@@ -792,6 +796,7 @@ static int complete_walk(struct nameidata *nd)
 		 */
 		if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
 			nd->root.mnt = NULL;
+		nd->flags &= ~LOOKUP_CACHED;
 		if (!try_to_unlazy(nd))
 			return -ECHILD;
 	}
@@ -2204,6 +2209,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	int error;
 	const char *s = nd->name->name;
 
+	/* LOOKUP_CACHED requires RCU, ask caller to retry */
+	if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
+		return ERR_PTR(-EAGAIN);
+
 	if (!*s)
 		flags &= ~LOOKUP_RCU;
 	if (flags & LOOKUP_RCU)
diff --git a/include/linux/namei.h b/include/linux/namei.h
index a4bb992623c4..b9605b2b46e7 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -46,6 +46,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
 #define LOOKUP_NO_XDEV		0x040000 /* No mountpoint crossing. */
 #define LOOKUP_BENEATH		0x080000 /* No escaping from starting point. */
 #define LOOKUP_IN_ROOT		0x100000 /* Treat dirfd as fs root. */
+#define LOOKUP_CACHED		0x200000 /* Only do cached lookup */
 /* LOOKUP_* flags which do scope-related checks based on the dirfd. */
 #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)
 
-- 
cgit v1.2.3


From 99668f618062816ca7ba639b007eb145b9d3d41e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 17 Dec 2020 09:19:10 -0700
Subject: fs: expose LOOKUP_CACHED through openat2() RESOLVE_CACHED

Now that we support non-blocking path resolution internally, expose it
via openat2() in the struct open_how ->resolve flags. This allows
applications using openat2() to limit path resolution to the extent that
it is already cached.

If the lookup cannot be satisfied in a non-blocking manner, openat2(2)
will return -1/-EAGAIN.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c                    | 6 ++++++
 include/linux/fcntl.h        | 2 +-
 include/uapi/linux/openat2.h | 4 ++++
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 1e06e443a565..ca5444733acd 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1091,6 +1091,12 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 		lookup_flags |= LOOKUP_BENEATH;
 	if (how->resolve & RESOLVE_IN_ROOT)
 		lookup_flags |= LOOKUP_IN_ROOT;
+	if (how->resolve & RESOLVE_CACHED) {
+		/* Don't bother even trying for create/truncate/tmpfile open */
+		if (flags & (O_TRUNC | O_CREAT | O_TMPFILE))
+			return -EAGAIN;
+		lookup_flags |= LOOKUP_CACHED;
+	}
 
 	op->lookup_flags = lookup_flags;
 	return 0;
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 921e750843e6..766fcd973beb 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -19,7 +19,7 @@
 /* List of all valid flags for the how->resolve argument: */
 #define VALID_RESOLVE_FLAGS \
 	(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
-	 RESOLVE_BENEATH | RESOLVE_IN_ROOT)
+	 RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_CACHED)
 
 /* List of all open_how "versions". */
 #define OPEN_HOW_SIZE_VER0	24 /* sizeof first published struct */
diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
index 58b1eb711360..a5feb7604948 100644
--- a/include/uapi/linux/openat2.h
+++ b/include/uapi/linux/openat2.h
@@ -35,5 +35,9 @@ struct open_how {
 #define RESOLVE_IN_ROOT		0x10 /* Make all jumps to "/" and ".."
 					be scoped inside the dirfd
 					(similar to chroot(2)). */
+#define RESOLVE_CACHED		0x20 /* Only complete if resolution can be
+					completed through cached lookup. May
+					return -EAGAIN if that's not
+					possible. */
 
 #endif /* _UAPI_LINUX_OPENAT2_H */
-- 
cgit v1.2.3


From 14e43bf435612639cab01541fce7cc41bf7e370b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 22 Sep 2020 09:44:18 -0700
Subject: vfs: don't unnecessarily clone write access for writable fds

There's no need for mnt_want_write_file() to increment mnt_writers when
the file is already open for writing, provided that
mnt_drop_write_file() is changed to conditionally decrement it.

We seem to have ended up in the current situation because
mnt_want_write_file() used to be paired with mnt_drop_write(), due to
mnt_drop_write_file() not having been added yet.  So originally
mnt_want_write_file() had to always increment mnt_writers.

But later mnt_drop_write_file() was added, and all callers of
mnt_want_write_file() were paired with it.  This makes the compatibility
between mnt_want_write_file() and mnt_drop_write() no longer necessary.

Therefore, make __mnt_want_write_file() and __mnt_drop_write_file() skip
incrementing mnt_writers on files already open for writing.  This
removes the only caller of mnt_clone_write(), so remove that too.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst |  7 +++++
 fs/namespace.c                        | 53 +++++++++++++----------------------
 include/linux/mount.h                 |  1 -
 3 files changed, 27 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 867036aa90b8..6a6d3e673b48 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -865,3 +865,10 @@ no matter what.  Everything is handled by the caller.
 
 clone_private_mount() returns a longterm mount now, so the proper destructor of
 its result is kern_unmount() or kern_unmount_array().
+
+---
+
+**mandatory**
+
+mnt_want_write_file() can now only be paired with mnt_drop_write_file(),
+whereas previously it could be paired with mnt_drop_write() as well.
diff --git a/fs/namespace.c b/fs/namespace.c
index d2db7dfe232b..9f2d94e0f3e0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -359,51 +359,37 @@ int mnt_want_write(struct vfsmount *m)
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-/**
- * mnt_clone_write - get write access to a mount
- * @mnt: the mount on which to take a write
- *
- * This is effectively like mnt_want_write, except
- * it must only be used to take an extra write reference
- * on a mountpoint that we already know has a write reference
- * on it. This allows some optimisation.
- *
- * After finished, mnt_drop_write must be called as usual to
- * drop the reference.
- */
-int mnt_clone_write(struct vfsmount *mnt)
-{
-	/* superblock may be r/o */
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
-	preempt_disable();
-	mnt_inc_writers(real_mount(mnt));
-	preempt_enable();
-	return 0;
-}
-EXPORT_SYMBOL_GPL(mnt_clone_write);
-
 /**
  * __mnt_want_write_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like __mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+ * This is like __mnt_want_write, but if the file is already open for writing it
+ * skips incrementing mnt_writers (since the open file already has a reference)
+ * and instead only does the check for emergency r/o remounts.  This must be
+ * paired with __mnt_drop_write_file.
  */
 int __mnt_want_write_file(struct file *file)
 {
-	if (!(file->f_mode & FMODE_WRITER))
-		return __mnt_want_write(file->f_path.mnt);
-	else
-		return mnt_clone_write(file->f_path.mnt);
+	if (file->f_mode & FMODE_WRITER) {
+		/*
+		 * Superblock may have become readonly while there are still
+		 * writable fd's, e.g. due to a fs error with errors=remount-ro
+		 */
+		if (__mnt_is_readonly(file->f_path.mnt))
+			return -EROFS;
+		return 0;
+	}
+	return __mnt_want_write(file->f_path.mnt);
 }
 
 /**
  * mnt_want_write_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+ * This is like mnt_want_write, but if the file is already open for writing it
+ * skips incrementing mnt_writers (since the open file already has a reference)
+ * and instead only does the freeze protection and the check for emergency r/o
+ * remounts.  This must be paired with mnt_drop_write_file.
  */
 int mnt_want_write_file(struct file *file)
 {
@@ -449,7 +435,8 @@ EXPORT_SYMBOL_GPL(mnt_drop_write);
 
 void __mnt_drop_write_file(struct file *file)
 {
-	__mnt_drop_write(file->f_path.mnt);
+	if (!(file->f_mode & FMODE_WRITER))
+		__mnt_drop_write(file->f_path.mnt);
 }
 
 void mnt_drop_write_file(struct file *file)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index aaf343b38671..b43191fe6af7 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -79,7 +79,6 @@ struct path;
 
 extern int mnt_want_write(struct vfsmount *mnt);
 extern int mnt_want_write_file(struct file *file);
-extern int mnt_clone_write(struct vfsmount *mnt);
 extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mnt_drop_write_file(struct file *file);
 extern void mntput(struct vfsmount *mnt);
-- 
cgit v1.2.3


From 4704bd317108c94b6e2d8309f3dbb70d2015568a Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 16 Nov 2020 11:18:16 +0100
Subject: list: Fix a typo at the kernel-doc markup

hlist_add_behing -> hlist_add_behind

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/list.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 89bdc92e75c3..f2af4b4aa4e9 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -901,7 +901,7 @@ static inline void hlist_add_before(struct hlist_node *n,
 }
 
 /**
- * hlist_add_behing - add a new entry after the one specified
+ * hlist_add_behind - add a new entry after the one specified
  * @n: new entry to be added
  * @prev: hlist node to add it after, which must be non-NULL
  */
-- 
cgit v1.2.3


From 5130b8fd06901c1b3a4bd0d0f5c5ea99b2b0a6f0 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Fri, 20 Nov 2020 12:49:16 +0100
Subject: rcu: Introduce kfree_rcu() single-argument macro

There is a kvfree_rcu() single argument macro that handles pointers
returned by kvmalloc(). Even though it also handles pointer returned by
kmalloc(), readability suffers.

This commit therefore updates the kfree_rcu() macro to explicitly pair
with kmalloc(), thus improving readability.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index de0826411311..b95373ea3b02 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -851,8 +851,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 
 /**
  * kfree_rcu() - kfree an object after a grace period.
- * @ptr:	pointer to kfree
- * @rhf:	the name of the struct rcu_head within the type of @ptr.
+ * @ptr: pointer to kfree for both single- and double-argument invocations.
+ * @rhf: the name of the struct rcu_head within the type of @ptr,
+ *       but only for double-argument invocations.
  *
  * Many rcu callbacks functions just call kfree() on the base structure.
  * These functions are trivial, but their size adds up, and furthermore
@@ -875,13 +876,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
  * The BUILD_BUG_ON check must not involve any function calls, hence the
  * checks are done in macros here.
  */
-#define kfree_rcu(ptr, rhf)						\
-do {									\
-	typeof (ptr) ___p = (ptr);					\
-									\
-	if (___p)							\
-		__kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
-} while (0)
+#define kfree_rcu kvfree_rcu
 
 /**
  * kvfree_rcu() - kvfree an object after a grace period.
@@ -913,7 +908,14 @@ do {									\
 	kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)
 
 #define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
-#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf)
+#define kvfree_rcu_arg_2(ptr, rhf)					\
+do {									\
+	typeof (ptr) ___p = (ptr);					\
+									\
+	if (___p)							\
+		__kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
+} while (0)
+
 #define kvfree_rcu_arg_1(ptr)					\
 do {								\
 	typeof(ptr) ___p = (ptr);				\
-- 
cgit v1.2.3


From 5ea5d1ed572cb5ac173674fe770252253d2d9e27 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Fri, 20 Nov 2020 12:49:17 +0100
Subject: rcu: Eliminate the __kvfree_rcu() macro

This commit open-codes the __kvfree_rcu() macro, thus saving a
few lines of code and improving readability.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index b95373ea3b02..f1576cde5951 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -840,15 +840,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
  */
 #define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
 
-/*
- * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
- */
-#define __kvfree_rcu(head, offset) \
-	do { \
-		BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \
-		kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
-	} while (0)
-
 /**
  * kfree_rcu() - kfree an object after a grace period.
  * @ptr: pointer to kfree for both single- and double-argument invocations.
@@ -866,7 +857,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
  * Because the functions are not allowed in the low-order 4096 bytes of
  * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
  * If the offset is larger than 4095 bytes, a compile-time error will
- * be generated in __kvfree_rcu(). If this error is triggered, you can
+ * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
  * either fall back to use of call_rcu() or rearrange the structure to
  * position the rcu_head structure into the first 4096 bytes.
  *
@@ -912,8 +903,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 do {									\
 	typeof (ptr) ___p = (ptr);					\
 									\
-	if (___p)							\
-		__kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
+	if (___p) {									\
+		BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf)));	\
+		kvfree_call_rcu(&((___p)->rhf), (rcu_callback_t)(unsigned long)		\
+			(offsetof(typeof(*(ptr)), rhf)));				\
+	}										\
 } while (0)
 
 #define kvfree_rcu_arg_1(ptr)					\
-- 
cgit v1.2.3


From 74612a07b83fc46c2b2e6f71a541d55b024ebefc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 12 Nov 2020 16:34:09 -0800
Subject: srcu: Make Tiny SRCU use multi-bit grace-period counter

There is a need for a polling interface for SRCU grace periods.  This
polling needs to distinguish between an SRCU instance being idle on the
one hand or in the middle of a grace period on the other.  This commit
therefore converts the Tiny SRCU srcu_struct structure's srcu_idx from
a defacto boolean to a free-running counter, using the bottom bit to
indicate that a grace period is in progress.  The second-from-bottom
bit is thus used as the index returned by srcu_read_lock().

Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/
Reported-by: Kent Overstreet <kent.overstreet@gmail.com>
[ paulmck: Fix ->srcu_lock_nesting[] indexing per Neeraj Upadhyay. ]
Reviewed-by: Neeraj Upadhyay <neeraju@codeaurora.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutiny.h | 6 +++---
 kernel/rcu/srcutiny.c    | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 5a5a1941ca15..b8b42d0a24f1 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -15,7 +15,7 @@
 
 struct srcu_struct {
 	short srcu_lock_nesting[2];	/* srcu_read_lock() nesting depth. */
-	short srcu_idx;			/* Current reader array element. */
+	unsigned short srcu_idx;	/* Current reader array element in bit 0x2. */
 	u8 srcu_gp_running;		/* GP workqueue running? */
 	u8 srcu_gp_waiting;		/* GP waiting for readers? */
 	struct swait_queue_head srcu_wq;
@@ -59,7 +59,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp)
 {
 	int idx;
 
-	idx = READ_ONCE(ssp->srcu_idx);
+	idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
 	WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1);
 	return idx;
 }
@@ -80,7 +80,7 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp,
 {
 	int idx;
 
-	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
+	idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
 	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
 		 tt, tf, idx,
 		 READ_ONCE(ssp->srcu_lock_nesting[!idx]),
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 6208c1dae5c9..5598cf6f1edc 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -124,11 +124,12 @@ void srcu_drive_gp(struct work_struct *wp)
 	ssp->srcu_cb_head = NULL;
 	ssp->srcu_cb_tail = &ssp->srcu_cb_head;
 	local_irq_enable();
-	idx = ssp->srcu_idx;
-	WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx);
+	idx = (ssp->srcu_idx & 0x2) / 2;
+	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
 	WRITE_ONCE(ssp->srcu_gp_waiting, true);  /* srcu_read_unlock() wakes! */
 	swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
 	WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
 
 	/* Invoke the callbacks we removed above. */
 	while (lh) {
-- 
cgit v1.2.3


From 8b5bd67cf6422b63ee100d76d8de8960ca2df7f0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 13 Nov 2020 12:54:48 -0800
Subject: srcu: Provide polling interfaces for Tiny SRCU grace periods

There is a need for a polling interface for SRCU grace
periods, so this commit supplies get_state_synchronize_srcu(),
start_poll_synchronize_srcu(), and poll_state_synchronize_srcu() for this
purpose.  The first can be used if future grace periods are inevitable
(perhaps due to a later call_srcu() invocation), the second if future
grace periods might not otherwise happen, and the third to check if a
grace period has elapsed since the corresponding call to either of the
first two.

As with get_state_synchronize_rcu() and cond_synchronize_rcu(),
the return value from either get_state_synchronize_srcu() or
start_poll_synchronize_srcu() must be passed in to a later call to
poll_state_synchronize_srcu().

Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/
Reported-by: Kent Overstreet <kent.overstreet@gmail.com>
[ paulmck: Add EXPORT_SYMBOL_GPL() per kernel test robot feedback. ]
[ paulmck: Apply feedback from Neeraj Upadhyay. ]
Link: https://lore.kernel.org/lkml/20201117004017.GA7444@paulmck-ThinkPad-P72/
Reviewed-by: Neeraj Upadhyay <neeraju@codeaurora.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h |  2 ++
 include/linux/srcu.h     |  3 +++
 include/linux/srcutiny.h |  1 +
 kernel/rcu/srcutiny.c    | 55 ++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 59 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index de0826411311..e09c0d87b3c3 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -33,6 +33,8 @@
 #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
 #define ulong2long(a)		(*(long *)(&(a)))
+#define USHORT_CMP_GE(a, b)	(USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
+#define USHORT_CMP_LT(a, b)	(USHRT_MAX / 2 < (unsigned short)((a) - (b)))
 
 /* Exported common interfaces */
 void call_rcu(struct rcu_head *head, rcu_callback_t func);
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index e432cc92c73d..a0895bbf71ce 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -60,6 +60,9 @@ void cleanup_srcu_struct(struct srcu_struct *ssp);
 int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
 void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
 void synchronize_srcu(struct srcu_struct *ssp);
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index b8b42d0a24f1..0e0cf4d6a72a 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -16,6 +16,7 @@
 struct srcu_struct {
 	short srcu_lock_nesting[2];	/* srcu_read_lock() nesting depth. */
 	unsigned short srcu_idx;	/* Current reader array element in bit 0x2. */
+	unsigned short srcu_idx_max;	/* Furthest future srcu_idx request. */
 	u8 srcu_gp_running;		/* GP workqueue running? */
 	u8 srcu_gp_waiting;		/* GP waiting for readers? */
 	struct swait_queue_head srcu_wq;
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 3bac1db85a85..26344dc6483b 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp)
 	ssp->srcu_gp_running = false;
 	ssp->srcu_gp_waiting = false;
 	ssp->srcu_idx = 0;
+	ssp->srcu_idx_max = 0;
 	INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
 	INIT_LIST_HEAD(&ssp->srcu_work.entry);
 	return 0;
@@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 	WARN_ON(ssp->srcu_gp_waiting);
 	WARN_ON(ssp->srcu_cb_head);
 	WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
+	WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max);
+	WARN_ON(ssp->srcu_idx & 0x1);
 }
 EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
 
@@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
 	struct srcu_struct *ssp;
 
 	ssp = container_of(wp, struct srcu_struct, srcu_work);
-	if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head))
+	if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
 		return; /* Already running or nothing to do. */
 
 	/* Remove recently arrived callbacks and wait for readers. */
@@ -147,13 +150,19 @@ void srcu_drive_gp(struct work_struct *wp)
 	 * straighten that out.
 	 */
 	WRITE_ONCE(ssp->srcu_gp_running, false);
-	if (READ_ONCE(ssp->srcu_cb_head))
+	if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
 		schedule_work(&ssp->srcu_work);
 }
 EXPORT_SYMBOL_GPL(srcu_drive_gp);
 
 static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
 {
+	unsigned short cookie;
+
+	cookie = get_state_synchronize_srcu(ssp);
+	if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+		return;
+	WRITE_ONCE(ssp->srcu_idx_max, cookie);
 	if (!READ_ONCE(ssp->srcu_gp_running)) {
 		if (likely(srcu_init_done))
 			schedule_work(&ssp->srcu_work);
@@ -196,6 +205,48 @@ void synchronize_srcu(struct srcu_struct *ssp)
 }
 EXPORT_SYMBOL_GPL(synchronize_srcu);
 
+/*
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+	unsigned long ret;
+
+	barrier();
+	ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
+	barrier();
+	return ret & USHRT_MAX;
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/*
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ *
+ * The difference between this and get_state_synchronize_srcu() is that
+ * this function ensures that the poll_state_synchronize_srcu() will
+ * eventually return the value true.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+	unsigned long ret = get_state_synchronize_srcu(ssp);
+
+	srcu_gp_start_if_needed(ssp);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/*
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+	bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie);
+
+	barrier();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
 /* Lockdep diagnostics.  */
 void __init rcu_scheduler_starting(void)
 {
-- 
cgit v1.2.3


From ac7b79fd190b02e7151bc7d2b9da692f537657f3 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Sat, 19 Dec 2020 20:46:08 -0800
Subject: inotify, memcg: account inotify instances to kmemcg

Currently the fs sysctl inotify/max_user_instances is used to limit the
number of inotify instances on the system. For systems running multiple
workloads, the per-user namespace sysctl max_inotify_instances can be
used to further partition inotify instances. However there is no easy
way to set a sensible system level max limit on inotify instances and
further partition it between the workloads. It is much easier to charge
the underlying resource (i.e. memory) behind the inotify instances to
the memcg of the workload and let their memory limits limit the number
of inotify instances they can create.

With inotify instances charged to memcg, the admin can simply set
max_user_instances to INT_MAX and let the memcg limits of the jobs limit
their inotify instances.

Link: https://lore.kernel.org/r/20201220044608.1258123-1-shakeelb@google.com
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify_user.c |  2 +-
 fs/notify/group.c                  | 25 ++++++++++++++++++++-----
 fs/notify/inotify/inotify_user.c   |  4 ++--
 include/linux/fsnotify_backend.h   |  1 +
 4 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 3e01d8f2ab90..7e7afc2b62e1 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -976,7 +976,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		f_flags |= O_NONBLOCK;
 
 	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
-	group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
+	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
 	if (IS_ERR(group)) {
 		free_uid(user);
 		return PTR_ERR(group);
diff --git a/fs/notify/group.c b/fs/notify/group.c
index a4a4b1c64d32..ffd723ffe46d 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -111,14 +111,12 @@ void fsnotify_put_group(struct fsnotify_group *group)
 }
 EXPORT_SYMBOL_GPL(fsnotify_put_group);
 
-/*
- * Create a new fsnotify_group and hold a reference for the group returned.
- */
-struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+static struct fsnotify_group *__fsnotify_alloc_group(
+				const struct fsnotify_ops *ops, gfp_t gfp)
 {
 	struct fsnotify_group *group;
 
-	group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
+	group = kzalloc(sizeof(struct fsnotify_group), gfp);
 	if (!group)
 		return ERR_PTR(-ENOMEM);
 
@@ -139,8 +137,25 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 
 	return group;
 }
+
+/*
+ * Create a new fsnotify_group and hold a reference for the group returned.
+ */
+struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+{
+	return __fsnotify_alloc_group(ops, GFP_KERNEL);
+}
 EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
 
+/*
+ * Create a new fsnotify_group and hold a reference for the group returned.
+ */
+struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops)
+{
+	return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT);
+}
+EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group);
+
 int fsnotify_fasync(int fd, struct file *file, int on)
 {
 	struct fsnotify_group *group = file->private_data;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 59c177011a0f..266d17e8ecb9 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -632,11 +632,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 	struct fsnotify_group *group;
 	struct inotify_event_info *oevent;
 
-	group = fsnotify_alloc_group(&inotify_fsnotify_ops);
+	group = fsnotify_alloc_user_group(&inotify_fsnotify_ops);
 	if (IS_ERR(group))
 		return group;
 
-	oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
+	oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT);
 	if (unlikely(!oevent)) {
 		fsnotify_destroy_group(group);
 		return ERR_PTR(-ENOMEM);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index a2e42d3cd87c..e5409b83e731 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -470,6 +470,7 @@ static inline void fsnotify_update_flags(struct dentry *dentry)
 
 /* create a new group */
 extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops);
+extern struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops);
 /* get reference to a group */
 extern void fsnotify_get_group(struct fsnotify_group *group);
 /* drop reference on a group from fsnotify_alloc_group */
-- 
cgit v1.2.3


From 281462e593483350d8072a118c6e072c550a80fa Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Mon, 28 Dec 2020 18:49:16 +0300
Subject: memory: tegra124-emc: Make driver modular

Add modularization support to the Tegra124 EMC driver, which now can be
compiled as a loadable kernel module.

Note that EMC clock must be registered at clk-init time, otherwise PLLM
will be disabled as unused clock at boot time if EMC driver is compiled
as a module. Hence add a prepare/complete callbacks. similarly to what is
done for the Tegra20/30 EMC drivers.

Tested-by: Nicolas Chauvet <kwizart@gmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Link: https://lore.kernel.org/r/20201228154920.18846-2-digetx@gmail.com
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
---
 drivers/clk/tegra/Kconfig            |  3 +++
 drivers/clk/tegra/Makefile           |  2 +-
 drivers/clk/tegra/clk-tegra124-emc.c | 41 +++++++++++++++++++++++++++++++-----
 drivers/clk/tegra/clk-tegra124.c     | 26 ++++++++++++++++++++---
 drivers/clk/tegra/clk.h              | 18 ++++++++++------
 drivers/memory/tegra/Kconfig         |  3 ++-
 drivers/memory/tegra/tegra124-emc.c  | 31 ++++++++++++++++++---------
 include/linux/clk/tegra.h            |  8 +++++++
 include/soc/tegra/emc.h              | 16 --------------
 9 files changed, 106 insertions(+), 42 deletions(-)
 delete mode 100644 include/soc/tegra/emc.h

(limited to 'include/linux')

diff --git a/drivers/clk/tegra/Kconfig b/drivers/clk/tegra/Kconfig
index deaa4605824c..90df619dc087 100644
--- a/drivers/clk/tegra/Kconfig
+++ b/drivers/clk/tegra/Kconfig
@@ -7,3 +7,6 @@ config TEGRA_CLK_DFLL
 	depends on ARCH_TEGRA_124_SOC || ARCH_TEGRA_210_SOC
 	select PM_OPP
 	def_bool y
+
+config TEGRA124_CLK_EMC
+	bool
diff --git a/drivers/clk/tegra/Makefile b/drivers/clk/tegra/Makefile
index eec2313fd37e..7b1816856eb5 100644
--- a/drivers/clk/tegra/Makefile
+++ b/drivers/clk/tegra/Makefile
@@ -22,7 +22,7 @@ obj-$(CONFIG_ARCH_TEGRA_3x_SOC)		+= clk-tegra20-emc.o
 obj-$(CONFIG_ARCH_TEGRA_114_SOC)	+= clk-tegra114.o
 obj-$(CONFIG_ARCH_TEGRA_124_SOC)	+= clk-tegra124.o
 obj-$(CONFIG_TEGRA_CLK_DFLL)		+= clk-tegra124-dfll-fcpu.o
-obj-$(CONFIG_TEGRA124_EMC)		+= clk-tegra124-emc.o
+obj-$(CONFIG_TEGRA124_CLK_EMC)		+= clk-tegra124-emc.o
 obj-$(CONFIG_ARCH_TEGRA_132_SOC)	+= clk-tegra124.o
 obj-y					+= cvb.o
 obj-$(CONFIG_ARCH_TEGRA_210_SOC)	+= clk-tegra210.o
diff --git a/drivers/clk/tegra/clk-tegra124-emc.c b/drivers/clk/tegra/clk-tegra124-emc.c
index 745f9faa98d8..bdf6f4a51617 100644
--- a/drivers/clk/tegra/clk-tegra124-emc.c
+++ b/drivers/clk/tegra/clk-tegra124-emc.c
@@ -11,7 +11,9 @@
 #include <linux/clk-provider.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk/tegra.h>
 #include <linux/delay.h>
+#include <linux/export.h>
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
@@ -21,7 +23,6 @@
 #include <linux/string.h>
 
 #include <soc/tegra/fuse.h>
-#include <soc/tegra/emc.h>
 
 #include "clk.h"
 
@@ -80,6 +81,9 @@ struct tegra_clk_emc {
 	int num_timings;
 	struct emc_timing *timings;
 	spinlock_t *lock;
+
+	tegra124_emc_prepare_timing_change_cb *prepare_timing_change;
+	tegra124_emc_complete_timing_change_cb *complete_timing_change;
 };
 
 /* Common clock framework callback implementations */
@@ -176,6 +180,9 @@ static struct tegra_emc *emc_ensure_emc_driver(struct tegra_clk_emc *tegra)
 	if (tegra->emc)
 		return tegra->emc;
 
+	if (!tegra->prepare_timing_change || !tegra->complete_timing_change)
+		return NULL;
+
 	if (!tegra->emc_node)
 		return NULL;
 
@@ -241,7 +248,7 @@ static int emc_set_timing(struct tegra_clk_emc *tegra,
 
 	div = timing->parent_rate / (timing->rate / 2) - 2;
 
-	err = tegra_emc_prepare_timing_change(emc, timing->rate);
+	err = tegra->prepare_timing_change(emc, timing->rate);
 	if (err)
 		return err;
 
@@ -259,7 +266,7 @@ static int emc_set_timing(struct tegra_clk_emc *tegra,
 
 	spin_unlock_irqrestore(tegra->lock, flags);
 
-	tegra_emc_complete_timing_change(emc, timing->rate);
+	tegra->complete_timing_change(emc, timing->rate);
 
 	clk_hw_reparent(&tegra->hw, __clk_get_hw(timing->parent));
 	clk_disable_unprepare(tegra->prev_parent);
@@ -473,8 +480,8 @@ static const struct clk_ops tegra_clk_emc_ops = {
 	.get_parent = emc_get_parent,
 };
 
-struct clk *tegra_clk_register_emc(void __iomem *base, struct device_node *np,
-				   spinlock_t *lock)
+struct clk *tegra124_clk_register_emc(void __iomem *base, struct device_node *np,
+				      spinlock_t *lock)
 {
 	struct tegra_clk_emc *tegra;
 	struct clk_init_data init;
@@ -538,3 +545,27 @@ struct clk *tegra_clk_register_emc(void __iomem *base, struct device_node *np,
 
 	return clk;
 };
+
+void tegra124_clk_set_emc_callbacks(tegra124_emc_prepare_timing_change_cb *prep_cb,
+				    tegra124_emc_complete_timing_change_cb *complete_cb)
+{
+	struct clk *clk = __clk_lookup("emc");
+	struct tegra_clk_emc *tegra;
+	struct clk_hw *hw;
+
+	if (clk) {
+		hw = __clk_get_hw(clk);
+		tegra = container_of(hw, struct tegra_clk_emc, hw);
+
+		tegra->prepare_timing_change = prep_cb;
+		tegra->complete_timing_change = complete_cb;
+	}
+}
+EXPORT_SYMBOL_GPL(tegra124_clk_set_emc_callbacks);
+
+bool tegra124_clk_emc_driver_available(struct clk_hw *hw)
+{
+	struct tegra_clk_emc *tegra = container_of(hw, struct tegra_clk_emc, hw);
+
+	return tegra->prepare_timing_change && tegra->complete_timing_change;
+}
diff --git a/drivers/clk/tegra/clk-tegra124.c b/drivers/clk/tegra/clk-tegra124.c
index e931319dcc9d..934520aab6e3 100644
--- a/drivers/clk/tegra/clk-tegra124.c
+++ b/drivers/clk/tegra/clk-tegra124.c
@@ -1500,6 +1500,26 @@ static void __init tegra124_132_clock_init_pre(struct device_node *np)
 	writel(plld_base, clk_base + PLLD_BASE);
 }
 
+static struct clk *tegra124_clk_src_onecell_get(struct of_phandle_args *clkspec,
+						void *data)
+{
+	struct clk_hw *hw;
+	struct clk *clk;
+
+	clk = of_clk_src_onecell_get(clkspec, data);
+	if (IS_ERR(clk))
+		return clk;
+
+	hw = __clk_get_hw(clk);
+
+	if (clkspec->args[0] == TEGRA124_CLK_EMC) {
+		if (!tegra124_clk_emc_driver_available(hw))
+			return ERR_PTR(-EPROBE_DEFER);
+	}
+
+	return clk;
+}
+
 /**
  * tegra124_132_clock_init_post - clock initialization postamble for T124/T132
  * @np: struct device_node * of the DT node for the SoC CAR IP block
@@ -1516,10 +1536,10 @@ static void __init tegra124_132_clock_init_post(struct device_node *np)
 				  &pll_x_params);
 	tegra_init_special_resets(1, tegra124_reset_assert,
 				  tegra124_reset_deassert);
-	tegra_add_of_provider(np, of_clk_src_onecell_get);
+	tegra_add_of_provider(np, tegra124_clk_src_onecell_get);
 
-	clks[TEGRA124_CLK_EMC] = tegra_clk_register_emc(clk_base, np,
-							&emc_lock);
+	clks[TEGRA124_CLK_EMC] = tegra124_clk_register_emc(clk_base, np,
+							   &emc_lock);
 
 	tegra_register_devclks(devclks, ARRAY_SIZE(devclks));
 
diff --git a/drivers/clk/tegra/clk.h b/drivers/clk/tegra/clk.h
index 6b565f6b5f66..c3e36b5dcc75 100644
--- a/drivers/clk/tegra/clk.h
+++ b/drivers/clk/tegra/clk.h
@@ -881,16 +881,22 @@ void tegra_super_clk_gen5_init(void __iomem *clk_base,
 			void __iomem *pmc_base, struct tegra_clk *tegra_clks,
 			struct tegra_clk_pll_params *pll_params);
 
-#ifdef CONFIG_TEGRA124_EMC
-struct clk *tegra_clk_register_emc(void __iomem *base, struct device_node *np,
-				   spinlock_t *lock);
+#ifdef CONFIG_TEGRA124_CLK_EMC
+struct clk *tegra124_clk_register_emc(void __iomem *base, struct device_node *np,
+				      spinlock_t *lock);
+bool tegra124_clk_emc_driver_available(struct clk_hw *emc_hw);
 #else
-static inline struct clk *tegra_clk_register_emc(void __iomem *base,
-						 struct device_node *np,
-						 spinlock_t *lock)
+static inline struct clk *
+tegra124_clk_register_emc(void __iomem *base, struct device_node *np,
+			  spinlock_t *lock)
 {
 	return NULL;
 }
+
+static inline bool tegra124_clk_emc_driver_available(struct clk_hw *emc_hw)
+{
+	return false;
+}
 #endif
 
 void tegra114_clock_tune_cpu_trimmers_high(void);
diff --git a/drivers/memory/tegra/Kconfig b/drivers/memory/tegra/Kconfig
index ca7077a06f4c..f5b451403c58 100644
--- a/drivers/memory/tegra/Kconfig
+++ b/drivers/memory/tegra/Kconfig
@@ -32,9 +32,10 @@ config TEGRA30_EMC
 	  external memory.
 
 config TEGRA124_EMC
-	bool "NVIDIA Tegra124 External Memory Controller driver"
+	tristate "NVIDIA Tegra124 External Memory Controller driver"
 	default y
 	depends on TEGRA_MC && ARCH_TEGRA_124_SOC
+	select TEGRA124_CLK_EMC
 	help
 	  This driver is for the External Memory Controller (EMC) found on
 	  Tegra124 chips. The EMC controls the external DRAM on the board.
diff --git a/drivers/memory/tegra/tegra124-emc.c b/drivers/memory/tegra/tegra124-emc.c
index ee8ee39e98ed..edfbf6d6d357 100644
--- a/drivers/memory/tegra/tegra124-emc.c
+++ b/drivers/memory/tegra/tegra124-emc.c
@@ -9,16 +9,17 @@
 #include <linux/clk-provider.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk/tegra.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/io.h>
+#include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
 #include <linux/sort.h>
 #include <linux/string.h>
 
-#include <soc/tegra/emc.h>
 #include <soc/tegra/fuse.h>
 #include <soc/tegra/mc.h>
 
@@ -562,8 +563,8 @@ static struct emc_timing *tegra_emc_find_timing(struct tegra_emc *emc,
 	return timing;
 }
 
-int tegra_emc_prepare_timing_change(struct tegra_emc *emc,
-				    unsigned long rate)
+static int tegra_emc_prepare_timing_change(struct tegra_emc *emc,
+					   unsigned long rate)
 {
 	struct emc_timing *timing = tegra_emc_find_timing(emc, rate);
 	struct emc_timing *last = &emc->last_timing;
@@ -790,8 +791,8 @@ int tegra_emc_prepare_timing_change(struct tegra_emc *emc,
 	return 0;
 }
 
-void tegra_emc_complete_timing_change(struct tegra_emc *emc,
-				      unsigned long rate)
+static void tegra_emc_complete_timing_change(struct tegra_emc *emc,
+					     unsigned long rate)
 {
 	struct emc_timing *timing = tegra_emc_find_timing(emc, rate);
 	struct emc_timing *last = &emc->last_timing;
@@ -987,6 +988,7 @@ static const struct of_device_id tegra_emc_of_match[] = {
 	{ .compatible = "nvidia,tegra132-emc" },
 	{}
 };
+MODULE_DEVICE_TABLE(of, tegra_emc_of_match);
 
 static struct device_node *
 tegra_emc_find_node_by_ram_code(struct device_node *node, u32 ram_code)
@@ -1226,9 +1228,19 @@ static int tegra_emc_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, emc);
 
+	tegra124_clk_set_emc_callbacks(tegra_emc_prepare_timing_change,
+				       tegra_emc_complete_timing_change);
+
 	if (IS_ENABLED(CONFIG_DEBUG_FS))
 		emc_debugfs_init(&pdev->dev, emc);
 
+	/*
+	 * Don't allow the kernel module to be unloaded. Unloading adds some
+	 * extra complexity which doesn't really worth the effort in a case of
+	 * this driver.
+	 */
+	try_module_get(THIS_MODULE);
+
 	return 0;
 };
 
@@ -1240,9 +1252,8 @@ static struct platform_driver tegra_emc_driver = {
 		.suppress_bind_attrs = true,
 	},
 };
+module_platform_driver(tegra_emc_driver);
 
-static int tegra_emc_init(void)
-{
-	return platform_driver_register(&tegra_emc_driver);
-}
-subsys_initcall(tegra_emc_init);
+MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
+MODULE_DESCRIPTION("NVIDIA Tegra124 EMC driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index 3f01d43f0598..eb016fc9cc0b 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -136,6 +136,7 @@ extern void tegra210_clk_emc_dll_update_setting(u32 emc_dll_src_value);
 extern void tegra210_clk_emc_update_setting(u32 emc_src_value);
 
 struct clk;
+struct tegra_emc;
 
 typedef long (tegra20_clk_emc_round_cb)(unsigned long rate,
 					unsigned long min_rate,
@@ -146,6 +147,13 @@ void tegra20_clk_set_emc_round_callback(tegra20_clk_emc_round_cb *round_cb,
 					void *cb_arg);
 int tegra20_clk_prepare_emc_mc_same_freq(struct clk *emc_clk, bool same);
 
+typedef int (tegra124_emc_prepare_timing_change_cb)(struct tegra_emc *emc,
+						    unsigned long rate);
+typedef void (tegra124_emc_complete_timing_change_cb)(struct tegra_emc *emc,
+						      unsigned long rate);
+void tegra124_clk_set_emc_callbacks(tegra124_emc_prepare_timing_change_cb *prep_cb,
+				    tegra124_emc_complete_timing_change_cb *complete_cb);
+
 struct tegra210_clk_emc_config {
 	unsigned long rate;
 	bool same_freq;
diff --git a/include/soc/tegra/emc.h b/include/soc/tegra/emc.h
deleted file mode 100644
index 05199a97ccf4..000000000000
--- a/include/soc/tegra/emc.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
- */
-
-#ifndef __SOC_TEGRA_EMC_H__
-#define __SOC_TEGRA_EMC_H__
-
-struct tegra_emc;
-
-int tegra_emc_prepare_timing_change(struct tegra_emc *emc,
-				    unsigned long rate);
-void tegra_emc_complete_timing_change(struct tegra_emc *emc,
-				      unsigned long rate);
-
-#endif /* __SOC_TEGRA_EMC_H__ */
-- 
cgit v1.2.3


From 8553a979fcd03448a4096c7d431b7ee1a52bfca3 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <utkarsh.h.patel@intel.com>
Date: Wed, 9 Dec 2020 22:09:03 -0800
Subject: platform/chrome: cros_ec_typec: Send mux configuration acknowledgment
 to EC

In some corner cases downgrade of the superspeed typec device(e.g. Dell
typec Dock, apple dongle) was seen because before the SOC mux configuration
finishes, EC starts configuring the next mux state.

With this change, once the SOC mux is configured, kernel will send an
acknowledgment to EC via Host command EC_CMD_USB_PD_MUX_ACK [1].
After sending the host event EC will wait for the acknowledgment from
kernel before starting the PD negotiation for the next mux state. This
helps to have a framework to build better error handling along with the
synchronization of timing sensitive mux states.

This change also brings in corresponding EC header updates from the EC code
base [1].

[1]:
https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/master/include/ec_commands.h

Signed-off-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
Reviewed-by: Prashant Malani <pmalani@chromium.org>
Signed-off-by: Benson Leung <bleung@chromium.org>
Link: https://lore.kernel.org/r/20201210060903.2205-3-utkarsh.h.patel@intel.com
---
 drivers/platform/chrome/cros_ec_typec.c        | 16 ++++++++++++++++
 include/linux/platform_data/cros_ec_commands.h | 17 +++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c
index 6068433dd2e2..e724a5eaef1c 100644
--- a/drivers/platform/chrome/cros_ec_typec.c
+++ b/drivers/platform/chrome/cros_ec_typec.c
@@ -81,6 +81,7 @@ struct cros_typec_data {
 	struct notifier_block nb;
 	struct work_struct port_work;
 	bool typec_cmd_supported;
+	bool needs_mux_ack;
 };
 
 static int cros_typec_parse_port_props(struct typec_capability *cap,
@@ -531,6 +532,7 @@ static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num,
 				struct ec_response_usb_pd_control_v2 *pd_ctrl)
 {
 	struct cros_typec_port *port = typec->ports[port_num];
+	struct ec_params_usb_pd_mux_ack mux_ack;
 	enum typec_orientation orientation;
 	int ret;
 
@@ -570,6 +572,18 @@ static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num,
 			mux_flags);
 	}
 
+	if (!typec->needs_mux_ack)
+		return ret;
+
+	/* Sending Acknowledgment to EC */
+	mux_ack.port = port_num;
+
+	if (cros_typec_ec_command(typec, 0, EC_CMD_USB_PD_MUX_ACK, &mux_ack,
+				  sizeof(mux_ack), NULL, 0) < 0)
+		dev_warn(typec->dev,
+			 "Failed to send Mux ACK to EC for port: %d\n",
+			 port_num);
+
 	return ret;
 }
 
@@ -1053,6 +1067,8 @@ static int cros_typec_probe(struct platform_device *pdev)
 
 	typec->typec_cmd_supported = !!cros_typec_feature_supported(typec,
 					EC_FEATURE_TYPEC_CMD);
+	typec->needs_mux_ack = !!cros_typec_feature_supported(typec,
+					EC_FEATURE_TYPEC_MUX_REQUIRE_AP_ACK);
 
 	ret = cros_typec_ec_command(typec, 0, EC_CMD_USB_PD_PORTS, NULL, 0,
 				    &resp, sizeof(resp));
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 86376779ab31..9787715540c7 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -1286,6 +1286,16 @@ enum ec_feature_code {
 	EC_FEATURE_ISH = 40,
 	/* New TCPMv2 TYPEC_ prefaced commands supported */
 	EC_FEATURE_TYPEC_CMD = 41,
+	/*
+	 * The EC will wait for direction from the AP to enter Type-C alternate
+	 * modes or USB4.
+	 */
+	EC_FEATURE_TYPEC_REQUIRE_AP_MODE_ENTRY = 42,
+	/*
+	 * The EC will wait for an acknowledge from the AP after setting the
+	 * mux.
+	 */
+	EC_FEATURE_TYPEC_MUX_REQUIRE_AP_ACK = 43,
 };
 
 #define EC_FEATURE_MASK_0(event_code) BIT(event_code % 32)
@@ -6054,6 +6064,13 @@ struct ec_params_charger_control {
 	uint8_t allow_charging;
 } __ec_align_size1;
 
+/* Get ACK from the USB-C SS muxes */
+#define EC_CMD_USB_PD_MUX_ACK 0x0603
+
+struct ec_params_usb_pd_mux_ack {
+	uint8_t port; /* USB-C port number */
+} __ec_align1;
+
 /*****************************************************************************/
 /*
  * Reserve a range of host commands for board-specific, experimental, or
-- 
cgit v1.2.3


From 8527efc59d45d7135460daac36054f2f213ac08a Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Tue, 5 Jan 2021 19:59:05 -0800
Subject: rpmsg: glink: Guard qcom_glink_ssr_notify() with correct config

The qcom_glink_ssr_notify() function doesn't relate to the SMEM specific
GLINK config, but the common RPMSG_QCOM_GLINK config. Update the guard
to properly reflect this.

Reviewed-by: Alex Elder <elder@linaro.org>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20210106035905.4153692-1-bjorn.andersson@linaro.org
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/rpmsg/qcom_glink.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rpmsg/qcom_glink.h b/include/linux/rpmsg/qcom_glink.h
index daded9fddf36..22fc3a69b683 100644
--- a/include/linux/rpmsg/qcom_glink.h
+++ b/include/linux/rpmsg/qcom_glink.h
@@ -7,12 +7,17 @@
 
 struct qcom_glink;
 
+#if IS_ENABLED(CONFIG_RPMSG_QCOM_GLINK)
+void qcom_glink_ssr_notify(const char *ssr_name);
+#else
+static inline void qcom_glink_ssr_notify(const char *ssr_name) {}
+#endif
+
 #if IS_ENABLED(CONFIG_RPMSG_QCOM_GLINK_SMEM)
 
 struct qcom_glink *qcom_glink_smem_register(struct device *parent,
 					    struct device_node *node);
 void qcom_glink_smem_unregister(struct qcom_glink *glink);
-void qcom_glink_ssr_notify(const char *ssr_name);
 
 #else
 
@@ -24,7 +29,6 @@ qcom_glink_smem_register(struct device *parent,
 }
 
 static inline void qcom_glink_smem_unregister(struct qcom_glink *glink) {}
-static inline void qcom_glink_ssr_notify(const char *ssr_name) {}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 98621ed011c57ba6e52e01a5982b221c9943b6d9 Mon Sep 17 00:00:00 2001
From: Sowjanya Komatineni <skomatineni@nvidia.com>
Date: Mon, 21 Dec 2020 13:17:35 -0800
Subject: spi: spi-mem: Mark dummy transfers by setting dummy_data bit

This patch marks dummy transfer by setting dummy_data bit to 1.

Controllers supporting dummy transfer by hardware use this bit field
to skip software transfer of dummy bytes and use hardware dummy bytes
transfer.

Signed-off-by: Sowjanya Komatineni <skomatineni@nvidia.com>
Link: https://lore.kernel.org/r/1608585459-17250-6-git-send-email-skomatineni@nvidia.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-mem.c   | 1 +
 include/linux/spi/spi.h | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
index f3a3f196e628..c64371ce6c38 100644
--- a/drivers/spi/spi-mem.c
+++ b/drivers/spi/spi-mem.c
@@ -354,6 +354,7 @@ int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op)
 		xfers[xferpos].tx_buf = tmpbuf + op->addr.nbytes + 1;
 		xfers[xferpos].len = op->dummy.nbytes;
 		xfers[xferpos].tx_nbits = op->dummy.buswidth;
+		xfers[xferpos].dummy_data = 1;
 		spi_message_add_tail(&xfers[xferpos], &msg);
 		xferpos++;
 		totalxferlen += op->dummy.nbytes;
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 9bfdfaf286eb..797ef28fa7d5 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -825,6 +825,7 @@ extern void spi_res_release(struct spi_controller *ctlr,
  *      transfer. If 0 the default (from @spi_device) is used.
  * @bits_per_word: select a bits_per_word other than the device default
  *      for this transfer. If 0 the default (from @spi_device) is used.
+ * @dummy_data: indicates transfer is dummy bytes transfer.
  * @cs_change: affects chipselect after this transfer completes
  * @cs_change_delay: delay between cs deassert and assert when
  *      @cs_change is set and @spi_transfer is not the last in @spi_message
@@ -937,6 +938,7 @@ struct spi_transfer {
 	struct sg_table tx_sg;
 	struct sg_table rx_sg;
 
+	unsigned	dummy_data:1;
 	unsigned	cs_change:1;
 	unsigned	tx_nbits:3;
 	unsigned	rx_nbits:3;
-- 
cgit v1.2.3


From f2485a2dc9f0f30fbdd013ad5772975100c71360 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 13 Jun 2020 00:08:44 -0400
Subject: elf_prstatus: collect the common part (everything before pr_reg) into
 a struct

Preparations to doing i386 compat elf_prstatus sanely - rather than duplicating
the beginning of compat_elf_prstatus, take these fields into a separate
structure (compat_elf_prstatus_common), so that it could be reused.  Due to
the incestous relationship between binfmt_elf.c and compat_binfmt_elf.c we
need the same shape change done to native struct elf_prstatus, gathering the
fields prior to pr_reg into a new structure (struct elf_prstatus_common).

Fortunately, offset of pr_reg is always a multiple of 16 with no padding
right before it, so it's possible to turn all the stuff prior to it into
a single member without disturbing the layout.

[build fix from Geert Uytterhoeven folded in]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/kernel/crash.c                   |  2 +-
 arch/mips/kernel/binfmt_elfn32.c           |  7 ++++++-
 arch/mips/kernel/binfmt_elfo32.c           |  6 +++++-
 arch/powerpc/platforms/powernv/opal-core.c |  6 +++---
 arch/s390/kernel/crash_dump.c              |  2 +-
 fs/binfmt_elf.c                            |  8 ++++----
 fs/binfmt_elf_fdpic.c                      | 22 +++++-----------------
 fs/compat_binfmt_elf.c                     |  1 +
 include/linux/elfcore-compat.h             |  7 ++++++-
 include/linux/elfcore.h                    |  7 ++++++-
 kernel/kexec_core.c                        |  2 +-
 11 files changed, 39 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c
index fec70d662d0c..4f47741005d2 100644
--- a/arch/ia64/kernel/crash.c
+++ b/arch/ia64/kernel/crash.c
@@ -43,7 +43,7 @@ crash_save_this_cpu(void)
 
 	elf_greg_t *dst = (elf_greg_t *)&(prstatus->pr_reg);
 	memset(prstatus, 0, sizeof(*prstatus));
-	prstatus->pr_pid = current->pid;
+	prstatus->common.pr_pid = current->pid;
 
 	ia64_dump_cpu_regs(dst);
 	cfm = dst[43];
diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c
index 6ee3f7218c67..136dc0c9300d 100644
--- a/arch/mips/kernel/binfmt_elfn32.c
+++ b/arch/mips/kernel/binfmt_elfn32.c
@@ -44,7 +44,8 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
 #include <linux/math64.h>
 
 #define elf_prstatus elf_prstatus32
-struct elf_prstatus32
+#define elf_prstatus_common elf_prstatus32_common
+struct elf_prstatus32_common
 {
 	struct elf_siginfo pr_info;	/* Info associated with signal */
 	short	pr_cursig;		/* Current signal */
@@ -58,6 +59,10 @@ struct elf_prstatus32
 	struct old_timeval32 pr_stime; /* System time */
 	struct old_timeval32 pr_cutime;/* Cumulative user time */
 	struct old_timeval32 pr_cstime;/* Cumulative system time */
+};
+struct elf_prstatus32
+{
+	struct elf_prstatus32_common common:
 	elf_gregset_t pr_reg;	/* GP registers */
 	int pr_fpvalid;		/* True if math co-processor being used.  */
 };
diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c
index 6dd103d3cebb..b1f4b8f1dee7 100644
--- a/arch/mips/kernel/binfmt_elfo32.c
+++ b/arch/mips/kernel/binfmt_elfo32.c
@@ -49,7 +49,7 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
 #include <linux/math64.h>
 
 #define elf_prstatus elf_prstatus32
-struct elf_prstatus32
+struct elf_prstatus32_common
 {
 	struct elf_siginfo pr_info;	/* Info associated with signal */
 	short	pr_cursig;		/* Current signal */
@@ -63,6 +63,10 @@ struct elf_prstatus32
 	struct old_timeval32 pr_stime; /* System time */
 	struct old_timeval32 pr_cutime;/* Cumulative user time */
 	struct old_timeval32 pr_cstime;/* Cumulative system time */
+};
+struct elf_prstatus32
+{
+	struct elf_prstatus32_common common:
 	elf_gregset_t pr_reg;	/* GP registers */
 	int pr_fpvalid;		/* True if math co-processor being used.  */
 };
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
index 23571f0b555a..0d9ba70f7251 100644
--- a/arch/powerpc/platforms/powernv/opal-core.c
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -119,8 +119,8 @@ static void fill_prstatus(struct elf_prstatus *prstatus, int pir,
 	 * As a PIR value could also be '0', add an offset of '100'
 	 * to every PIR to avoid misinterpretations in GDB.
 	 */
-	prstatus->pr_pid  = cpu_to_be32(100 + pir);
-	prstatus->pr_ppid = cpu_to_be32(1);
+	prstatus->common.pr_pid  = cpu_to_be32(100 + pir);
+	prstatus->common.pr_ppid = cpu_to_be32(1);
 
 	/*
 	 * Indicate SIGUSR1 for crash initiated from kernel.
@@ -130,7 +130,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, int pir,
 		short sig;
 
 		sig = kernel_initiated ? SIGUSR1 : SIGTERM;
-		prstatus->pr_cursig = cpu_to_be16(sig);
+		prstatus->common.pr_cursig = cpu_to_be16(sig);
 	}
 }
 
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index 205b2e2648aa..0e36dfc9ccd6 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -365,7 +365,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa)
 	memcpy(&nt_prstatus.pr_reg.gprs, sa->gprs, sizeof(sa->gprs));
 	memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw));
 	memcpy(&nt_prstatus.pr_reg.acrs, sa->acrs, sizeof(sa->acrs));
-	nt_prstatus.pr_pid = cpu;
+	nt_prstatus.common.pr_pid = cpu;
 	/* Prepare fpregset (floating point) note */
 	memset(&nt_fpregset, 0, sizeof(nt_fpregset));
 	memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc));
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8380478d3d92..4c1550b13899 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1495,7 +1495,7 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
  * fill up all the fields in prstatus from the given task struct, except
  * registers which need to be filled up separately.
  */
-static void fill_prstatus(struct elf_prstatus *prstatus,
+static void fill_prstatus(struct elf_prstatus_common *prstatus,
 		struct task_struct *p, long signr)
 {
 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
@@ -1736,7 +1736,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 	 * than being the whole note contents.  We fill the reset in here.
 	 * We assume that regset 0 is NT_PRSTATUS.
 	 */
-	fill_prstatus(&t->prstatus, t->task, signr);
+	fill_prstatus(&t->prstatus.common, t->task, signr);
 	regset_get(t->task, &view->regsets[0],
 		   sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
 
@@ -1958,7 +1958,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 	struct task_struct *p = t->thread;
 	t->num_notes = 0;
 
-	fill_prstatus(&t->prstatus, p, signr);
+	fill_prstatus(&t->prstatus.common, p, signr);
 	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);	
 	
 	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
@@ -2037,7 +2037,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	}
 	/* now collect the dump for the current */
 	memset(info->prstatus, 0, sizeof(*info->prstatus));
-	fill_prstatus(info->prstatus, current, siginfo->si_signo);
+	fill_prstatus(&info->prstatus->common, current, siginfo->si_signo);
 	elf_core_copy_regs(&info->prstatus->pr_reg, regs);
 
 	/* Set up header */
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index be4062b8ba75..03d81a14bcbf 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1191,18 +1191,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 
 struct elf_prstatus_fdpic
 {
-	struct elf_siginfo pr_info;	/* Info associated with signal */
-	short	pr_cursig;		/* Current signal */
-	unsigned long pr_sigpend;	/* Set of pending signals */
-	unsigned long pr_sighold;	/* Set of held signals */
-	pid_t	pr_pid;
-	pid_t	pr_ppid;
-	pid_t	pr_pgrp;
-	pid_t	pr_sid;
-	struct __kernel_old_timeval pr_utime;	/* User time */
-	struct __kernel_old_timeval pr_stime;	/* System time */
-	struct __kernel_old_timeval pr_cutime;	/* Cumulative user time */
-	struct __kernel_old_timeval pr_cstime;	/* Cumulative system time */
+	struct elf_prstatus_common	common;
 	elf_gregset_t pr_reg;	/* GP registers */
 	/* When using FDPIC, the loadmap addresses need to be communicated
 	 * to GDB in order for GDB to do the necessary relocations.  The
@@ -1301,7 +1290,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
  * fill up all the fields in prstatus from the given task struct, except
  * registers which need to be filled up separately.
  */
-static void fill_prstatus(struct elf_prstatus_fdpic *prstatus,
+static void fill_prstatus(struct elf_prstatus_common *prstatus,
 			  struct task_struct *p, long signr)
 {
 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
@@ -1332,9 +1321,6 @@ static void fill_prstatus(struct elf_prstatus_fdpic *prstatus,
 	}
 	prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime);
 	prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime);
-
-	prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
-	prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
 }
 
 static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
@@ -1405,7 +1391,9 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
 	if (!t)
 		return t;
 
-	fill_prstatus(&t->prstatus, p, signr);
+	fill_prstatus(&t->prstatus.common, p, signr);
+	t->prstatus.pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
+	t->prstatus.pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
 	regset_get(p, &view->regsets[0],
 		   sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
 
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 962df845ed51..feb48a5c2d44 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -50,6 +50,7 @@
  * which requires asm/elf.h to define compat_elf_gregset_t et al.
  */
 #define elf_prstatus	compat_elf_prstatus
+#define elf_prstatus_common	compat_elf_prstatus_common
 #define elf_prpsinfo	compat_elf_prpsinfo
 
 #undef ns_to_kernel_old_timeval
diff --git a/include/linux/elfcore-compat.h b/include/linux/elfcore-compat.h
index 10485f0c9740..4aeda5f1f038 100644
--- a/include/linux/elfcore-compat.h
+++ b/include/linux/elfcore-compat.h
@@ -17,7 +17,7 @@ struct compat_elf_siginfo
 	compat_int_t			si_errno;
 };
 
-struct compat_elf_prstatus
+struct compat_elf_prstatus_common
 {
 	struct compat_elf_siginfo	pr_info;
 	short				pr_cursig;
@@ -31,6 +31,11 @@ struct compat_elf_prstatus
 	struct old_timeval32		pr_stime;
 	struct old_timeval32		pr_cutime;
 	struct old_timeval32		pr_cstime;
+};
+
+struct compat_elf_prstatus
+{
+	struct compat_elf_prstatus_common	common;
 	compat_elf_gregset_t		pr_reg;
 	compat_int_t			pr_fpvalid;
 };
diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h
index de51c1bef27d..2aaa15779d50 100644
--- a/include/linux/elfcore.h
+++ b/include/linux/elfcore.h
@@ -29,7 +29,7 @@ struct elf_siginfo
  * the SVR4 structure, but more Linuxy, with things that Linux does
  * not support and which gdb doesn't really use excluded.
  */
-struct elf_prstatus
+struct elf_prstatus_common
 {
 	struct elf_siginfo pr_info;	/* Info associated with signal */
 	short	pr_cursig;		/* Current signal */
@@ -43,6 +43,11 @@ struct elf_prstatus
 	struct __kernel_old_timeval pr_stime;	/* System time */
 	struct __kernel_old_timeval pr_cutime;	/* Cumulative user time */
 	struct __kernel_old_timeval pr_cstime;	/* Cumulative system time */
+};
+
+struct elf_prstatus
+{
+	struct elf_prstatus_common common;
 	elf_gregset_t pr_reg;	/* GP registers */
 	int pr_fpvalid;		/* True if math co-processor being used.  */
 };
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 4f8efc278aa7..80905e5aa8ae 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1076,7 +1076,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
 	if (!buf)
 		return;
 	memset(&prstatus, 0, sizeof(prstatus));
-	prstatus.pr_pid = current->pid;
+	prstatus.common.pr_pid = current->pid;
 	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
 	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
 			      &prstatus, sizeof(prstatus));
-- 
cgit v1.2.3


From 7facdc426f86c67e579e49e100943cbccc43e1c6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 13 Jun 2020 23:03:25 -0400
Subject: [amd64] clean PRSTATUS_SIZE/SET_PR_FPVALID up properly

To get rid of hardcoded size/offset in those macros we need to have
a definition of i386 variant of struct elf_prstatus.  However, we can't
do that in asm/compat.h - the types needed for that are not there and
adding an include of asm/user32.h into asm/compat.h would cause a lot
of mess.

That could be conveniently done in elfcore-compat.h, but currently there
is nowhere to put arch-dependent parts of it - no asm/elfcore-compat.h.
So we introduce a new file (asm/elfcore-compat.h, present on architectures
that have CONFIG_ARCH_HAS_ELFCORE_COMPAT set, currently only on x86),
have it pulled by linux/elfcore-compat.h and move the definitions there.

As a side benefit, we don't need to worry about accidental inclusion of
that file into binfmt_elf.c itself, so we don't need the dance with
COMPAT_PRSTATUS_SIZE, etc. - only fs/compat_binfmt_elf.c will see
that header.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/Kconfig                          |  3 +++
 arch/x86/Kconfig                      |  1 +
 arch/x86/include/asm/compat.h         | 14 --------------
 arch/x86/include/asm/elfcore-compat.h | 31 +++++++++++++++++++++++++++++++
 fs/compat_binfmt_elf.c                |  8 --------
 include/linux/elfcore-compat.h        | 18 +++++++++++-------
 6 files changed, 46 insertions(+), 29 deletions(-)
 create mode 100644 arch/x86/include/asm/elfcore-compat.h

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 78c6f05b10f9..a17ced73b23c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1105,6 +1105,9 @@ config HAVE_ARCH_PFN_VALID
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	bool
 
+config ARCH_HAS_ELFCORE_COMPAT
+	bool
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7b6dd10b162a..302a6b453c91 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -31,6 +31,7 @@ config X86_64
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE
 	select SWIOTLB
+	select ARCH_HAS_ELFCORE_COMPAT
 
 config FORCE_DYNAMIC_FTRACE
 	def_bool y
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 15cf0f831dee..be09c7eac89f 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -159,20 +159,6 @@ struct compat_shmid64_ds {
 	compat_ulong_t __unused5;
 };
 
-/*
- * The type of struct elf_prstatus.pr_reg in compatible core dumps.
- */
-typedef struct user_regs_struct compat_elf_gregset_t;
-
-/* Full regset -- prstatus on x32, otherwise on ia32 */
-#define COMPAT_PRSTATUS_SIZE (user_64bit_mode(task_pt_regs(current)) \
-	? sizeof(struct compat_elf_prstatus) \
-	: 144)
-#define COMPAT_SET_PR_FPVALID(S) \
-	(*(user_64bit_mode(task_pt_regs(current)) \
-	       ? &(S)->pr_fpvalid	\
-               : (int *)((void *)(S) + 140)) = 1)
-
 #ifdef CONFIG_X86_X32_ABI
 #define COMPAT_USE_64BIT_TIME \
 	(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
diff --git a/arch/x86/include/asm/elfcore-compat.h b/arch/x86/include/asm/elfcore-compat.h
new file mode 100644
index 000000000000..f1b6c7a8d8fc
--- /dev/null
+++ b/arch/x86/include/asm/elfcore-compat.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_ELFCORE_COMPAT_H
+#define _ASM_X86_ELFCORE_COMPAT_H
+
+#include <asm/user32.h>
+
+/*
+ * On amd64 we have two 32bit ABIs - i386 and x32.  The latter
+ * has bigger registers, so we use it for compat_elf_regset_t.
+ * The former uses i386_elf_prstatus and PRSTATUS_SIZE/SET_PR_FPVALID
+ * are used to choose the size and location of ->pr_fpvalid of
+ * the layout actually used.
+ */
+typedef struct user_regs_struct compat_elf_gregset_t;
+
+struct i386_elf_prstatus
+{
+	struct compat_elf_prstatus_common	common;
+	struct user_regs_struct32		pr_reg;
+	compat_int_t			pr_fpvalid;
+};
+
+#define PRSTATUS_SIZE \
+	(user_64bit_mode(task_pt_regs(current)) \
+		? sizeof(struct compat_elf_prstatus) \
+		: sizeof(struct i386_elf_prstatus))
+#define SET_PR_FPVALID(S) \
+	(*(user_64bit_mode(task_pt_regs(current)) \
+		? &(S)->pr_fpvalid 	\
+		: &((struct i386_elf_prstatus *)(S))->pr_fpvalid) = 1)
+
+#endif
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index feb48a5c2d44..a6321415aba0 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -96,14 +96,6 @@
 #define	ELF_EXEC_PAGESIZE	COMPAT_ELF_EXEC_PAGESIZE
 #endif
 
-#ifdef	COMPAT_PRSTATUS_SIZE
-#define	PRSTATUS_SIZE COMPAT_PRSTATUS_SIZE
-#endif
-
-#ifdef	COMPAT_SET_PR_FPVALID
-#define	SET_PR_FPVALID(S) COMPAT_SET_PR_FPVALID(S)
-#endif
-
 #ifdef	COMPAT_ELF_PLAT_INIT
 #undef	ELF_PLAT_INIT
 #define	ELF_PLAT_INIT		COMPAT_ELF_PLAT_INIT
diff --git a/include/linux/elfcore-compat.h b/include/linux/elfcore-compat.h
index 4aeda5f1f038..e272c3d452ce 100644
--- a/include/linux/elfcore-compat.h
+++ b/include/linux/elfcore-compat.h
@@ -33,13 +33,6 @@ struct compat_elf_prstatus_common
 	struct old_timeval32		pr_cstime;
 };
 
-struct compat_elf_prstatus
-{
-	struct compat_elf_prstatus_common	common;
-	compat_elf_gregset_t		pr_reg;
-	compat_int_t			pr_fpvalid;
-};
-
 struct compat_elf_prpsinfo
 {
 	char				pr_state;
@@ -54,4 +47,15 @@ struct compat_elf_prpsinfo
 	char				pr_psargs[ELF_PRARGSZ];
 };
 
+#ifdef CONFIG_ARCH_HAS_ELFCORE_COMPAT
+#include <asm/elfcore-compat.h>
+#endif
+
+struct compat_elf_prstatus
+{
+	struct compat_elf_prstatus_common	common;
+	compat_elf_gregset_t		pr_reg;
+	compat_int_t			pr_fpvalid;
+};
+
 #endif /* _LINUX_ELFCORE_COMPAT_H */
-- 
cgit v1.2.3


From c167b9c7e3d6131b4a4865c112a3dbc86d2e997d Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Mon, 21 Dec 2020 19:39:51 +0100
Subject: platform/surface: Add Surface Aggregator subsystem

Add Surface System Aggregator Module core and Surface Serial Hub driver,
required for the embedded controller found on Microsoft Surface devices.

The Surface System Aggregator Module (SSAM, SAM or Surface Aggregator)
is an embedded controller (EC) found on 4th and later generation
Microsoft Surface devices, with the exception of the Surface Go series.
This EC provides various functionality, depending on the device in
question. This can include battery status and thermal reporting (5th and
later generations), but also HID keyboard (6th+) and touchpad input
(7th+) on Surface Laptop and Surface Book 3 series devices.

This patch provides the basic necessities for communication with the SAM
EC on 5th and later generation devices. On these devices, the EC
provides an interface that acts as serial device, called the Surface
Serial Hub (SSH). 4th generation devices, on which the EC interface is
provided via an HID-over-I2C device, are not supported by this patch.

Specifically, this patch adds a driver for the SSH device (device HID
MSHW0084 in ACPI), as well as a controller structure and associated API.
This represents the functional core of the Surface Aggregator kernel
subsystem, introduced with this patch, and will be expanded upon in
subsequent commits.

The SSH driver acts as the main attachment point for this subsystem and
sets-up and manages the controller structure. The controller in turn
provides a basic communication interface, allowing to send requests from
host to EC and receiving the corresponding responses, as well as
managing and receiving events, sent from EC to host. It is structured
into multiple layers, with the top layer presenting the API used by
other kernel drivers and the lower layers modeled after the serial
protocol used for communication.

Said other drivers are then responsible for providing the (Surface model
specific) functionality accessible through the EC (e.g. battery status
reporting, thermal information, ...) via said controller structure and
API, and will be added in future commits.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20201221183959.1186143-2-luzmaximilian@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 MAINTAINERS                                        |    8 +
 drivers/platform/surface/Kconfig                   |    2 +
 drivers/platform/surface/Makefile                  |    1 +
 drivers/platform/surface/aggregator/Kconfig        |   42 +
 drivers/platform/surface/aggregator/Makefile       |   10 +
 drivers/platform/surface/aggregator/controller.c   | 2504 ++++++++++++++++++++
 drivers/platform/surface/aggregator/controller.h   |  276 +++
 drivers/platform/surface/aggregator/core.c         |  787 ++++++
 drivers/platform/surface/aggregator/ssh_msgb.h     |  205 ++
 .../platform/surface/aggregator/ssh_packet_layer.c | 1710 +++++++++++++
 .../platform/surface/aggregator/ssh_packet_layer.h |  187 ++
 drivers/platform/surface/aggregator/ssh_parser.c   |  228 ++
 drivers/platform/surface/aggregator/ssh_parser.h   |  154 ++
 .../surface/aggregator/ssh_request_layer.c         | 1211 ++++++++++
 .../surface/aggregator/ssh_request_layer.h         |  143 ++
 include/linux/surface_aggregator/controller.h      |  824 +++++++
 include/linux/surface_aggregator/serial_hub.h      |  672 ++++++
 17 files changed, 8964 insertions(+)
 create mode 100644 drivers/platform/surface/aggregator/Kconfig
 create mode 100644 drivers/platform/surface/aggregator/Makefile
 create mode 100644 drivers/platform/surface/aggregator/controller.c
 create mode 100644 drivers/platform/surface/aggregator/controller.h
 create mode 100644 drivers/platform/surface/aggregator/core.c
 create mode 100644 drivers/platform/surface/aggregator/ssh_msgb.h
 create mode 100644 drivers/platform/surface/aggregator/ssh_packet_layer.c
 create mode 100644 drivers/platform/surface/aggregator/ssh_packet_layer.h
 create mode 100644 drivers/platform/surface/aggregator/ssh_parser.c
 create mode 100644 drivers/platform/surface/aggregator/ssh_parser.h
 create mode 100644 drivers/platform/surface/aggregator/ssh_request_layer.c
 create mode 100644 drivers/platform/surface/aggregator/ssh_request_layer.h
 create mode 100644 include/linux/surface_aggregator/controller.h
 create mode 100644 include/linux/surface_aggregator/serial_hub.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 546aa66428c9..845a245f562c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11813,6 +11813,14 @@ L:	platform-driver-x86@vger.kernel.org
 S:	Supported
 F:	drivers/platform/surface/surfacepro3_button.c
 
+MICROSOFT SURFACE SYSTEM AGGREGATOR SUBSYSTEM
+M:	Maximilian Luz <luzmaximilian@gmail.com>
+S:	Maintained
+W:	https://github.com/linux-surface/surface-aggregator-module
+C:	irc://chat.freenode.net/##linux-surface
+F:	drivers/platform/surface/aggregator/
+F:	include/linux/surface_aggregator/
+
 MICROTEK X6 SCANNER
 M:	Oliver Neukum <oliver@neukum.org>
 S:	Maintained
diff --git a/drivers/platform/surface/Kconfig b/drivers/platform/surface/Kconfig
index 2c941cdac9ee..2916a47b130e 100644
--- a/drivers/platform/surface/Kconfig
+++ b/drivers/platform/surface/Kconfig
@@ -56,4 +56,6 @@ config SURFACE_PRO3_BUTTON
 	help
 	  This driver handles the power/home/volume buttons on the Microsoft Surface Pro 3/4 tablet.
 
+source "drivers/platform/surface/aggregator/Kconfig"
+
 endif # SURFACE_PLATFORMS
diff --git a/drivers/platform/surface/Makefile b/drivers/platform/surface/Makefile
index cedfb027ded1..034134fe0264 100644
--- a/drivers/platform/surface/Makefile
+++ b/drivers/platform/surface/Makefile
@@ -7,5 +7,6 @@
 obj-$(CONFIG_SURFACE3_WMI)		+= surface3-wmi.o
 obj-$(CONFIG_SURFACE_3_BUTTON)		+= surface3_button.o
 obj-$(CONFIG_SURFACE_3_POWER_OPREGION)	+= surface3_power.o
+obj-$(CONFIG_SURFACE_AGGREGATOR)	+= aggregator/
 obj-$(CONFIG_SURFACE_GPE)		+= surface_gpe.o
 obj-$(CONFIG_SURFACE_PRO3_BUTTON)	+= surfacepro3_button.o
diff --git a/drivers/platform/surface/aggregator/Kconfig b/drivers/platform/surface/aggregator/Kconfig
new file mode 100644
index 000000000000..e9f4ad96e40a
--- /dev/null
+++ b/drivers/platform/surface/aggregator/Kconfig
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: GPL-2.0+
+# Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+
+menuconfig SURFACE_AGGREGATOR
+	tristate "Microsoft Surface System Aggregator Module Subsystem and Drivers"
+	depends on SERIAL_DEV_BUS
+	select CRC_CCITT
+	help
+	  The Surface System Aggregator Module (Surface SAM or SSAM) is an
+	  embedded controller (EC) found on 5th- and later-generation Microsoft
+	  Surface devices (i.e. Surface Pro 5, Surface Book 2, Surface Laptop,
+	  and newer, with exception of Surface Go series devices).
+
+	  Depending on the device in question, this EC provides varying
+	  functionality, including:
+	  - EC access from ACPI via Surface ACPI Notify (5th- and 6th-generation)
+	  - battery status information (all devices)
+	  - thermal sensor access (all devices)
+	  - performance mode / cooling mode control (all devices)
+	  - clipboard detachment system control (Surface Book 2 and 3)
+	  - HID / keyboard input (Surface Laptops, Surface Book 3)
+
+	  This option controls whether the Surface SAM subsystem core will be
+	  built. This includes a driver for the Surface Serial Hub (SSH), which
+	  is the device responsible for the communication with the EC, and a
+	  basic kernel interface exposing the EC functionality to other client
+	  drivers, i.e. allowing them to make requests to the EC and receive
+	  events from it. Selecting this option alone will not provide any
+	  client drivers and therefore no functionality beyond the in-kernel
+	  interface. Said functionality is the responsibility of the respective
+	  client drivers.
+
+	  Note: While 4th-generation Surface devices also make use of a SAM EC,
+	  due to a difference in the communication interface of the controller,
+	  only 5th and later generations are currently supported. Specifically,
+	  devices using SAM-over-SSH are supported, whereas devices using
+	  SAM-over-HID, which is used on the 4th generation, are currently not
+	  supported.
+
+	  Choose m if you want to build the SAM subsystem core and SSH driver as
+	  module, y if you want to build it into the kernel and n if you don't
+	  want it at all.
diff --git a/drivers/platform/surface/aggregator/Makefile b/drivers/platform/surface/aggregator/Makefile
new file mode 100644
index 000000000000..faad18d4a7f2
--- /dev/null
+++ b/drivers/platform/surface/aggregator/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0+
+# Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+
+obj-$(CONFIG_SURFACE_AGGREGATOR) += surface_aggregator.o
+
+surface_aggregator-objs := core.o
+surface_aggregator-objs += ssh_parser.o
+surface_aggregator-objs += ssh_packet_layer.o
+surface_aggregator-objs += ssh_request_layer.o
+surface_aggregator-objs += controller.o
diff --git a/drivers/platform/surface/aggregator/controller.c b/drivers/platform/surface/aggregator/controller.c
new file mode 100644
index 000000000000..488318cf2098
--- /dev/null
+++ b/drivers/platform/surface/aggregator/controller.c
@@ -0,0 +1,2504 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Main SSAM/SSH controller structure and functionality.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#include <linux/acpi.h>
+#include <linux/atomic.h>
+#include <linux/completion.h>
+#include <linux/gpio/consumer.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/serdev.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include <linux/surface_aggregator/controller.h>
+#include <linux/surface_aggregator/serial_hub.h>
+
+#include "controller.h"
+#include "ssh_msgb.h"
+#include "ssh_request_layer.h"
+
+
+/* -- Safe counters. -------------------------------------------------------- */
+
+/**
+ * ssh_seq_reset() - Reset/initialize sequence ID counter.
+ * @c: The counter to reset.
+ */
+static void ssh_seq_reset(struct ssh_seq_counter *c)
+{
+	WRITE_ONCE(c->value, 0);
+}
+
+/**
+ * ssh_seq_next() - Get next sequence ID.
+ * @c: The counter providing the sequence IDs.
+ *
+ * Return: Returns the next sequence ID of the counter.
+ */
+static u8 ssh_seq_next(struct ssh_seq_counter *c)
+{
+	u8 old = READ_ONCE(c->value);
+	u8 new = old + 1;
+	u8 ret;
+
+	while (unlikely((ret = cmpxchg(&c->value, old, new)) != old)) {
+		old = ret;
+		new = old + 1;
+	}
+
+	return old;
+}
+
+/**
+ * ssh_rqid_reset() - Reset/initialize request ID counter.
+ * @c: The counter to reset.
+ */
+static void ssh_rqid_reset(struct ssh_rqid_counter *c)
+{
+	WRITE_ONCE(c->value, 0);
+}
+
+/**
+ * ssh_rqid_next() - Get next request ID.
+ * @c: The counter providing the request IDs.
+ *
+ * Return: Returns the next request ID of the counter, skipping any reserved
+ * request IDs.
+ */
+static u16 ssh_rqid_next(struct ssh_rqid_counter *c)
+{
+	u16 old = READ_ONCE(c->value);
+	u16 new = ssh_rqid_next_valid(old);
+	u16 ret;
+
+	while (unlikely((ret = cmpxchg(&c->value, old, new)) != old)) {
+		old = ret;
+		new = ssh_rqid_next_valid(old);
+	}
+
+	return old;
+}
+
+
+/* -- Event notifier/callbacks. --------------------------------------------- */
+/*
+ * The notifier system is based on linux/notifier.h, specifically the SRCU
+ * implementation. The difference to that is, that some bits of the notifier
+ * call return value can be tracked across multiple calls. This is done so
+ * that handling of events can be tracked and a warning can be issued in case
+ * an event goes unhandled. The idea of that warning is that it should help
+ * discover and identify new/currently unimplemented features.
+ */
+
+/**
+ * ssam_event_matches_notifier() - Test if an event matches a notifier.
+ * @n: The event notifier to test against.
+ * @event: The event to test.
+ *
+ * Return: Returns %true if the given event matches the given notifier
+ * according to the rules set in the notifier's event mask, %false otherwise.
+ */
+static bool ssam_event_matches_notifier(const struct ssam_event_notifier *n,
+					const struct ssam_event *event)
+{
+	bool match = n->event.id.target_category == event->target_category;
+
+	if (n->event.mask & SSAM_EVENT_MASK_TARGET)
+		match &= n->event.reg.target_id == event->target_id;
+
+	if (n->event.mask & SSAM_EVENT_MASK_INSTANCE)
+		match &= n->event.id.instance == event->instance_id;
+
+	return match;
+}
+
+/**
+ * ssam_nfblk_call_chain() - Call event notifier callbacks of the given chain.
+ * @nh:    The notifier head for which the notifier callbacks should be called.
+ * @event: The event data provided to the callbacks.
+ *
+ * Call all registered notifier callbacks in order of their priority until
+ * either no notifier is left or a notifier returns a value with the
+ * %SSAM_NOTIF_STOP bit set. Note that this bit is automatically set via
+ * ssam_notifier_from_errno() on any non-zero error value.
+ *
+ * Return: Returns the notifier status value, which contains the notifier
+ * status bits (%SSAM_NOTIF_HANDLED and %SSAM_NOTIF_STOP) as well as a
+ * potential error value returned from the last executed notifier callback.
+ * Use ssam_notifier_to_errno() to convert this value to the original error
+ * value.
+ */
+static int ssam_nfblk_call_chain(struct ssam_nf_head *nh, struct ssam_event *event)
+{
+	struct ssam_event_notifier *nf;
+	int ret = 0, idx;
+
+	idx = srcu_read_lock(&nh->srcu);
+
+	list_for_each_entry_rcu(nf, &nh->head, base.node,
+				srcu_read_lock_held(&nh->srcu)) {
+		if (ssam_event_matches_notifier(nf, event)) {
+			ret = (ret & SSAM_NOTIF_STATE_MASK) | nf->base.fn(nf, event);
+			if (ret & SSAM_NOTIF_STOP)
+				break;
+		}
+	}
+
+	srcu_read_unlock(&nh->srcu, idx);
+	return ret;
+}
+
+/**
+ * ssam_nfblk_insert() - Insert a new notifier block into the given notifier
+ * list.
+ * @nh: The notifier head into which the block should be inserted.
+ * @nb: The notifier block to add.
+ *
+ * Note: This function must be synchronized by the caller with respect to other
+ * insert, find, and/or remove calls by holding ``struct ssam_nf.lock``.
+ *
+ * Return: Returns zero on success, %-EEXIST if the notifier block has already
+ * been registered.
+ */
+static int ssam_nfblk_insert(struct ssam_nf_head *nh, struct ssam_notifier_block *nb)
+{
+	struct ssam_notifier_block *p;
+	struct list_head *h;
+
+	/* Runs under lock, no need for RCU variant. */
+	list_for_each(h, &nh->head) {
+		p = list_entry(h, struct ssam_notifier_block, node);
+
+		if (unlikely(p == nb)) {
+			WARN(1, "double register detected");
+			return -EEXIST;
+		}
+
+		if (nb->priority > p->priority)
+			break;
+	}
+
+	list_add_tail_rcu(&nb->node, h);
+	return 0;
+}
+
+/**
+ * ssam_nfblk_find() - Check if a notifier block is registered on the given
+ * notifier head.
+ * list.
+ * @nh: The notifier head on which to search.
+ * @nb: The notifier block to search for.
+ *
+ * Note: This function must be synchronized by the caller with respect to other
+ * insert, find, and/or remove calls by holding ``struct ssam_nf.lock``.
+ *
+ * Return: Returns true if the given notifier block is registered on the given
+ * notifier head, false otherwise.
+ */
+static bool ssam_nfblk_find(struct ssam_nf_head *nh, struct ssam_notifier_block *nb)
+{
+	struct ssam_notifier_block *p;
+
+	/* Runs under lock, no need for RCU variant. */
+	list_for_each_entry(p, &nh->head, node) {
+		if (p == nb)
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * ssam_nfblk_remove() - Remove a notifier block from its notifier list.
+ * @nb: The notifier block to be removed.
+ *
+ * Note: This function must be synchronized by the caller with respect to
+ * other insert, find, and/or remove calls by holding ``struct ssam_nf.lock``.
+ * Furthermore, the caller _must_ ensure SRCU synchronization by calling
+ * synchronize_srcu() with ``nh->srcu`` after leaving the critical section, to
+ * ensure that the removed notifier block is not in use any more.
+ */
+static void ssam_nfblk_remove(struct ssam_notifier_block *nb)
+{
+	list_del_rcu(&nb->node);
+}
+
+/**
+ * ssam_nf_head_init() - Initialize the given notifier head.
+ * @nh: The notifier head to initialize.
+ */
+static int ssam_nf_head_init(struct ssam_nf_head *nh)
+{
+	int status;
+
+	status = init_srcu_struct(&nh->srcu);
+	if (status)
+		return status;
+
+	INIT_LIST_HEAD(&nh->head);
+	return 0;
+}
+
+/**
+ * ssam_nf_head_destroy() - Deinitialize the given notifier head.
+ * @nh: The notifier head to deinitialize.
+ */
+static void ssam_nf_head_destroy(struct ssam_nf_head *nh)
+{
+	cleanup_srcu_struct(&nh->srcu);
+}
+
+
+/* -- Event/notification registry. ------------------------------------------ */
+
+/**
+ * struct ssam_nf_refcount_key - Key used for event activation reference
+ * counting.
+ * @reg: The registry via which the event is enabled/disabled.
+ * @id:  The ID uniquely describing the event.
+ */
+struct ssam_nf_refcount_key {
+	struct ssam_event_registry reg;
+	struct ssam_event_id id;
+};
+
+/**
+ * struct ssam_nf_refcount_entry - RB-tree entry for reference counting event
+ * activations.
+ * @node:     The node of this entry in the rb-tree.
+ * @key:      The key of the event.
+ * @refcount: The reference-count of the event.
+ * @flags:    The flags used when enabling the event.
+ */
+struct ssam_nf_refcount_entry {
+	struct rb_node node;
+	struct ssam_nf_refcount_key key;
+	int refcount;
+	u8 flags;
+};
+
+/**
+ * ssam_nf_refcount_inc() - Increment reference-/activation-count of the given
+ * event.
+ * @nf:  The notifier system reference.
+ * @reg: The registry used to enable/disable the event.
+ * @id:  The event ID.
+ *
+ * Increments the reference-/activation-count associated with the specified
+ * event type/ID, allocating a new entry for this event ID if necessary. A
+ * newly allocated entry will have a refcount of one.
+ *
+ * Note: ``nf->lock`` must be held when calling this function.
+ *
+ * Return: Returns the refcount entry on success. Returns an error pointer
+ * with %-ENOSPC if there have already been %INT_MAX events of the specified
+ * ID and type registered, or %-ENOMEM if the entry could not be allocated.
+ */
+static struct ssam_nf_refcount_entry *
+ssam_nf_refcount_inc(struct ssam_nf *nf, struct ssam_event_registry reg,
+		     struct ssam_event_id id)
+{
+	struct ssam_nf_refcount_entry *entry;
+	struct ssam_nf_refcount_key key;
+	struct rb_node **link = &nf->refcount.rb_node;
+	struct rb_node *parent = NULL;
+	int cmp;
+
+	lockdep_assert_held(&nf->lock);
+
+	key.reg = reg;
+	key.id = id;
+
+	while (*link) {
+		entry = rb_entry(*link, struct ssam_nf_refcount_entry, node);
+		parent = *link;
+
+		cmp = memcmp(&key, &entry->key, sizeof(key));
+		if (cmp < 0) {
+			link = &(*link)->rb_left;
+		} else if (cmp > 0) {
+			link = &(*link)->rb_right;
+		} else if (entry->refcount < INT_MAX) {
+			entry->refcount++;
+			return entry;
+		} else {
+			WARN_ON(1);
+			return ERR_PTR(-ENOSPC);
+		}
+	}
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return ERR_PTR(-ENOMEM);
+
+	entry->key = key;
+	entry->refcount = 1;
+
+	rb_link_node(&entry->node, parent, link);
+	rb_insert_color(&entry->node, &nf->refcount);
+
+	return entry;
+}
+
+/**
+ * ssam_nf_refcount_dec() - Decrement reference-/activation-count of the given
+ * event.
+ * @nf:  The notifier system reference.
+ * @reg: The registry used to enable/disable the event.
+ * @id:  The event ID.
+ *
+ * Decrements the reference-/activation-count of the specified event,
+ * returning its entry. If the returned entry has a refcount of zero, the
+ * caller is responsible for freeing it using kfree().
+ *
+ * Note: ``nf->lock`` must be held when calling this function.
+ *
+ * Return: Returns the refcount entry on success or %NULL if the entry has not
+ * been found.
+ */
+static struct ssam_nf_refcount_entry *
+ssam_nf_refcount_dec(struct ssam_nf *nf, struct ssam_event_registry reg,
+		     struct ssam_event_id id)
+{
+	struct ssam_nf_refcount_entry *entry;
+	struct ssam_nf_refcount_key key;
+	struct rb_node *node = nf->refcount.rb_node;
+	int cmp;
+
+	lockdep_assert_held(&nf->lock);
+
+	key.reg = reg;
+	key.id = id;
+
+	while (node) {
+		entry = rb_entry(node, struct ssam_nf_refcount_entry, node);
+
+		cmp = memcmp(&key, &entry->key, sizeof(key));
+		if (cmp < 0) {
+			node = node->rb_left;
+		} else if (cmp > 0) {
+			node = node->rb_right;
+		} else {
+			entry->refcount--;
+			if (entry->refcount == 0)
+				rb_erase(&entry->node, &nf->refcount);
+
+			return entry;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * ssam_nf_refcount_empty() - Test if the notification system has any
+ * enabled/active events.
+ * @nf: The notification system.
+ */
+static bool ssam_nf_refcount_empty(struct ssam_nf *nf)
+{
+	return RB_EMPTY_ROOT(&nf->refcount);
+}
+
+/**
+ * ssam_nf_call() - Call notification callbacks for the provided event.
+ * @nf:    The notifier system
+ * @dev:   The associated device, only used for logging.
+ * @rqid:  The request ID of the event.
+ * @event: The event provided to the callbacks.
+ *
+ * Execute registered callbacks in order of their priority until either no
+ * callback is left or a callback returns a value with the %SSAM_NOTIF_STOP
+ * bit set. Note that this bit is set automatically when converting non-zero
+ * error values via ssam_notifier_from_errno() to notifier values.
+ *
+ * Also note that any callback that could handle an event should return a value
+ * with bit %SSAM_NOTIF_HANDLED set, indicating that the event does not go
+ * unhandled/ignored. In case no registered callback could handle an event,
+ * this function will emit a warning.
+ *
+ * In case a callback failed, this function will emit an error message.
+ */
+static void ssam_nf_call(struct ssam_nf *nf, struct device *dev, u16 rqid,
+			 struct ssam_event *event)
+{
+	struct ssam_nf_head *nf_head;
+	int status, nf_ret;
+
+	if (!ssh_rqid_is_event(rqid)) {
+		dev_warn(dev, "event: unsupported rqid: %#06x\n", rqid);
+		return;
+	}
+
+	nf_head = &nf->head[ssh_rqid_to_event(rqid)];
+	nf_ret = ssam_nfblk_call_chain(nf_head, event);
+	status = ssam_notifier_to_errno(nf_ret);
+
+	if (status < 0) {
+		dev_err(dev,
+			"event: error handling event: %d (tc: %#04x, tid: %#04x, cid: %#04x, iid: %#04x)\n",
+			status, event->target_category, event->target_id,
+			event->command_id, event->instance_id);
+	} else if (!(nf_ret & SSAM_NOTIF_HANDLED)) {
+		dev_warn(dev,
+			 "event: unhandled event (rqid: %#04x, tc: %#04x, tid: %#04x, cid: %#04x, iid: %#04x)\n",
+			 rqid, event->target_category, event->target_id,
+			 event->command_id, event->instance_id);
+	}
+}
+
+/**
+ * ssam_nf_init() - Initialize the notifier system.
+ * @nf: The notifier system to initialize.
+ */
+static int ssam_nf_init(struct ssam_nf *nf)
+{
+	int i, status;
+
+	for (i = 0; i < SSH_NUM_EVENTS; i++) {
+		status = ssam_nf_head_init(&nf->head[i]);
+		if (status)
+			break;
+	}
+
+	if (status) {
+		while (i--)
+			ssam_nf_head_destroy(&nf->head[i]);
+
+		return status;
+	}
+
+	mutex_init(&nf->lock);
+	return 0;
+}
+
+/**
+ * ssam_nf_destroy() - Deinitialize the notifier system.
+ * @nf: The notifier system to deinitialize.
+ */
+static void ssam_nf_destroy(struct ssam_nf *nf)
+{
+	int i;
+
+	for (i = 0; i < SSH_NUM_EVENTS; i++)
+		ssam_nf_head_destroy(&nf->head[i]);
+
+	mutex_destroy(&nf->lock);
+}
+
+
+/* -- Event/async request completion system. -------------------------------- */
+
+#define SSAM_CPLT_WQ_NAME	"ssam_cpltq"
+
+/*
+ * SSAM_CPLT_WQ_BATCH - Maximum number of event item completions executed per
+ * work execution. Used to prevent livelocking of the workqueue. Value chosen
+ * via educated guess, may be adjusted.
+ */
+#define SSAM_CPLT_WQ_BATCH	10
+
+/**
+ * ssam_event_item_alloc() - Allocate an event item with the given payload size.
+ * @len:   The event payload length.
+ * @flags: The flags used for allocation.
+ *
+ * Allocate an event item with the given payload size. Sets the item
+ * operations and payload length values. The item free callback (``ops.free``)
+ * should not be overwritten after this call.
+ *
+ * Return: Returns the newly allocated event item.
+ */
+static struct ssam_event_item *ssam_event_item_alloc(size_t len, gfp_t flags)
+{
+	struct ssam_event_item *item;
+
+	item = kzalloc(struct_size(item, event.data, len), flags);
+	if (!item)
+		return NULL;
+
+	item->event.length = len;
+	return item;
+}
+
+/**
+ * ssam_event_queue_push() - Push an event item to the event queue.
+ * @q:    The event queue.
+ * @item: The item to add.
+ */
+static void ssam_event_queue_push(struct ssam_event_queue *q,
+				  struct ssam_event_item *item)
+{
+	spin_lock(&q->lock);
+	list_add_tail(&item->node, &q->head);
+	spin_unlock(&q->lock);
+}
+
+/**
+ * ssam_event_queue_pop() - Pop the next event item from the event queue.
+ * @q: The event queue.
+ *
+ * Returns and removes the next event item from the queue. Returns %NULL If
+ * there is no event item left.
+ */
+static struct ssam_event_item *ssam_event_queue_pop(struct ssam_event_queue *q)
+{
+	struct ssam_event_item *item;
+
+	spin_lock(&q->lock);
+	item = list_first_entry_or_null(&q->head, struct ssam_event_item, node);
+	if (item)
+		list_del(&item->node);
+	spin_unlock(&q->lock);
+
+	return item;
+}
+
+/**
+ * ssam_event_queue_is_empty() - Check if the event queue is empty.
+ * @q: The event queue.
+ */
+static bool ssam_event_queue_is_empty(struct ssam_event_queue *q)
+{
+	bool empty;
+
+	spin_lock(&q->lock);
+	empty = list_empty(&q->head);
+	spin_unlock(&q->lock);
+
+	return empty;
+}
+
+/**
+ * ssam_cplt_get_event_queue() - Get the event queue for the given parameters.
+ * @cplt: The completion system on which to look for the queue.
+ * @tid:  The target ID of the queue.
+ * @rqid: The request ID representing the event ID for which to get the queue.
+ *
+ * Return: Returns the event queue corresponding to the event type described
+ * by the given parameters. If the request ID does not represent an event,
+ * this function returns %NULL. If the target ID is not supported, this
+ * function will fall back to the default target ID (``tid = 1``).
+ */
+static
+struct ssam_event_queue *ssam_cplt_get_event_queue(struct ssam_cplt *cplt,
+						   u8 tid, u16 rqid)
+{
+	u16 event = ssh_rqid_to_event(rqid);
+	u16 tidx = ssh_tid_to_index(tid);
+
+	if (!ssh_rqid_is_event(rqid)) {
+		dev_err(cplt->dev, "event: unsupported request ID: %#06x\n", rqid);
+		return NULL;
+	}
+
+	if (!ssh_tid_is_valid(tid)) {
+		dev_warn(cplt->dev, "event: unsupported target ID: %u\n", tid);
+		tidx = 0;
+	}
+
+	return &cplt->event.target[tidx].queue[event];
+}
+
+/**
+ * ssam_cplt_submit() - Submit a work item to the completion system workqueue.
+ * @cplt: The completion system.
+ * @work: The work item to submit.
+ */
+static bool ssam_cplt_submit(struct ssam_cplt *cplt, struct work_struct *work)
+{
+	return queue_work(cplt->wq, work);
+}
+
+/**
+ * ssam_cplt_submit_event() - Submit an event to the completion system.
+ * @cplt: The completion system.
+ * @item: The event item to submit.
+ *
+ * Submits the event to the completion system by queuing it on the event item
+ * queue and queuing the respective event queue work item on the completion
+ * workqueue, which will eventually complete the event.
+ *
+ * Return: Returns zero on success, %-EINVAL if there is no event queue that
+ * can handle the given event item.
+ */
+static int ssam_cplt_submit_event(struct ssam_cplt *cplt,
+				  struct ssam_event_item *item)
+{
+	struct ssam_event_queue *evq;
+
+	evq = ssam_cplt_get_event_queue(cplt, item->event.target_id, item->rqid);
+	if (!evq)
+		return -EINVAL;
+
+	ssam_event_queue_push(evq, item);
+	ssam_cplt_submit(cplt, &evq->work);
+	return 0;
+}
+
+/**
+ * ssam_cplt_flush() - Flush the completion system.
+ * @cplt: The completion system.
+ *
+ * Flush the completion system by waiting until all currently submitted work
+ * items have been completed.
+ *
+ * Note: This function does not guarantee that all events will have been
+ * handled once this call terminates. In case of a larger number of
+ * to-be-completed events, the event queue work function may re-schedule its
+ * work item, which this flush operation will ignore.
+ *
+ * This operation is only intended to, during normal operation prior to
+ * shutdown, try to complete most events and requests to get them out of the
+ * system while the system is still fully operational. It does not aim to
+ * provide any guarantee that all of them have been handled.
+ */
+static void ssam_cplt_flush(struct ssam_cplt *cplt)
+{
+	flush_workqueue(cplt->wq);
+}
+
+static void ssam_event_queue_work_fn(struct work_struct *work)
+{
+	struct ssam_event_queue *queue;
+	struct ssam_event_item *item;
+	struct ssam_nf *nf;
+	struct device *dev;
+	unsigned int iterations = SSAM_CPLT_WQ_BATCH;
+
+	queue = container_of(work, struct ssam_event_queue, work);
+	nf = &queue->cplt->event.notif;
+	dev = queue->cplt->dev;
+
+	/* Limit number of processed events to avoid livelocking. */
+	do {
+		item = ssam_event_queue_pop(queue);
+		if (!item)
+			return;
+
+		ssam_nf_call(nf, dev, item->rqid, &item->event);
+		kfree(item);
+	} while (--iterations);
+
+	if (!ssam_event_queue_is_empty(queue))
+		ssam_cplt_submit(queue->cplt, &queue->work);
+}
+
+/**
+ * ssam_event_queue_init() - Initialize an event queue.
+ * @cplt: The completion system on which the queue resides.
+ * @evq:  The event queue to initialize.
+ */
+static void ssam_event_queue_init(struct ssam_cplt *cplt,
+				  struct ssam_event_queue *evq)
+{
+	evq->cplt = cplt;
+	spin_lock_init(&evq->lock);
+	INIT_LIST_HEAD(&evq->head);
+	INIT_WORK(&evq->work, ssam_event_queue_work_fn);
+}
+
+/**
+ * ssam_cplt_init() - Initialize completion system.
+ * @cplt: The completion system to initialize.
+ * @dev:  The device used for logging.
+ */
+static int ssam_cplt_init(struct ssam_cplt *cplt, struct device *dev)
+{
+	struct ssam_event_target *target;
+	int status, c, i;
+
+	cplt->dev = dev;
+
+	cplt->wq = create_workqueue(SSAM_CPLT_WQ_NAME);
+	if (!cplt->wq)
+		return -ENOMEM;
+
+	for (c = 0; c < ARRAY_SIZE(cplt->event.target); c++) {
+		target = &cplt->event.target[c];
+
+		for (i = 0; i < ARRAY_SIZE(target->queue); i++)
+			ssam_event_queue_init(cplt, &target->queue[i]);
+	}
+
+	status = ssam_nf_init(&cplt->event.notif);
+	if (status)
+		destroy_workqueue(cplt->wq);
+
+	return status;
+}
+
+/**
+ * ssam_cplt_destroy() - Deinitialize the completion system.
+ * @cplt: The completion system to deinitialize.
+ *
+ * Deinitialize the given completion system and ensure that all pending, i.e.
+ * yet-to-be-completed, event items and requests have been handled.
+ */
+static void ssam_cplt_destroy(struct ssam_cplt *cplt)
+{
+	/*
+	 * Note: destroy_workqueue ensures that all currently queued work will
+	 * be fully completed and the workqueue drained. This means that this
+	 * call will inherently also free any queued ssam_event_items, thus we
+	 * don't have to take care of that here explicitly.
+	 */
+	destroy_workqueue(cplt->wq);
+	ssam_nf_destroy(&cplt->event.notif);
+}
+
+
+/* -- Main SSAM device structures. ------------------------------------------ */
+
+/**
+ * ssam_controller_device() - Get the &struct device associated with this
+ * controller.
+ * @c: The controller for which to get the device.
+ *
+ * Return: Returns the &struct device associated with this controller,
+ * providing its lower-level transport.
+ */
+struct device *ssam_controller_device(struct ssam_controller *c)
+{
+	return ssh_rtl_get_device(&c->rtl);
+}
+EXPORT_SYMBOL_GPL(ssam_controller_device);
+
+static void __ssam_controller_release(struct kref *kref)
+{
+	struct ssam_controller *ctrl = to_ssam_controller(kref, kref);
+
+	/*
+	 * The lock-call here is to satisfy lockdep. At this point we really
+	 * expect this to be the last remaining reference to the controller.
+	 * Anything else is a bug.
+	 */
+	ssam_controller_lock(ctrl);
+	ssam_controller_destroy(ctrl);
+	ssam_controller_unlock(ctrl);
+
+	kfree(ctrl);
+}
+
+/**
+ * ssam_controller_get() - Increment reference count of controller.
+ * @c: The controller.
+ *
+ * Return: Returns the controller provided as input.
+ */
+struct ssam_controller *ssam_controller_get(struct ssam_controller *c)
+{
+	if (c)
+		kref_get(&c->kref);
+	return c;
+}
+EXPORT_SYMBOL_GPL(ssam_controller_get);
+
+/**
+ * ssam_controller_put() - Decrement reference count of controller.
+ * @c: The controller.
+ */
+void ssam_controller_put(struct ssam_controller *c)
+{
+	if (c)
+		kref_put(&c->kref, __ssam_controller_release);
+}
+EXPORT_SYMBOL_GPL(ssam_controller_put);
+
+/**
+ * ssam_controller_statelock() - Lock the controller against state transitions.
+ * @c: The controller to lock.
+ *
+ * Lock the controller against state transitions. Holding this lock guarantees
+ * that the controller will not transition between states, i.e. if the
+ * controller is in state "started", when this lock has been acquired, it will
+ * remain in this state at least until the lock has been released.
+ *
+ * Multiple clients may concurrently hold this lock. In other words: The
+ * ``statelock`` functions represent the read-lock part of a r/w-semaphore.
+ * Actions causing state transitions of the controller must be executed while
+ * holding the write-part of this r/w-semaphore (see ssam_controller_lock()
+ * and ssam_controller_unlock() for that).
+ *
+ * See ssam_controller_stateunlock() for the corresponding unlock function.
+ */
+void ssam_controller_statelock(struct ssam_controller *c)
+{
+	down_read(&c->lock);
+}
+EXPORT_SYMBOL_GPL(ssam_controller_statelock);
+
+/**
+ * ssam_controller_stateunlock() - Unlock controller state transitions.
+ * @c: The controller to unlock.
+ *
+ * See ssam_controller_statelock() for the corresponding lock function.
+ */
+void ssam_controller_stateunlock(struct ssam_controller *c)
+{
+	up_read(&c->lock);
+}
+EXPORT_SYMBOL_GPL(ssam_controller_stateunlock);
+
+/**
+ * ssam_controller_lock() - Acquire the main controller lock.
+ * @c: The controller to lock.
+ *
+ * This lock must be held for any state transitions, including transition to
+ * suspend/resumed states and during shutdown. See ssam_controller_statelock()
+ * for more details on controller locking.
+ *
+ * See ssam_controller_unlock() for the corresponding unlock function.
+ */
+void ssam_controller_lock(struct ssam_controller *c)
+{
+	down_write(&c->lock);
+}
+
+/*
+ * ssam_controller_unlock() - Release the main controller lock.
+ * @c: The controller to unlock.
+ *
+ * See ssam_controller_lock() for the corresponding lock function.
+ */
+void ssam_controller_unlock(struct ssam_controller *c)
+{
+	up_write(&c->lock);
+}
+
+static void ssam_handle_event(struct ssh_rtl *rtl,
+			      const struct ssh_command *cmd,
+			      const struct ssam_span *data)
+{
+	struct ssam_controller *ctrl = to_ssam_controller(rtl, rtl);
+	struct ssam_event_item *item;
+
+	item = ssam_event_item_alloc(data->len, GFP_KERNEL);
+	if (!item)
+		return;
+
+	item->rqid = get_unaligned_le16(&cmd->rqid);
+	item->event.target_category = cmd->tc;
+	item->event.target_id = cmd->tid_in;
+	item->event.command_id = cmd->cid;
+	item->event.instance_id = cmd->iid;
+	memcpy(&item->event.data[0], data->ptr, data->len);
+
+	if (WARN_ON(ssam_cplt_submit_event(&ctrl->cplt, item)))
+		kfree(item);
+}
+
+static const struct ssh_rtl_ops ssam_rtl_ops = {
+	.handle_event = ssam_handle_event,
+};
+
+static bool ssam_notifier_is_empty(struct ssam_controller *ctrl);
+static void ssam_notifier_unregister_all(struct ssam_controller *ctrl);
+
+#define SSAM_SSH_DSM_REVISION	0
+
+/* d5e383e1-d892-4a76-89fc-f6aaae7ed5b5 */
+static const guid_t SSAM_SSH_DSM_GUID =
+	GUID_INIT(0xd5e383e1, 0xd892, 0x4a76,
+		  0x89, 0xfc, 0xf6, 0xaa, 0xae, 0x7e, 0xd5, 0xb5);
+
+enum ssh_dsm_fn {
+	SSH_DSM_FN_SSH_POWER_PROFILE             = 0x05,
+	SSH_DSM_FN_SCREEN_ON_SLEEP_IDLE_TIMEOUT  = 0x06,
+	SSH_DSM_FN_SCREEN_OFF_SLEEP_IDLE_TIMEOUT = 0x07,
+	SSH_DSM_FN_D3_CLOSES_HANDLE              = 0x08,
+	SSH_DSM_FN_SSH_BUFFER_SIZE               = 0x09,
+};
+
+static int ssam_dsm_get_functions(acpi_handle handle, u64 *funcs)
+{
+	union acpi_object *obj;
+	u64 mask = 0;
+	int i;
+
+	*funcs = 0;
+
+	/*
+	 * The _DSM function is only present on newer models. It is not
+	 * present on 5th and 6th generation devices (i.e. up to and including
+	 * Surface Pro 6, Surface Laptop 2, Surface Book 2).
+	 *
+	 * If the _DSM is not present, indicate that no function is supported.
+	 * This will result in default values being set.
+	 */
+	if (!acpi_has_method(handle, "_DSM"))
+		return 0;
+
+	obj = acpi_evaluate_dsm_typed(handle, &SSAM_SSH_DSM_GUID,
+				      SSAM_SSH_DSM_REVISION, 0, NULL,
+				      ACPI_TYPE_BUFFER);
+	if (!obj)
+		return -EIO;
+
+	for (i = 0; i < obj->buffer.length && i < 8; i++)
+		mask |= (((u64)obj->buffer.pointer[i]) << (i * 8));
+
+	if (mask & BIT(0))
+		*funcs = mask;
+
+	ACPI_FREE(obj);
+	return 0;
+}
+
+static int ssam_dsm_load_u32(acpi_handle handle, u64 funcs, u64 func, u32 *ret)
+{
+	union acpi_object *obj;
+	u64 val;
+
+	if (!(funcs & BIT(func)))
+		return 0; /* Not supported, leave *ret at its default value */
+
+	obj = acpi_evaluate_dsm_typed(handle, &SSAM_SSH_DSM_GUID,
+				      SSAM_SSH_DSM_REVISION, func, NULL,
+				      ACPI_TYPE_INTEGER);
+	if (!obj)
+		return -EIO;
+
+	val = obj->integer.value;
+	ACPI_FREE(obj);
+
+	if (val > U32_MAX)
+		return -ERANGE;
+
+	*ret = val;
+	return 0;
+}
+
+/**
+ * ssam_controller_caps_load_from_acpi() - Load controller capabilities from
+ * ACPI _DSM.
+ * @handle: The handle of the ACPI controller/SSH device.
+ * @caps:   Where to store the capabilities in.
+ *
+ * Initializes the given controller capabilities with default values, then
+ * checks and, if the respective _DSM functions are available, loads the
+ * actual capabilities from the _DSM.
+ *
+ * Return: Returns zero on success, a negative error code on failure.
+ */
+static
+int ssam_controller_caps_load_from_acpi(acpi_handle handle,
+					struct ssam_controller_caps *caps)
+{
+	u32 d3_closes_handle = false;
+	u64 funcs;
+	int status;
+
+	/* Set defaults. */
+	caps->ssh_power_profile = U32_MAX;
+	caps->screen_on_sleep_idle_timeout = U32_MAX;
+	caps->screen_off_sleep_idle_timeout = U32_MAX;
+	caps->d3_closes_handle = false;
+	caps->ssh_buffer_size = U32_MAX;
+
+	/* Pre-load supported DSM functions. */
+	status = ssam_dsm_get_functions(handle, &funcs);
+	if (status)
+		return status;
+
+	/* Load actual values from ACPI, if present. */
+	status = ssam_dsm_load_u32(handle, funcs, SSH_DSM_FN_SSH_POWER_PROFILE,
+				   &caps->ssh_power_profile);
+	if (status)
+		return status;
+
+	status = ssam_dsm_load_u32(handle, funcs,
+				   SSH_DSM_FN_SCREEN_ON_SLEEP_IDLE_TIMEOUT,
+				   &caps->screen_on_sleep_idle_timeout);
+	if (status)
+		return status;
+
+	status = ssam_dsm_load_u32(handle, funcs,
+				   SSH_DSM_FN_SCREEN_OFF_SLEEP_IDLE_TIMEOUT,
+				   &caps->screen_off_sleep_idle_timeout);
+	if (status)
+		return status;
+
+	status = ssam_dsm_load_u32(handle, funcs, SSH_DSM_FN_D3_CLOSES_HANDLE,
+				   &d3_closes_handle);
+	if (status)
+		return status;
+
+	caps->d3_closes_handle = !!d3_closes_handle;
+
+	status = ssam_dsm_load_u32(handle, funcs, SSH_DSM_FN_SSH_BUFFER_SIZE,
+				   &caps->ssh_buffer_size);
+	if (status)
+		return status;
+
+	return 0;
+}
+
+/**
+ * ssam_controller_init() - Initialize SSAM controller.
+ * @ctrl:   The controller to initialize.
+ * @serdev: The serial device representing the underlying data transport.
+ *
+ * Initializes the given controller. Does neither start receiver nor
+ * transmitter threads. After this call, the controller has to be hooked up to
+ * the serdev core separately via &struct serdev_device_ops, relaying calls to
+ * ssam_controller_receive_buf() and ssam_controller_write_wakeup(). Once the
+ * controller has been hooked up, transmitter and receiver threads may be
+ * started via ssam_controller_start(). These setup steps need to be completed
+ * before controller can be used for requests.
+ */
+int ssam_controller_init(struct ssam_controller *ctrl,
+			 struct serdev_device *serdev)
+{
+	acpi_handle handle = ACPI_HANDLE(&serdev->dev);
+	int status;
+
+	init_rwsem(&ctrl->lock);
+	kref_init(&ctrl->kref);
+
+	status = ssam_controller_caps_load_from_acpi(handle, &ctrl->caps);
+	if (status)
+		return status;
+
+	dev_dbg(&serdev->dev,
+		"device capabilities:\n"
+		"  ssh_power_profile:             %u\n"
+		"  ssh_buffer_size:               %u\n"
+		"  screen_on_sleep_idle_timeout:  %u\n"
+		"  screen_off_sleep_idle_timeout: %u\n"
+		"  d3_closes_handle:              %u\n",
+		ctrl->caps.ssh_power_profile,
+		ctrl->caps.ssh_buffer_size,
+		ctrl->caps.screen_on_sleep_idle_timeout,
+		ctrl->caps.screen_off_sleep_idle_timeout,
+		ctrl->caps.d3_closes_handle);
+
+	ssh_seq_reset(&ctrl->counter.seq);
+	ssh_rqid_reset(&ctrl->counter.rqid);
+
+	/* Initialize event/request completion system. */
+	status = ssam_cplt_init(&ctrl->cplt, &serdev->dev);
+	if (status)
+		return status;
+
+	/* Initialize request and packet transport layers. */
+	status = ssh_rtl_init(&ctrl->rtl, serdev, &ssam_rtl_ops);
+	if (status) {
+		ssam_cplt_destroy(&ctrl->cplt);
+		return status;
+	}
+
+	/*
+	 * Set state via write_once even though we expect to be in an
+	 * exclusive context, due to smoke-testing in
+	 * ssam_request_sync_submit().
+	 */
+	WRITE_ONCE(ctrl->state, SSAM_CONTROLLER_INITIALIZED);
+	return 0;
+}
+
+/**
+ * ssam_controller_start() - Start the receiver and transmitter threads of the
+ * controller.
+ * @ctrl: The controller.
+ *
+ * Note: When this function is called, the controller should be properly
+ * hooked up to the serdev core via &struct serdev_device_ops. Please refer
+ * to ssam_controller_init() for more details on controller initialization.
+ *
+ * This function must be called with the main controller lock held (i.e. by
+ * calling ssam_controller_lock()).
+ */
+int ssam_controller_start(struct ssam_controller *ctrl)
+{
+	int status;
+
+	lockdep_assert_held_write(&ctrl->lock);
+
+	if (ctrl->state != SSAM_CONTROLLER_INITIALIZED)
+		return -EINVAL;
+
+	status = ssh_rtl_start(&ctrl->rtl);
+	if (status)
+		return status;
+
+	/*
+	 * Set state via write_once even though we expect to be locked/in an
+	 * exclusive context, due to smoke-testing in
+	 * ssam_request_sync_submit().
+	 */
+	WRITE_ONCE(ctrl->state, SSAM_CONTROLLER_STARTED);
+	return 0;
+}
+
+/*
+ * SSAM_CTRL_SHUTDOWN_FLUSH_TIMEOUT - Timeout for flushing requests during
+ * shutdown.
+ *
+ * Chosen to be larger than one full request timeout, including packets timing
+ * out. This value should give ample time to complete any outstanding requests
+ * during normal operation and account for the odd package timeout.
+ */
+#define SSAM_CTRL_SHUTDOWN_FLUSH_TIMEOUT	msecs_to_jiffies(5000)
+
+/**
+ * ssam_controller_shutdown() - Shut down the controller.
+ * @ctrl: The controller.
+ *
+ * Shuts down the controller by flushing all pending requests and stopping the
+ * transmitter and receiver threads. All requests submitted after this call
+ * will fail with %-ESHUTDOWN. While it is discouraged to do so, this function
+ * is safe to use in parallel with ongoing request submission.
+ *
+ * In the course of this shutdown procedure, all currently registered
+ * notifiers will be unregistered. It is, however, strongly recommended to not
+ * rely on this behavior, and instead the party registering the notifier
+ * should unregister it before the controller gets shut down, e.g. via the
+ * SSAM bus which guarantees client devices to be removed before a shutdown.
+ *
+ * Note that events may still be pending after this call, but, due to the
+ * notifiers being unregistered, these events will be dropped when the
+ * controller is subsequently destroyed via ssam_controller_destroy().
+ *
+ * This function must be called with the main controller lock held (i.e. by
+ * calling ssam_controller_lock()).
+ */
+void ssam_controller_shutdown(struct ssam_controller *ctrl)
+{
+	enum ssam_controller_state s = ctrl->state;
+	int status;
+
+	lockdep_assert_held_write(&ctrl->lock);
+
+	if (s == SSAM_CONTROLLER_UNINITIALIZED || s == SSAM_CONTROLLER_STOPPED)
+		return;
+
+	/*
+	 * Try to flush pending events and requests while everything still
+	 * works. Note: There may still be packets and/or requests in the
+	 * system after this call (e.g. via control packets submitted by the
+	 * packet transport layer or flush timeout / failure, ...). Those will
+	 * be handled with the ssh_rtl_shutdown() call below.
+	 */
+	status = ssh_rtl_flush(&ctrl->rtl, SSAM_CTRL_SHUTDOWN_FLUSH_TIMEOUT);
+	if (status) {
+		ssam_err(ctrl, "failed to flush request transport layer: %d\n",
+			 status);
+	}
+
+	/* Try to flush all currently completing requests and events. */
+	ssam_cplt_flush(&ctrl->cplt);
+
+	/*
+	 * We expect all notifiers to have been removed by the respective client
+	 * driver that set them up at this point. If this warning occurs, some
+	 * client driver has not done that...
+	 */
+	WARN_ON(!ssam_notifier_is_empty(ctrl));
+
+	/*
+	 * Nevertheless, we should still take care of drivers that don't behave
+	 * well. Thus disable all enabled events, unregister all notifiers.
+	 */
+	ssam_notifier_unregister_all(ctrl);
+
+	/*
+	 * Cancel remaining requests. Ensure no new ones can be queued and stop
+	 * threads.
+	 */
+	ssh_rtl_shutdown(&ctrl->rtl);
+
+	/*
+	 * Set state via write_once even though we expect to be locked/in an
+	 * exclusive context, due to smoke-testing in
+	 * ssam_request_sync_submit().
+	 */
+	WRITE_ONCE(ctrl->state, SSAM_CONTROLLER_STOPPED);
+	ctrl->rtl.ptl.serdev = NULL;
+}
+
+/**
+ * ssam_controller_destroy() - Destroy the controller and free its resources.
+ * @ctrl: The controller.
+ *
+ * Ensures that all resources associated with the controller get freed. This
+ * function should only be called after the controller has been stopped via
+ * ssam_controller_shutdown(). In general, this function should not be called
+ * directly. The only valid place to call this function directly is during
+ * initialization, before the controller has been fully initialized and passed
+ * to other processes. This function is called automatically when the
+ * reference count of the controller reaches zero.
+ *
+ * This function must be called with the main controller lock held (i.e. by
+ * calling ssam_controller_lock()).
+ */
+void ssam_controller_destroy(struct ssam_controller *ctrl)
+{
+	lockdep_assert_held_write(&ctrl->lock);
+
+	if (ctrl->state == SSAM_CONTROLLER_UNINITIALIZED)
+		return;
+
+	WARN_ON(ctrl->state != SSAM_CONTROLLER_STOPPED);
+
+	/*
+	 * Note: New events could still have been received after the previous
+	 * flush in ssam_controller_shutdown, before the request transport layer
+	 * has been shut down. At this point, after the shutdown, we can be sure
+	 * that no new events will be queued. The call to ssam_cplt_destroy will
+	 * ensure that those remaining are being completed and freed.
+	 */
+
+	/* Actually free resources. */
+	ssam_cplt_destroy(&ctrl->cplt);
+	ssh_rtl_destroy(&ctrl->rtl);
+
+	/*
+	 * Set state via write_once even though we expect to be locked/in an
+	 * exclusive context, due to smoke-testing in
+	 * ssam_request_sync_submit().
+	 */
+	WRITE_ONCE(ctrl->state, SSAM_CONTROLLER_UNINITIALIZED);
+}
+
+/**
+ * ssam_controller_suspend() - Suspend the controller.
+ * @ctrl: The controller to suspend.
+ *
+ * Marks the controller as suspended. Note that display-off and D0-exit
+ * notifications have to be sent manually before transitioning the controller
+ * into the suspended state via this function.
+ *
+ * See ssam_controller_resume() for the corresponding resume function.
+ *
+ * Return: Returns %-EINVAL if the controller is currently not in the
+ * "started" state.
+ */
+int ssam_controller_suspend(struct ssam_controller *ctrl)
+{
+	ssam_controller_lock(ctrl);
+
+	if (ctrl->state != SSAM_CONTROLLER_STARTED) {
+		ssam_controller_unlock(ctrl);
+		return -EINVAL;
+	}
+
+	ssam_dbg(ctrl, "pm: suspending controller\n");
+
+	/*
+	 * Set state via write_once even though we're locked, due to
+	 * smoke-testing in ssam_request_sync_submit().
+	 */
+	WRITE_ONCE(ctrl->state, SSAM_CONTROLLER_SUSPENDED);
+
+	ssam_controller_unlock(ctrl);
+	return 0;
+}
+
+/**
+ * ssam_controller_resume() - Resume the controller from suspend.
+ * @ctrl: The controller to resume.
+ *
+ * Resume the controller from the suspended state it was put into via
+ * ssam_controller_suspend(). This function does not issue display-on and
+ * D0-entry notifications. If required, those have to be sent manually after
+ * this call.
+ *
+ * Return: Returns %-EINVAL if the controller is currently not suspended.
+ */
+int ssam_controller_resume(struct ssam_controller *ctrl)
+{
+	ssam_controller_lock(ctrl);
+
+	if (ctrl->state != SSAM_CONTROLLER_SUSPENDED) {
+		ssam_controller_unlock(ctrl);
+		return -EINVAL;
+	}
+
+	ssam_dbg(ctrl, "pm: resuming controller\n");
+
+	/*
+	 * Set state via write_once even though we're locked, due to
+	 * smoke-testing in ssam_request_sync_submit().
+	 */
+	WRITE_ONCE(ctrl->state, SSAM_CONTROLLER_STARTED);
+
+	ssam_controller_unlock(ctrl);
+	return 0;
+}
+
+
+/* -- Top-level request interface ------------------------------------------- */
+
+/**
+ * ssam_request_write_data() - Construct and write SAM request message to
+ * buffer.
+ * @buf:  The buffer to write the data to.
+ * @ctrl: The controller via which the request will be sent.
+ * @spec: The request data and specification.
+ *
+ * Constructs a SAM/SSH request message and writes it to the provided buffer.
+ * The request and transport counters, specifically RQID and SEQ, will be set
+ * in this call. These counters are obtained from the controller. It is thus
+ * only valid to send the resulting message via the controller specified here.
+ *
+ * For calculation of the required buffer size, refer to the
+ * SSH_COMMAND_MESSAGE_LENGTH() macro.
+ *
+ * Return: Returns the number of bytes used in the buffer on success. Returns
+ * %-EINVAL if the payload length provided in the request specification is too
+ * large (larger than %SSH_COMMAND_MAX_PAYLOAD_SIZE) or if the provided buffer
+ * is too small.
+ */
+ssize_t ssam_request_write_data(struct ssam_span *buf,
+				struct ssam_controller *ctrl,
+				const struct ssam_request *spec)
+{
+	struct msgbuf msgb;
+	u16 rqid;
+	u8 seq;
+
+	if (spec->length > SSH_COMMAND_MAX_PAYLOAD_SIZE)
+		return -EINVAL;
+
+	if (SSH_COMMAND_MESSAGE_LENGTH(spec->length) > buf->len)
+		return -EINVAL;
+
+	msgb_init(&msgb, buf->ptr, buf->len);
+	seq = ssh_seq_next(&ctrl->counter.seq);
+	rqid = ssh_rqid_next(&ctrl->counter.rqid);
+	msgb_push_cmd(&msgb, seq, rqid, spec);
+
+	return msgb_bytes_used(&msgb);
+}
+EXPORT_SYMBOL_GPL(ssam_request_write_data);
+
+static void ssam_request_sync_complete(struct ssh_request *rqst,
+				       const struct ssh_command *cmd,
+				       const struct ssam_span *data, int status)
+{
+	struct ssh_rtl *rtl = ssh_request_rtl(rqst);
+	struct ssam_request_sync *r;
+
+	r = container_of(rqst, struct ssam_request_sync, base);
+	r->status = status;
+
+	if (r->resp)
+		r->resp->length = 0;
+
+	if (status) {
+		rtl_dbg_cond(rtl, "rsp: request failed: %d\n", status);
+		return;
+	}
+
+	if (!data)	/* Handle requests without a response. */
+		return;
+
+	if (!r->resp || !r->resp->pointer) {
+		if (data->len)
+			rtl_warn(rtl, "rsp: no response buffer provided, dropping data\n");
+		return;
+	}
+
+	if (data->len > r->resp->capacity) {
+		rtl_err(rtl,
+			"rsp: response buffer too small, capacity: %zu bytes, got: %zu bytes\n",
+			r->resp->capacity, data->len);
+		r->status = -ENOSPC;
+		return;
+	}
+
+	r->resp->length = data->len;
+	memcpy(r->resp->pointer, data->ptr, data->len);
+}
+
+static void ssam_request_sync_release(struct ssh_request *rqst)
+{
+	complete_all(&container_of(rqst, struct ssam_request_sync, base)->comp);
+}
+
+static const struct ssh_request_ops ssam_request_sync_ops = {
+	.release = ssam_request_sync_release,
+	.complete = ssam_request_sync_complete,
+};
+
+/**
+ * ssam_request_sync_alloc() - Allocate a synchronous request.
+ * @payload_len: The length of the request payload.
+ * @flags:       Flags used for allocation.
+ * @rqst:        Where to store the pointer to the allocated request.
+ * @buffer:      Where to store the buffer descriptor for the message buffer of
+ *               the request.
+ *
+ * Allocates a synchronous request with corresponding message buffer. The
+ * request still needs to be initialized ssam_request_sync_init() before
+ * it can be submitted, and the message buffer data must still be set to the
+ * returned buffer via ssam_request_sync_set_data() after it has been filled,
+ * if need be with adjusted message length.
+ *
+ * After use, the request and its corresponding message buffer should be freed
+ * via ssam_request_sync_free(). The buffer must not be freed separately.
+ *
+ * Return: Returns zero on success, %-ENOMEM if the request could not be
+ * allocated.
+ */
+int ssam_request_sync_alloc(size_t payload_len, gfp_t flags,
+			    struct ssam_request_sync **rqst,
+			    struct ssam_span *buffer)
+{
+	size_t msglen = SSH_COMMAND_MESSAGE_LENGTH(payload_len);
+
+	*rqst = kzalloc(sizeof(**rqst) + msglen, flags);
+	if (!*rqst)
+		return -ENOMEM;
+
+	buffer->ptr = (u8 *)(*rqst + 1);
+	buffer->len = msglen;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ssam_request_sync_alloc);
+
+/**
+ * ssam_request_sync_free() - Free a synchronous request.
+ * @rqst: The request to be freed.
+ *
+ * Free a synchronous request and its corresponding buffer allocated with
+ * ssam_request_sync_alloc(). Do not use for requests allocated on the stack
+ * or via any other function.
+ *
+ * Warning: The caller must ensure that the request is not in use any more.
+ * I.e. the caller must ensure that it has the only reference to the request
+ * and the request is not currently pending. This means that the caller has
+ * either never submitted the request, request submission has failed, or the
+ * caller has waited until the submitted request has been completed via
+ * ssam_request_sync_wait().
+ */
+void ssam_request_sync_free(struct ssam_request_sync *rqst)
+{
+	kfree(rqst);
+}
+EXPORT_SYMBOL_GPL(ssam_request_sync_free);
+
+/**
+ * ssam_request_sync_init() - Initialize a synchronous request struct.
+ * @rqst:  The request to initialize.
+ * @flags: The request flags.
+ *
+ * Initializes the given request struct. Does not initialize the request
+ * message data. This has to be done explicitly after this call via
+ * ssam_request_sync_set_data() and the actual message data has to be written
+ * via ssam_request_write_data().
+ *
+ * Return: Returns zero on success or %-EINVAL if the given flags are invalid.
+ */
+int ssam_request_sync_init(struct ssam_request_sync *rqst,
+			   enum ssam_request_flags flags)
+{
+	int status;
+
+	status = ssh_request_init(&rqst->base, flags, &ssam_request_sync_ops);
+	if (status)
+		return status;
+
+	init_completion(&rqst->comp);
+	rqst->resp = NULL;
+	rqst->status = 0;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ssam_request_sync_init);
+
+/**
+ * ssam_request_sync_submit() - Submit a synchronous request.
+ * @ctrl: The controller with which to submit the request.
+ * @rqst: The request to submit.
+ *
+ * Submit a synchronous request. The request has to be initialized and
+ * properly set up, including response buffer (may be %NULL if no response is
+ * expected) and command message data. This function does not wait for the
+ * request to be completed.
+ *
+ * If this function succeeds, ssam_request_sync_wait() must be used to ensure
+ * that the request has been completed before the response data can be
+ * accessed and/or the request can be freed. On failure, the request may
+ * immediately be freed.
+ *
+ * This function may only be used if the controller is active, i.e. has been
+ * initialized and not suspended.
+ */
+int ssam_request_sync_submit(struct ssam_controller *ctrl,
+			     struct ssam_request_sync *rqst)
+{
+	int status;
+
+	/*
+	 * This is only a superficial check. In general, the caller needs to
+	 * ensure that the controller is initialized and is not (and does not
+	 * get) suspended during use, i.e. until the request has been completed
+	 * (if _absolutely_ necessary, by use of ssam_controller_statelock/
+	 * ssam_controller_stateunlock, but something like ssam_client_link
+	 * should be preferred as this needs to last until the request has been
+	 * completed).
+	 *
+	 * Note that it is actually safe to use this function while the
+	 * controller is in the process of being shut down (as ssh_rtl_submit
+	 * is safe with regards to this), but it is generally discouraged to do
+	 * so.
+	 */
+	if (WARN_ON(READ_ONCE(ctrl->state) != SSAM_CONTROLLER_STARTED)) {
+		ssh_request_put(&rqst->base);
+		return -ENODEV;
+	}
+
+	status = ssh_rtl_submit(&ctrl->rtl, &rqst->base);
+	ssh_request_put(&rqst->base);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(ssam_request_sync_submit);
+
+/**
+ * ssam_request_sync() - Execute a synchronous request.
+ * @ctrl: The controller via which the request will be submitted.
+ * @spec: The request specification and payload.
+ * @rsp:  The response buffer.
+ *
+ * Allocates a synchronous request with its message data buffer on the heap
+ * via ssam_request_sync_alloc(), fully initializes it via the provided
+ * request specification, submits it, and finally waits for its completion
+ * before freeing it and returning its status.
+ *
+ * Return: Returns the status of the request or any failure during setup.
+ */
+int ssam_request_sync(struct ssam_controller *ctrl,
+		      const struct ssam_request *spec,
+		      struct ssam_response *rsp)
+{
+	struct ssam_request_sync *rqst;
+	struct ssam_span buf;
+	ssize_t len;
+	int status;
+
+	status = ssam_request_sync_alloc(spec->length, GFP_KERNEL, &rqst, &buf);
+	if (status)
+		return status;
+
+	status = ssam_request_sync_init(rqst, spec->flags);
+	if (status)
+		return status;
+
+	ssam_request_sync_set_resp(rqst, rsp);
+
+	len = ssam_request_write_data(&buf, ctrl, spec);
+	if (len < 0) {
+		ssam_request_sync_free(rqst);
+		return len;
+	}
+
+	ssam_request_sync_set_data(rqst, buf.ptr, len);
+
+	status = ssam_request_sync_submit(ctrl, rqst);
+	if (!status)
+		status = ssam_request_sync_wait(rqst);
+
+	ssam_request_sync_free(rqst);
+	return status;
+}
+EXPORT_SYMBOL_GPL(ssam_request_sync);
+
+/**
+ * ssam_request_sync_with_buffer() - Execute a synchronous request with the
+ * provided buffer as back-end for the message buffer.
+ * @ctrl: The controller via which the request will be submitted.
+ * @spec: The request specification and payload.
+ * @rsp:  The response buffer.
+ * @buf:  The buffer for the request message data.
+ *
+ * Allocates a synchronous request struct on the stack, fully initializes it
+ * using the provided buffer as message data buffer, submits it, and then
+ * waits for its completion before returning its status. The
+ * SSH_COMMAND_MESSAGE_LENGTH() macro can be used to compute the required
+ * message buffer size.
+ *
+ * This function does essentially the same as ssam_request_sync(), but instead
+ * of dynamically allocating the request and message data buffer, it uses the
+ * provided message data buffer and stores the (small) request struct on the
+ * heap.
+ *
+ * Return: Returns the status of the request or any failure during setup.
+ */
+int ssam_request_sync_with_buffer(struct ssam_controller *ctrl,
+				  const struct ssam_request *spec,
+				  struct ssam_response *rsp,
+				  struct ssam_span *buf)
+{
+	struct ssam_request_sync rqst;
+	ssize_t len;
+	int status;
+
+	status = ssam_request_sync_init(&rqst, spec->flags);
+	if (status)
+		return status;
+
+	ssam_request_sync_set_resp(&rqst, rsp);
+
+	len = ssam_request_write_data(buf, ctrl, spec);
+	if (len < 0)
+		return len;
+
+	ssam_request_sync_set_data(&rqst, buf->ptr, len);
+
+	status = ssam_request_sync_submit(ctrl, &rqst);
+	if (!status)
+		status = ssam_request_sync_wait(&rqst);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(ssam_request_sync_with_buffer);
+
+
+/* -- Internal SAM requests. ------------------------------------------------ */
+
+static SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_get_firmware_version, __le32, {
+	.target_category = SSAM_SSH_TC_SAM,
+	.target_id       = 0x01,
+	.command_id      = 0x13,
+	.instance_id     = 0x00,
+});
+
+static SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_display_off, u8, {
+	.target_category = SSAM_SSH_TC_SAM,
+	.target_id       = 0x01,
+	.command_id      = 0x15,
+	.instance_id     = 0x00,
+});
+
+static SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_display_on, u8, {
+	.target_category = SSAM_SSH_TC_SAM,
+	.target_id       = 0x01,
+	.command_id      = 0x16,
+	.instance_id     = 0x00,
+});
+
+static SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_d0_exit, u8, {
+	.target_category = SSAM_SSH_TC_SAM,
+	.target_id       = 0x01,
+	.command_id      = 0x33,
+	.instance_id     = 0x00,
+});
+
+static SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_d0_entry, u8, {
+	.target_category = SSAM_SSH_TC_SAM,
+	.target_id       = 0x01,
+	.command_id      = 0x34,
+	.instance_id     = 0x00,
+});
+
+/**
+ * struct ssh_notification_params - Command payload to enable/disable SSH
+ * notifications.
+ * @target_category: The target category for which notifications should be
+ *                   enabled/disabled.
+ * @flags:           Flags determining how notifications are being sent.
+ * @request_id:      The request ID that is used to send these notifications.
+ * @instance_id:     The specific instance in the given target category for
+ *                   which notifications should be enabled.
+ */
+struct ssh_notification_params {
+	u8 target_category;
+	u8 flags;
+	__le16 request_id;
+	u8 instance_id;
+} __packed;
+
+static_assert(sizeof(struct ssh_notification_params) == 5);
+
+static int __ssam_ssh_event_request(struct ssam_controller *ctrl,
+				    struct ssam_event_registry reg, u8 cid,
+				    struct ssam_event_id id, u8 flags)
+{
+	struct ssh_notification_params params;
+	struct ssam_request rqst;
+	struct ssam_response result;
+	int status;
+
+	u16 rqid = ssh_tc_to_rqid(id.target_category);
+	u8 buf = 0;
+
+	/* Only allow RQIDs that lie within the event spectrum. */
+	if (!ssh_rqid_is_event(rqid))
+		return -EINVAL;
+
+	params.target_category = id.target_category;
+	params.instance_id = id.instance;
+	params.flags = flags;
+	put_unaligned_le16(rqid, &params.request_id);
+
+	rqst.target_category = reg.target_category;
+	rqst.target_id = reg.target_id;
+	rqst.command_id = cid;
+	rqst.instance_id = 0x00;
+	rqst.flags = SSAM_REQUEST_HAS_RESPONSE;
+	rqst.length = sizeof(params);
+	rqst.payload = (u8 *)&params;
+
+	result.capacity = sizeof(buf);
+	result.length = 0;
+	result.pointer = &buf;
+
+	status = ssam_retry(ssam_request_sync_onstack, ctrl, &rqst, &result,
+			    sizeof(params));
+
+	return status < 0 ? status : buf;
+}
+
+/**
+ * ssam_ssh_event_enable() - Enable SSH event.
+ * @ctrl:  The controller for which to enable the event.
+ * @reg:   The event registry describing what request to use for enabling and
+ *         disabling the event.
+ * @id:    The event identifier.
+ * @flags: The event flags.
+ *
+ * Enables the specified event on the EC. This function does not manage
+ * reference counting of enabled events and is basically only a wrapper for
+ * the raw EC request. If the specified event is already enabled, the EC will
+ * ignore this request.
+ *
+ * Return: Returns the status of the executed SAM request (zero on success and
+ * negative on direct failure) or %-EPROTO if the request response indicates a
+ * failure.
+ */
+static int ssam_ssh_event_enable(struct ssam_controller *ctrl,
+				 struct ssam_event_registry reg,
+				 struct ssam_event_id id, u8 flags)
+{
+	int status;
+
+	status = __ssam_ssh_event_request(ctrl, reg, reg.cid_enable, id, flags);
+
+	if (status < 0 && status != -EINVAL) {
+		ssam_err(ctrl,
+			 "failed to enable event source (tc: %#04x, iid: %#04x, reg: %#04x)\n",
+			 id.target_category, id.instance, reg.target_category);
+	}
+
+	if (status > 0) {
+		ssam_err(ctrl,
+			 "unexpected result while enabling event source: %#04x (tc: %#04x, iid: %#04x, reg: %#04x)\n",
+			 status, id.target_category, id.instance, reg.target_category);
+		return -EPROTO;
+	}
+
+	return status;
+}
+
+/**
+ * ssam_ssh_event_disable() - Disable SSH event.
+ * @ctrl:  The controller for which to disable the event.
+ * @reg:   The event registry describing what request to use for enabling and
+ *         disabling the event (must be same as used when enabling the event).
+ * @id:    The event identifier.
+ * @flags: The event flags (likely ignored for disabling of events).
+ *
+ * Disables the specified event on the EC. This function does not manage
+ * reference counting of enabled events and is basically only a wrapper for
+ * the raw EC request. If the specified event is already disabled, the EC will
+ * ignore this request.
+ *
+ * Return: Returns the status of the executed SAM request (zero on success and
+ * negative on direct failure) or %-EPROTO if the request response indicates a
+ * failure.
+ */
+static int ssam_ssh_event_disable(struct ssam_controller *ctrl,
+				  struct ssam_event_registry reg,
+				  struct ssam_event_id id, u8 flags)
+{
+	int status;
+
+	status = __ssam_ssh_event_request(ctrl, reg, reg.cid_enable, id, flags);
+
+	if (status < 0 && status != -EINVAL) {
+		ssam_err(ctrl,
+			 "failed to disable event source (tc: %#04x, iid: %#04x, reg: %#04x)\n",
+			 id.target_category, id.instance, reg.target_category);
+	}
+
+	if (status > 0) {
+		ssam_err(ctrl,
+			 "unexpected result while disabling event source: %#04x (tc: %#04x, iid: %#04x, reg: %#04x)\n",
+			 status, id.target_category, id.instance, reg.target_category);
+		return -EPROTO;
+	}
+
+	return status;
+}
+
+
+/* -- Wrappers for internal SAM requests. ----------------------------------- */
+
+/**
+ * ssam_get_firmware_version() - Get the SAM/EC firmware version.
+ * @ctrl:    The controller.
+ * @version: Where to store the version number.
+ *
+ * Return: Returns zero on success or the status of the executed SAM request
+ * if that request failed.
+ */
+int ssam_get_firmware_version(struct ssam_controller *ctrl, u32 *version)
+{
+	__le32 __version;
+	int status;
+
+	status = ssam_retry(ssam_ssh_get_firmware_version, ctrl, &__version);
+	if (status)
+		return status;
+
+	*version = le32_to_cpu(__version);
+	return 0;
+}
+
+/**
+ * ssam_ctrl_notif_display_off() - Notify EC that the display has been turned
+ * off.
+ * @ctrl: The controller.
+ *
+ * Notify the EC that the display has been turned off and the driver may enter
+ * a lower-power state. This will prevent events from being sent directly.
+ * Rather, the EC signals an event by pulling the wakeup GPIO high for as long
+ * as there are pending events. The events then need to be manually released,
+ * one by one, via the GPIO callback request. All pending events accumulated
+ * during this state can also be released by issuing the display-on
+ * notification, e.g. via ssam_ctrl_notif_display_on(), which will also reset
+ * the GPIO.
+ *
+ * On some devices, specifically ones with an integrated keyboard, the keyboard
+ * backlight will be turned off by this call.
+ *
+ * This function will only send the display-off notification command if
+ * display notifications are supported by the EC. Currently all known devices
+ * support these notifications.
+ *
+ * Use ssam_ctrl_notif_display_on() to reverse the effects of this function.
+ *
+ * Return: Returns zero on success or if no request has been executed, the
+ * status of the executed SAM request if that request failed, or %-EPROTO if
+ * an unexpected response has been received.
+ */
+int ssam_ctrl_notif_display_off(struct ssam_controller *ctrl)
+{
+	int status;
+	u8 response;
+
+	ssam_dbg(ctrl, "pm: notifying display off\n");
+
+	status = ssam_retry(ssam_ssh_notif_display_off, ctrl, &response);
+	if (status)
+		return status;
+
+	if (response != 0) {
+		ssam_err(ctrl, "unexpected response from display-off notification: %#04x\n",
+			 response);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+/**
+ * ssam_ctrl_notif_display_on() - Notify EC that the display has been turned on.
+ * @ctrl: The controller.
+ *
+ * Notify the EC that the display has been turned back on and the driver has
+ * exited its lower-power state. This notification is the counterpart to the
+ * display-off notification sent via ssam_ctrl_notif_display_off() and will
+ * reverse its effects, including resetting events to their default behavior.
+ *
+ * This function will only send the display-on notification command if display
+ * notifications are supported by the EC. Currently all known devices support
+ * these notifications.
+ *
+ * See ssam_ctrl_notif_display_off() for more details.
+ *
+ * Return: Returns zero on success or if no request has been executed, the
+ * status of the executed SAM request if that request failed, or %-EPROTO if
+ * an unexpected response has been received.
+ */
+int ssam_ctrl_notif_display_on(struct ssam_controller *ctrl)
+{
+	int status;
+	u8 response;
+
+	ssam_dbg(ctrl, "pm: notifying display on\n");
+
+	status = ssam_retry(ssam_ssh_notif_display_on, ctrl, &response);
+	if (status)
+		return status;
+
+	if (response != 0) {
+		ssam_err(ctrl, "unexpected response from display-on notification: %#04x\n",
+			 response);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+/**
+ * ssam_ctrl_notif_d0_exit() - Notify EC that the driver/device exits the D0
+ * power state.
+ * @ctrl: The controller
+ *
+ * Notifies the EC that the driver prepares to exit the D0 power state in
+ * favor of a lower-power state. Exact effects of this function related to the
+ * EC are currently unknown.
+ *
+ * This function will only send the D0-exit notification command if D0-state
+ * notifications are supported by the EC. Only newer Surface generations
+ * support these notifications.
+ *
+ * Use ssam_ctrl_notif_d0_entry() to reverse the effects of this function.
+ *
+ * Return: Returns zero on success or if no request has been executed, the
+ * status of the executed SAM request if that request failed, or %-EPROTO if
+ * an unexpected response has been received.
+ */
+int ssam_ctrl_notif_d0_exit(struct ssam_controller *ctrl)
+{
+	int status;
+	u8 response;
+
+	if (!ctrl->caps.d3_closes_handle)
+		return 0;
+
+	ssam_dbg(ctrl, "pm: notifying D0 exit\n");
+
+	status = ssam_retry(ssam_ssh_notif_d0_exit, ctrl, &response);
+	if (status)
+		return status;
+
+	if (response != 0) {
+		ssam_err(ctrl, "unexpected response from D0-exit notification: %#04x\n",
+			 response);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+/**
+ * ssam_ctrl_notif_d0_entry() - Notify EC that the driver/device enters the D0
+ * power state.
+ * @ctrl: The controller
+ *
+ * Notifies the EC that the driver has exited a lower-power state and entered
+ * the D0 power state. Exact effects of this function related to the EC are
+ * currently unknown.
+ *
+ * This function will only send the D0-entry notification command if D0-state
+ * notifications are supported by the EC. Only newer Surface generations
+ * support these notifications.
+ *
+ * See ssam_ctrl_notif_d0_exit() for more details.
+ *
+ * Return: Returns zero on success or if no request has been executed, the
+ * status of the executed SAM request if that request failed, or %-EPROTO if
+ * an unexpected response has been received.
+ */
+int ssam_ctrl_notif_d0_entry(struct ssam_controller *ctrl)
+{
+	int status;
+	u8 response;
+
+	if (!ctrl->caps.d3_closes_handle)
+		return 0;
+
+	ssam_dbg(ctrl, "pm: notifying D0 entry\n");
+
+	status = ssam_retry(ssam_ssh_notif_d0_entry, ctrl, &response);
+	if (status)
+		return status;
+
+	if (response != 0) {
+		ssam_err(ctrl, "unexpected response from D0-entry notification: %#04x\n",
+			 response);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+
+/* -- Top-level event registry interface. ----------------------------------- */
+
+/**
+ * ssam_notifier_register() - Register an event notifier.
+ * @ctrl: The controller to register the notifier on.
+ * @n:    The event notifier to register.
+ *
+ * Register an event notifier and increment the usage counter of the
+ * associated SAM event. If the event was previously not enabled, it will be
+ * enabled during this call.
+ *
+ * Return: Returns zero on success, %-ENOSPC if there have already been
+ * %INT_MAX notifiers for the event ID/type associated with the notifier block
+ * registered, %-ENOMEM if the corresponding event entry could not be
+ * allocated. If this is the first time that a notifier block is registered
+ * for the specific associated event, returns the status of the event-enable
+ * EC-command.
+ */
+int ssam_notifier_register(struct ssam_controller *ctrl,
+			   struct ssam_event_notifier *n)
+{
+	u16 rqid = ssh_tc_to_rqid(n->event.id.target_category);
+	struct ssam_nf_refcount_entry *entry;
+	struct ssam_nf_head *nf_head;
+	struct ssam_nf *nf;
+	int status;
+
+	if (!ssh_rqid_is_event(rqid))
+		return -EINVAL;
+
+	nf = &ctrl->cplt.event.notif;
+	nf_head = &nf->head[ssh_rqid_to_event(rqid)];
+
+	mutex_lock(&nf->lock);
+
+	entry = ssam_nf_refcount_inc(nf, n->event.reg, n->event.id);
+	if (IS_ERR(entry)) {
+		mutex_unlock(&nf->lock);
+		return PTR_ERR(entry);
+	}
+
+	ssam_dbg(ctrl, "enabling event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
+		 n->event.reg.target_category, n->event.id.target_category,
+		 n->event.id.instance, entry->refcount);
+
+	status = ssam_nfblk_insert(nf_head, &n->base);
+	if (status) {
+		entry = ssam_nf_refcount_dec(nf, n->event.reg, n->event.id);
+		if (entry->refcount == 0)
+			kfree(entry);
+
+		mutex_unlock(&nf->lock);
+		return status;
+	}
+
+	if (entry->refcount == 1) {
+		status = ssam_ssh_event_enable(ctrl, n->event.reg, n->event.id,
+					       n->event.flags);
+		if (status) {
+			ssam_nfblk_remove(&n->base);
+			kfree(ssam_nf_refcount_dec(nf, n->event.reg, n->event.id));
+			mutex_unlock(&nf->lock);
+			synchronize_srcu(&nf_head->srcu);
+			return status;
+		}
+
+		entry->flags = n->event.flags;
+
+	} else if (entry->flags != n->event.flags) {
+		ssam_warn(ctrl,
+			  "inconsistent flags when enabling event: got %#04x, expected %#04x (reg: %#04x, tc: %#04x, iid: %#04x)\n",
+			  n->event.flags, entry->flags, n->event.reg.target_category,
+			  n->event.id.target_category, n->event.id.instance);
+	}
+
+	mutex_unlock(&nf->lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ssam_notifier_register);
+
+/**
+ * ssam_notifier_unregister() - Unregister an event notifier.
+ * @ctrl: The controller the notifier has been registered on.
+ * @n:    The event notifier to unregister.
+ *
+ * Unregister an event notifier and decrement the usage counter of the
+ * associated SAM event. If the usage counter reaches zero, the event will be
+ * disabled.
+ *
+ * Return: Returns zero on success, %-ENOENT if the given notifier block has
+ * not been registered on the controller. If the given notifier block was the
+ * last one associated with its specific event, returns the status of the
+ * event-disable EC-command.
+ */
+int ssam_notifier_unregister(struct ssam_controller *ctrl,
+			     struct ssam_event_notifier *n)
+{
+	u16 rqid = ssh_tc_to_rqid(n->event.id.target_category);
+	struct ssam_nf_refcount_entry *entry;
+	struct ssam_nf_head *nf_head;
+	struct ssam_nf *nf;
+	int status = 0;
+
+	if (!ssh_rqid_is_event(rqid))
+		return -EINVAL;
+
+	nf = &ctrl->cplt.event.notif;
+	nf_head = &nf->head[ssh_rqid_to_event(rqid)];
+
+	mutex_lock(&nf->lock);
+
+	if (!ssam_nfblk_find(nf_head, &n->base)) {
+		mutex_unlock(&nf->lock);
+		return -ENOENT;
+	}
+
+	entry = ssam_nf_refcount_dec(nf, n->event.reg, n->event.id);
+	if (WARN_ON(!entry)) {
+		/*
+		 * If this does not return an entry, there's a logic error
+		 * somewhere: The notifier block is registered, but the event
+		 * refcount entry is not there. Remove the notifier block
+		 * anyways.
+		 */
+		status = -ENOENT;
+		goto remove;
+	}
+
+	ssam_dbg(ctrl, "disabling event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
+		 n->event.reg.target_category, n->event.id.target_category,
+		 n->event.id.instance, entry->refcount);
+
+	if (entry->flags != n->event.flags) {
+		ssam_warn(ctrl,
+			  "inconsistent flags when disabling event: got %#04x, expected %#04x (reg: %#04x, tc: %#04x, iid: %#04x)\n",
+			  n->event.flags, entry->flags, n->event.reg.target_category,
+			  n->event.id.target_category, n->event.id.instance);
+	}
+
+	if (entry->refcount == 0) {
+		status = ssam_ssh_event_disable(ctrl, n->event.reg, n->event.id,
+						n->event.flags);
+		kfree(entry);
+	}
+
+remove:
+	ssam_nfblk_remove(&n->base);
+	mutex_unlock(&nf->lock);
+	synchronize_srcu(&nf_head->srcu);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(ssam_notifier_unregister);
+
+/**
+ * ssam_notifier_disable_registered() - Disable events for all registered
+ * notifiers.
+ * @ctrl: The controller for which to disable the notifiers/events.
+ *
+ * Disables events for all currently registered notifiers. In case of an error
+ * (EC command failing), all previously disabled events will be restored and
+ * the error code returned.
+ *
+ * This function is intended to disable all events prior to hibernation entry.
+ * See ssam_notifier_restore_registered() to restore/re-enable all events
+ * disabled with this function.
+ *
+ * Note that this function will not disable events for notifiers registered
+ * after calling this function. It should thus be made sure that no new
+ * notifiers are going to be added after this call and before the corresponding
+ * call to ssam_notifier_restore_registered().
+ *
+ * Return: Returns zero on success. In case of failure returns the error code
+ * returned by the failed EC command to disable an event.
+ */
+int ssam_notifier_disable_registered(struct ssam_controller *ctrl)
+{
+	struct ssam_nf *nf = &ctrl->cplt.event.notif;
+	struct rb_node *n;
+	int status;
+
+	mutex_lock(&nf->lock);
+	for (n = rb_first(&nf->refcount); n; n = rb_next(n)) {
+		struct ssam_nf_refcount_entry *e;
+
+		e = rb_entry(n, struct ssam_nf_refcount_entry, node);
+		status = ssam_ssh_event_disable(ctrl, e->key.reg,
+						e->key.id, e->flags);
+		if (status)
+			goto err;
+	}
+	mutex_unlock(&nf->lock);
+
+	return 0;
+
+err:
+	for (n = rb_prev(n); n; n = rb_prev(n)) {
+		struct ssam_nf_refcount_entry *e;
+
+		e = rb_entry(n, struct ssam_nf_refcount_entry, node);
+		ssam_ssh_event_enable(ctrl, e->key.reg, e->key.id, e->flags);
+	}
+	mutex_unlock(&nf->lock);
+
+	return status;
+}
+
+/**
+ * ssam_notifier_restore_registered() - Restore/re-enable events for all
+ * registered notifiers.
+ * @ctrl: The controller for which to restore the notifiers/events.
+ *
+ * Restores/re-enables all events for which notifiers have been registered on
+ * the given controller. In case of a failure, the error is logged and the
+ * function continues to try and enable the remaining events.
+ *
+ * This function is intended to restore/re-enable all registered events after
+ * hibernation. See ssam_notifier_disable_registered() for the counter part
+ * disabling the events and more details.
+ */
+void ssam_notifier_restore_registered(struct ssam_controller *ctrl)
+{
+	struct ssam_nf *nf = &ctrl->cplt.event.notif;
+	struct rb_node *n;
+
+	mutex_lock(&nf->lock);
+	for (n = rb_first(&nf->refcount); n; n = rb_next(n)) {
+		struct ssam_nf_refcount_entry *e;
+
+		e = rb_entry(n, struct ssam_nf_refcount_entry, node);
+
+		/* Ignore errors, will get logged in call. */
+		ssam_ssh_event_enable(ctrl, e->key.reg, e->key.id, e->flags);
+	}
+	mutex_unlock(&nf->lock);
+}
+
+/**
+ * ssam_notifier_is_empty() - Check if there are any registered notifiers.
+ * @ctrl: The controller to check on.
+ *
+ * Return: Returns %true if there are currently no notifiers registered on the
+ * controller, %false otherwise.
+ */
+static bool ssam_notifier_is_empty(struct ssam_controller *ctrl)
+{
+	struct ssam_nf *nf = &ctrl->cplt.event.notif;
+	bool result;
+
+	mutex_lock(&nf->lock);
+	result = ssam_nf_refcount_empty(nf);
+	mutex_unlock(&nf->lock);
+
+	return result;
+}
+
+/**
+ * ssam_notifier_unregister_all() - Unregister all currently registered
+ * notifiers.
+ * @ctrl: The controller to unregister the notifiers on.
+ *
+ * Unregisters all currently registered notifiers. This function is used to
+ * ensure that all notifiers will be unregistered and associated
+ * entries/resources freed when the controller is being shut down.
+ */
+static void ssam_notifier_unregister_all(struct ssam_controller *ctrl)
+{
+	struct ssam_nf *nf = &ctrl->cplt.event.notif;
+	struct ssam_nf_refcount_entry *e, *n;
+
+	mutex_lock(&nf->lock);
+	rbtree_postorder_for_each_entry_safe(e, n, &nf->refcount, node) {
+		/* Ignore errors, will get logged in call. */
+		ssam_ssh_event_disable(ctrl, e->key.reg, e->key.id, e->flags);
+		kfree(e);
+	}
+	nf->refcount = RB_ROOT;
+	mutex_unlock(&nf->lock);
+}
+
+
+/* -- Wakeup IRQ. ----------------------------------------------------------- */
+
+static irqreturn_t ssam_irq_handle(int irq, void *dev_id)
+{
+	struct ssam_controller *ctrl = dev_id;
+
+	ssam_dbg(ctrl, "pm: wake irq triggered\n");
+
+	/*
+	 * Note: Proper wakeup detection is currently unimplemented.
+	 *       When the EC is in display-off or any other non-D0 state, it
+	 *       does not send events/notifications to the host. Instead it
+	 *       signals that there are events available via the wakeup IRQ.
+	 *       This driver is responsible for calling back to the EC to
+	 *       release these events one-by-one.
+	 *
+	 *       This IRQ should not cause a full system resume by its own.
+	 *       Instead, events should be handled by their respective subsystem
+	 *       drivers, which in turn should signal whether a full system
+	 *       resume should be performed.
+	 *
+	 * TODO: Send GPIO callback command repeatedly to EC until callback
+	 *       returns 0x00. Return flag of callback is "has more events".
+	 *       Each time the command is sent, one event is "released". Once
+	 *       all events have been released (return = 0x00), the GPIO is
+	 *       re-armed. Detect wakeup events during this process, go back to
+	 *       sleep if no wakeup event has been received.
+	 */
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * ssam_irq_setup() - Set up SAM EC wakeup-GPIO interrupt.
+ * @ctrl: The controller for which the IRQ should be set up.
+ *
+ * Set up an IRQ for the wakeup-GPIO pin of the SAM EC. This IRQ can be used
+ * to wake the device from a low power state.
+ *
+ * Note that this IRQ can only be triggered while the EC is in the display-off
+ * state. In this state, events are not sent to the host in the usual way.
+ * Instead the wakeup-GPIO gets pulled to "high" as long as there are pending
+ * events and these events need to be released one-by-one via the GPIO
+ * callback request, either until there are no events left and the GPIO is
+ * reset, or all at once by transitioning the EC out of the display-off state,
+ * which will also clear the GPIO.
+ *
+ * Not all events, however, should trigger a full system wakeup. Instead the
+ * driver should, if necessary, inspect and forward each event to the
+ * corresponding subsystem, which in turn should decide if the system needs to
+ * be woken up. This logic has not been implemented yet, thus wakeup by this
+ * IRQ should be disabled by default to avoid spurious wake-ups, caused, for
+ * example, by the remaining battery percentage changing. Refer to comments in
+ * this function and comments in the corresponding IRQ handler for more
+ * details on how this should be implemented.
+ *
+ * See also ssam_ctrl_notif_display_off() and ssam_ctrl_notif_display_off()
+ * for functions to transition the EC into and out of the display-off state as
+ * well as more details on it.
+ *
+ * The IRQ is disabled by default and has to be enabled before it can wake up
+ * the device from suspend via ssam_irq_arm_for_wakeup(). On teardown, the IRQ
+ * should be freed via ssam_irq_free().
+ */
+int ssam_irq_setup(struct ssam_controller *ctrl)
+{
+	struct device *dev = ssam_controller_device(ctrl);
+	struct gpio_desc *gpiod;
+	int irq;
+	int status;
+
+	/*
+	 * The actual GPIO interrupt is declared in ACPI as TRIGGER_HIGH.
+	 * However, the GPIO line only gets reset by sending the GPIO callback
+	 * command to SAM (or alternatively the display-on notification). As
+	 * proper handling for this interrupt is not implemented yet, leaving
+	 * the IRQ at TRIGGER_HIGH would cause an IRQ storm (as the callback
+	 * never gets sent and thus the line never gets reset). To avoid this,
+	 * mark the IRQ as TRIGGER_RISING for now, only creating a single
+	 * interrupt, and let the SAM resume callback during the controller
+	 * resume process clear it.
+	 */
+	const int irqf = IRQF_SHARED | IRQF_ONESHOT | IRQF_TRIGGER_RISING;
+
+	gpiod = gpiod_get(dev, "ssam_wakeup-int", GPIOD_ASIS);
+	if (IS_ERR(gpiod))
+		return PTR_ERR(gpiod);
+
+	irq = gpiod_to_irq(gpiod);
+	gpiod_put(gpiod);
+
+	if (irq < 0)
+		return irq;
+
+	status = request_threaded_irq(irq, NULL, ssam_irq_handle, irqf,
+				      "ssam_wakeup", ctrl);
+	if (status)
+		return status;
+
+	ctrl->irq.num = irq;
+	disable_irq(ctrl->irq.num);
+	return 0;
+}
+
+/**
+ * ssam_irq_free() - Free SAM EC wakeup-GPIO interrupt.
+ * @ctrl: The controller for which the IRQ should be freed.
+ *
+ * Free the wakeup-GPIO IRQ previously set-up via ssam_irq_setup().
+ */
+void ssam_irq_free(struct ssam_controller *ctrl)
+{
+	free_irq(ctrl->irq.num, ctrl);
+	ctrl->irq.num = -1;
+}
+
+/**
+ * ssam_irq_arm_for_wakeup() - Arm the EC IRQ for wakeup, if enabled.
+ * @ctrl: The controller for which the IRQ should be armed.
+ *
+ * Sets up the IRQ so that it can be used to wake the device. Specifically,
+ * this function enables the irq and then, if the device is allowed to wake up
+ * the system, calls enable_irq_wake(). See ssam_irq_disarm_wakeup() for the
+ * corresponding function to disable the IRQ.
+ *
+ * This function is intended to arm the IRQ before entering S2idle suspend.
+ *
+ * Note: calls to ssam_irq_arm_for_wakeup() and ssam_irq_disarm_wakeup() must
+ * be balanced.
+ */
+int ssam_irq_arm_for_wakeup(struct ssam_controller *ctrl)
+{
+	struct device *dev = ssam_controller_device(ctrl);
+	int status;
+
+	enable_irq(ctrl->irq.num);
+	if (device_may_wakeup(dev)) {
+		status = enable_irq_wake(ctrl->irq.num);
+		if (status) {
+			ssam_err(ctrl, "failed to enable wake IRQ: %d\n", status);
+			disable_irq(ctrl->irq.num);
+			return status;
+		}
+
+		ctrl->irq.wakeup_enabled = true;
+	} else {
+		ctrl->irq.wakeup_enabled = false;
+	}
+
+	return 0;
+}
+
+/**
+ * ssam_irq_disarm_wakeup() - Disarm the wakeup IRQ.
+ * @ctrl: The controller for which the IRQ should be disarmed.
+ *
+ * Disarm the IRQ previously set up for wake via ssam_irq_arm_for_wakeup().
+ *
+ * This function is intended to disarm the IRQ after exiting S2idle suspend.
+ *
+ * Note: calls to ssam_irq_arm_for_wakeup() and ssam_irq_disarm_wakeup() must
+ * be balanced.
+ */
+void ssam_irq_disarm_wakeup(struct ssam_controller *ctrl)
+{
+	int status;
+
+	if (ctrl->irq.wakeup_enabled) {
+		status = disable_irq_wake(ctrl->irq.num);
+		if (status)
+			ssam_err(ctrl, "failed to disable wake IRQ: %d\n", status);
+
+		ctrl->irq.wakeup_enabled = false;
+	}
+	disable_irq(ctrl->irq.num);
+}
diff --git a/drivers/platform/surface/aggregator/controller.h b/drivers/platform/surface/aggregator/controller.h
new file mode 100644
index 000000000000..5ee9e966f1d7
--- /dev/null
+++ b/drivers/platform/surface/aggregator/controller.h
@@ -0,0 +1,276 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Main SSAM/SSH controller structure and functionality.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _SURFACE_AGGREGATOR_CONTROLLER_H
+#define _SURFACE_AGGREGATOR_CONTROLLER_H
+
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/serdev.h>
+#include <linux/spinlock.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include <linux/surface_aggregator/controller.h>
+#include <linux/surface_aggregator/serial_hub.h>
+
+#include "ssh_request_layer.h"
+
+
+/* -- Safe counters. -------------------------------------------------------- */
+
+/**
+ * struct ssh_seq_counter - Safe counter for SSH sequence IDs.
+ * @value: The current counter value.
+ */
+struct ssh_seq_counter {
+	u8 value;
+};
+
+/**
+ * struct ssh_rqid_counter - Safe counter for SSH request IDs.
+ * @value: The current counter value.
+ */
+struct ssh_rqid_counter {
+	u16 value;
+};
+
+
+/* -- Event/notification system. -------------------------------------------- */
+
+/**
+ * struct ssam_nf_head - Notifier head for SSAM events.
+ * @srcu: The SRCU struct for synchronization.
+ * @head: List-head for notifier blocks registered under this head.
+ */
+struct ssam_nf_head {
+	struct srcu_struct srcu;
+	struct list_head head;
+};
+
+/**
+ * struct ssam_nf - Notifier callback- and activation-registry for SSAM events.
+ * @lock:     Lock guarding (de-)registration of notifier blocks. Note: This
+ *            lock does not need to be held for notifier calls, only
+ *            registration and deregistration.
+ * @refcount: The root of the RB-tree used for reference-counting enabled
+ *            events/notifications.
+ * @head:     The list of notifier heads for event/notification callbacks.
+ */
+struct ssam_nf {
+	struct mutex lock;
+	struct rb_root refcount;
+	struct ssam_nf_head head[SSH_NUM_EVENTS];
+};
+
+
+/* -- Event/async request completion system. -------------------------------- */
+
+struct ssam_cplt;
+
+/**
+ * struct ssam_event_item - Struct for event queuing and completion.
+ * @node:     The node in the queue.
+ * @rqid:     The request ID of the event.
+ * @event:    Actual event data.
+ */
+struct ssam_event_item {
+	struct list_head node;
+	u16 rqid;
+
+	struct ssam_event event;	/* must be last */
+};
+
+/**
+ * struct ssam_event_queue - Queue for completing received events.
+ * @cplt: Reference to the completion system on which this queue is active.
+ * @lock: The lock for any operation on the queue.
+ * @head: The list-head of the queue.
+ * @work: The &struct work_struct performing completion work for this queue.
+ */
+struct ssam_event_queue {
+	struct ssam_cplt *cplt;
+
+	spinlock_t lock;
+	struct list_head head;
+	struct work_struct work;
+};
+
+/**
+ * struct ssam_event_target - Set of queues for a single SSH target ID.
+ * @queue: The array of queues, one queue per event ID.
+ */
+struct ssam_event_target {
+	struct ssam_event_queue queue[SSH_NUM_EVENTS];
+};
+
+/**
+ * struct ssam_cplt - SSAM event/async request completion system.
+ * @dev:          The device with which this system is associated. Only used
+ *                for logging.
+ * @wq:           The &struct workqueue_struct on which all completion work
+ *                items are queued.
+ * @event:        Event completion management.
+ * @event.target: Array of &struct ssam_event_target, one for each target.
+ * @event.notif:  Notifier callbacks and event activation reference counting.
+ */
+struct ssam_cplt {
+	struct device *dev;
+	struct workqueue_struct *wq;
+
+	struct {
+		struct ssam_event_target target[SSH_NUM_TARGETS];
+		struct ssam_nf notif;
+	} event;
+};
+
+
+/* -- Main SSAM device structures. ------------------------------------------ */
+
+/**
+ * enum ssam_controller_state - State values for &struct ssam_controller.
+ * @SSAM_CONTROLLER_UNINITIALIZED:
+ *	The controller has not been initialized yet or has been deinitialized.
+ * @SSAM_CONTROLLER_INITIALIZED:
+ *	The controller is initialized, but has not been started yet.
+ * @SSAM_CONTROLLER_STARTED:
+ *	The controller has been started and is ready to use.
+ * @SSAM_CONTROLLER_STOPPED:
+ *	The controller has been stopped.
+ * @SSAM_CONTROLLER_SUSPENDED:
+ *	The controller has been suspended.
+ */
+enum ssam_controller_state {
+	SSAM_CONTROLLER_UNINITIALIZED,
+	SSAM_CONTROLLER_INITIALIZED,
+	SSAM_CONTROLLER_STARTED,
+	SSAM_CONTROLLER_STOPPED,
+	SSAM_CONTROLLER_SUSPENDED,
+};
+
+/**
+ * struct ssam_controller_caps - Controller device capabilities.
+ * @ssh_power_profile:             SSH power profile.
+ * @ssh_buffer_size:               SSH driver UART buffer size.
+ * @screen_on_sleep_idle_timeout:  SAM UART screen-on sleep idle timeout.
+ * @screen_off_sleep_idle_timeout: SAM UART screen-off sleep idle timeout.
+ * @d3_closes_handle:              SAM closes UART handle in D3.
+ *
+ * Controller and SSH device capabilities found in ACPI.
+ */
+struct ssam_controller_caps {
+	u32 ssh_power_profile;
+	u32 ssh_buffer_size;
+	u32 screen_on_sleep_idle_timeout;
+	u32 screen_off_sleep_idle_timeout;
+	u32 d3_closes_handle:1;
+};
+
+/**
+ * struct ssam_controller - SSAM controller device.
+ * @kref:  Reference count of the controller.
+ * @lock:  Main lock for the controller, used to guard state changes.
+ * @state: Controller state.
+ * @rtl:   Request transport layer for SSH I/O.
+ * @cplt:  Completion system for SSH/SSAM events and asynchronous requests.
+ * @counter:      Safe SSH message ID counters.
+ * @counter.seq:  Sequence ID counter.
+ * @counter.rqid: Request ID counter.
+ * @irq:          Wakeup IRQ resources.
+ * @irq.num:      The wakeup IRQ number.
+ * @irq.wakeup_enabled: Whether wakeup by IRQ is enabled during suspend.
+ * @caps: The controller device capabilities.
+ */
+struct ssam_controller {
+	struct kref kref;
+
+	struct rw_semaphore lock;
+	enum ssam_controller_state state;
+
+	struct ssh_rtl rtl;
+	struct ssam_cplt cplt;
+
+	struct {
+		struct ssh_seq_counter seq;
+		struct ssh_rqid_counter rqid;
+	} counter;
+
+	struct {
+		int num;
+		bool wakeup_enabled;
+	} irq;
+
+	struct ssam_controller_caps caps;
+};
+
+#define to_ssam_controller(ptr, member) \
+	container_of(ptr, struct ssam_controller, member)
+
+#define ssam_dbg(ctrl, fmt, ...)  rtl_dbg(&(ctrl)->rtl, fmt, ##__VA_ARGS__)
+#define ssam_info(ctrl, fmt, ...) rtl_info(&(ctrl)->rtl, fmt, ##__VA_ARGS__)
+#define ssam_warn(ctrl, fmt, ...) rtl_warn(&(ctrl)->rtl, fmt, ##__VA_ARGS__)
+#define ssam_err(ctrl, fmt, ...)  rtl_err(&(ctrl)->rtl, fmt, ##__VA_ARGS__)
+
+/**
+ * ssam_controller_receive_buf() - Provide input-data to the controller.
+ * @ctrl: The controller.
+ * @buf:  The input buffer.
+ * @n:    The number of bytes in the input buffer.
+ *
+ * Provide input data to be evaluated by the controller, which has been
+ * received via the lower-level transport.
+ *
+ * Return: Returns the number of bytes consumed, or, if the packet transport
+ * layer of the controller has been shut down, %-ESHUTDOWN.
+ */
+static inline
+int ssam_controller_receive_buf(struct ssam_controller *ctrl,
+				const unsigned char *buf, size_t n)
+{
+	return ssh_ptl_rx_rcvbuf(&ctrl->rtl.ptl, buf, n);
+}
+
+/**
+ * ssam_controller_write_wakeup() - Notify the controller that the underlying
+ * device has space available for data to be written.
+ * @ctrl: The controller.
+ */
+static inline void ssam_controller_write_wakeup(struct ssam_controller *ctrl)
+{
+	ssh_ptl_tx_wakeup_transfer(&ctrl->rtl.ptl);
+}
+
+int ssam_controller_init(struct ssam_controller *ctrl, struct serdev_device *s);
+int ssam_controller_start(struct ssam_controller *ctrl);
+void ssam_controller_shutdown(struct ssam_controller *ctrl);
+void ssam_controller_destroy(struct ssam_controller *ctrl);
+
+int ssam_notifier_disable_registered(struct ssam_controller *ctrl);
+void ssam_notifier_restore_registered(struct ssam_controller *ctrl);
+
+int ssam_irq_setup(struct ssam_controller *ctrl);
+void ssam_irq_free(struct ssam_controller *ctrl);
+int ssam_irq_arm_for_wakeup(struct ssam_controller *ctrl);
+void ssam_irq_disarm_wakeup(struct ssam_controller *ctrl);
+
+void ssam_controller_lock(struct ssam_controller *c);
+void ssam_controller_unlock(struct ssam_controller *c);
+
+int ssam_get_firmware_version(struct ssam_controller *ctrl, u32 *version);
+int ssam_ctrl_notif_display_off(struct ssam_controller *ctrl);
+int ssam_ctrl_notif_display_on(struct ssam_controller *ctrl);
+int ssam_ctrl_notif_d0_exit(struct ssam_controller *ctrl);
+int ssam_ctrl_notif_d0_entry(struct ssam_controller *ctrl);
+
+int ssam_controller_suspend(struct ssam_controller *ctrl);
+int ssam_controller_resume(struct ssam_controller *ctrl);
+
+#endif /* _SURFACE_AGGREGATOR_CONTROLLER_H */
diff --git a/drivers/platform/surface/aggregator/core.c b/drivers/platform/surface/aggregator/core.c
new file mode 100644
index 000000000000..18e0e9e34e7b
--- /dev/null
+++ b/drivers/platform/surface/aggregator/core.c
@@ -0,0 +1,787 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Surface Serial Hub (SSH) driver for communication with the Surface/System
+ * Aggregator Module (SSAM/SAM).
+ *
+ * Provides access to a SAM-over-SSH connected EC via a controller device.
+ * Handles communication via requests as well as enabling, disabling, and
+ * relaying of events.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#include <linux/acpi.h>
+#include <linux/atomic.h>
+#include <linux/completion.h>
+#include <linux/gpio/consumer.h>
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/module.h>
+#include <linux/pm.h>
+#include <linux/serdev.h>
+#include <linux/sysfs.h>
+
+#include <linux/surface_aggregator/controller.h>
+#include "controller.h"
+
+
+/* -- Static controller reference. ------------------------------------------ */
+
+/*
+ * Main controller reference. The corresponding lock must be held while
+ * accessing (reading/writing) the reference.
+ */
+static struct ssam_controller *__ssam_controller;
+static DEFINE_SPINLOCK(__ssam_controller_lock);
+
+/**
+ * ssam_get_controller() - Get reference to SSAM controller.
+ *
+ * Returns a reference to the SSAM controller of the system or %NULL if there
+ * is none, it hasn't been set up yet, or it has already been unregistered.
+ * This function automatically increments the reference count of the
+ * controller, thus the calling party must ensure that ssam_controller_put()
+ * is called when it doesn't need the controller any more.
+ */
+struct ssam_controller *ssam_get_controller(void)
+{
+	struct ssam_controller *ctrl;
+
+	spin_lock(&__ssam_controller_lock);
+
+	ctrl = __ssam_controller;
+	if (!ctrl)
+		goto out;
+
+	if (WARN_ON(!kref_get_unless_zero(&ctrl->kref)))
+		ctrl = NULL;
+
+out:
+	spin_unlock(&__ssam_controller_lock);
+	return ctrl;
+}
+EXPORT_SYMBOL_GPL(ssam_get_controller);
+
+/**
+ * ssam_try_set_controller() - Try to set the main controller reference.
+ * @ctrl: The controller to which the reference should point.
+ *
+ * Set the main controller reference to the given pointer if the reference
+ * hasn't been set already.
+ *
+ * Return: Returns zero on success or %-EEXIST if the reference has already
+ * been set.
+ */
+static int ssam_try_set_controller(struct ssam_controller *ctrl)
+{
+	int status = 0;
+
+	spin_lock(&__ssam_controller_lock);
+	if (!__ssam_controller)
+		__ssam_controller = ctrl;
+	else
+		status = -EEXIST;
+	spin_unlock(&__ssam_controller_lock);
+
+	return status;
+}
+
+/**
+ * ssam_clear_controller() - Remove/clear the main controller reference.
+ *
+ * Clears the main controller reference, i.e. sets it to %NULL. This function
+ * should be called before the controller is shut down.
+ */
+static void ssam_clear_controller(void)
+{
+	spin_lock(&__ssam_controller_lock);
+	__ssam_controller = NULL;
+	spin_unlock(&__ssam_controller_lock);
+}
+
+/**
+ * ssam_client_link() - Link an arbitrary client device to the controller.
+ * @c: The controller to link to.
+ * @client: The client device.
+ *
+ * Link an arbitrary client device to the controller by creating a device link
+ * between it as consumer and the controller device as provider. This function
+ * can be used for non-SSAM devices (or SSAM devices not registered as child
+ * under the controller) to guarantee that the controller is valid for as long
+ * as the driver of the client device is bound, and that proper suspend and
+ * resume ordering is guaranteed.
+ *
+ * The device link does not have to be destructed manually. It is removed
+ * automatically once the driver of the client device unbinds.
+ *
+ * Return: Returns zero on success, %-ENODEV if the controller is not ready or
+ * going to be removed soon, or %-ENOMEM if the device link could not be
+ * created for other reasons.
+ */
+int ssam_client_link(struct ssam_controller *c, struct device *client)
+{
+	const u32 flags = DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_CONSUMER;
+	struct device_link *link;
+	struct device *ctrldev;
+
+	ssam_controller_statelock(c);
+
+	if (c->state != SSAM_CONTROLLER_STARTED) {
+		ssam_controller_stateunlock(c);
+		return -ENODEV;
+	}
+
+	ctrldev = ssam_controller_device(c);
+	if (!ctrldev) {
+		ssam_controller_stateunlock(c);
+		return -ENODEV;
+	}
+
+	link = device_link_add(client, ctrldev, flags);
+	if (!link) {
+		ssam_controller_stateunlock(c);
+		return -ENOMEM;
+	}
+
+	/*
+	 * Return -ENODEV if supplier driver is on its way to be removed. In
+	 * this case, the controller won't be around for much longer and the
+	 * device link is not going to save us any more, as unbinding is
+	 * already in progress.
+	 */
+	if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND) {
+		ssam_controller_stateunlock(c);
+		return -ENODEV;
+	}
+
+	ssam_controller_stateunlock(c);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ssam_client_link);
+
+/**
+ * ssam_client_bind() - Bind an arbitrary client device to the controller.
+ * @client: The client device.
+ *
+ * Link an arbitrary client device to the controller by creating a device link
+ * between it as consumer and the main controller device as provider. This
+ * function can be used for non-SSAM devices to guarantee that the controller
+ * returned by this function is valid for as long as the driver of the client
+ * device is bound, and that proper suspend and resume ordering is guaranteed.
+ *
+ * This function does essentially the same as ssam_client_link(), except that
+ * it first fetches the main controller reference, then creates the link, and
+ * finally returns this reference. Note that this function does not increment
+ * the reference counter of the controller, as, due to the link, the
+ * controller lifetime is assured as long as the driver of the client device
+ * is bound.
+ *
+ * It is not valid to use the controller reference obtained by this method
+ * outside of the driver bound to the client device at the time of calling
+ * this function, without first incrementing the reference count of the
+ * controller via ssam_controller_get(). Even after doing this, care must be
+ * taken that requests are only submitted and notifiers are only
+ * (un-)registered when the controller is active and not suspended. In other
+ * words: The device link only lives as long as the client driver is bound and
+ * any guarantees enforced by this link (e.g. active controller state) can
+ * only be relied upon as long as this link exists and may need to be enforced
+ * in other ways afterwards.
+ *
+ * The created device link does not have to be destructed manually. It is
+ * removed automatically once the driver of the client device unbinds.
+ *
+ * Return: Returns the controller on success, an error pointer with %-ENODEV
+ * if the controller is not present, not ready or going to be removed soon, or
+ * %-ENOMEM if the device link could not be created for other reasons.
+ */
+struct ssam_controller *ssam_client_bind(struct device *client)
+{
+	struct ssam_controller *c;
+	int status;
+
+	c = ssam_get_controller();
+	if (!c)
+		return ERR_PTR(-ENODEV);
+
+	status = ssam_client_link(c, client);
+
+	/*
+	 * Note that we can drop our controller reference in both success and
+	 * failure cases: On success, we have bound the controller lifetime
+	 * inherently to the client driver lifetime, i.e. it the controller is
+	 * now guaranteed to outlive the client driver. On failure, we're not
+	 * going to use the controller any more.
+	 */
+	ssam_controller_put(c);
+
+	return status >= 0 ? c : ERR_PTR(status);
+}
+EXPORT_SYMBOL_GPL(ssam_client_bind);
+
+
+/* -- Glue layer (serdev_device -> ssam_controller). ------------------------ */
+
+static int ssam_receive_buf(struct serdev_device *dev, const unsigned char *buf,
+			    size_t n)
+{
+	struct ssam_controller *ctrl;
+
+	ctrl = serdev_device_get_drvdata(dev);
+	return ssam_controller_receive_buf(ctrl, buf, n);
+}
+
+static void ssam_write_wakeup(struct serdev_device *dev)
+{
+	ssam_controller_write_wakeup(serdev_device_get_drvdata(dev));
+}
+
+static const struct serdev_device_ops ssam_serdev_ops = {
+	.receive_buf = ssam_receive_buf,
+	.write_wakeup = ssam_write_wakeup,
+};
+
+
+/* -- SysFS and misc. ------------------------------------------------------- */
+
+static int ssam_log_firmware_version(struct ssam_controller *ctrl)
+{
+	u32 version, a, b, c;
+	int status;
+
+	status = ssam_get_firmware_version(ctrl, &version);
+	if (status)
+		return status;
+
+	a = (version >> 24) & 0xff;
+	b = ((version >> 8) & 0xffff);
+	c = version & 0xff;
+
+	ssam_info(ctrl, "SAM firmware version: %u.%u.%u\n", a, b, c);
+	return 0;
+}
+
+static ssize_t firmware_version_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct ssam_controller *ctrl = dev_get_drvdata(dev);
+	u32 version, a, b, c;
+	int status;
+
+	status = ssam_get_firmware_version(ctrl, &version);
+	if (status < 0)
+		return status;
+
+	a = (version >> 24) & 0xff;
+	b = ((version >> 8) & 0xffff);
+	c = version & 0xff;
+
+	return sysfs_emit(buf, "%u.%u.%u\n", a, b, c);
+}
+static DEVICE_ATTR_RO(firmware_version);
+
+static struct attribute *ssam_sam_attrs[] = {
+	&dev_attr_firmware_version.attr,
+	NULL
+};
+
+static const struct attribute_group ssam_sam_group = {
+	.name = "sam",
+	.attrs = ssam_sam_attrs,
+};
+
+
+/* -- ACPI based device setup. ---------------------------------------------- */
+
+static acpi_status ssam_serdev_setup_via_acpi_crs(struct acpi_resource *rsc,
+						  void *ctx)
+{
+	struct serdev_device *serdev = ctx;
+	struct acpi_resource_common_serialbus *serial;
+	struct acpi_resource_uart_serialbus *uart;
+	bool flow_control;
+	int status = 0;
+
+	if (rsc->type != ACPI_RESOURCE_TYPE_SERIAL_BUS)
+		return AE_OK;
+
+	serial = &rsc->data.common_serial_bus;
+	if (serial->type != ACPI_RESOURCE_SERIAL_TYPE_UART)
+		return AE_OK;
+
+	uart = &rsc->data.uart_serial_bus;
+
+	/* Set up serdev device. */
+	serdev_device_set_baudrate(serdev, uart->default_baud_rate);
+
+	/* serdev currently only supports RTSCTS flow control. */
+	if (uart->flow_control & (~((u8)ACPI_UART_FLOW_CONTROL_HW))) {
+		dev_warn(&serdev->dev, "setup: unsupported flow control (value: %#04x)\n",
+			 uart->flow_control);
+	}
+
+	/* Set RTSCTS flow control. */
+	flow_control = uart->flow_control & ACPI_UART_FLOW_CONTROL_HW;
+	serdev_device_set_flow_control(serdev, flow_control);
+
+	/* serdev currently only supports EVEN/ODD parity. */
+	switch (uart->parity) {
+	case ACPI_UART_PARITY_NONE:
+		status = serdev_device_set_parity(serdev, SERDEV_PARITY_NONE);
+		break;
+	case ACPI_UART_PARITY_EVEN:
+		status = serdev_device_set_parity(serdev, SERDEV_PARITY_EVEN);
+		break;
+	case ACPI_UART_PARITY_ODD:
+		status = serdev_device_set_parity(serdev, SERDEV_PARITY_ODD);
+		break;
+	default:
+		dev_warn(&serdev->dev, "setup: unsupported parity (value: %#04x)\n",
+			 uart->parity);
+		break;
+	}
+
+	if (status) {
+		dev_err(&serdev->dev, "setup: failed to set parity (value: %#04x, error: %d)\n",
+			uart->parity, status);
+		return AE_ERROR;
+	}
+
+	/* We've found the resource and are done. */
+	return AE_CTRL_TERMINATE;
+}
+
+static acpi_status ssam_serdev_setup_via_acpi(acpi_handle handle,
+					      struct serdev_device *serdev)
+{
+	return acpi_walk_resources(handle, METHOD_NAME__CRS,
+				   ssam_serdev_setup_via_acpi_crs, serdev);
+}
+
+
+/* -- Power management. ----------------------------------------------------- */
+
+static void ssam_serial_hub_shutdown(struct device *dev)
+{
+	struct ssam_controller *c = dev_get_drvdata(dev);
+	int status;
+
+	/*
+	 * Try to disable notifiers, signal display-off and D0-exit, ignore any
+	 * errors.
+	 *
+	 * Note: It has not been established yet if this is actually
+	 * necessary/useful for shutdown.
+	 */
+
+	status = ssam_notifier_disable_registered(c);
+	if (status) {
+		ssam_err(c, "pm: failed to disable notifiers for shutdown: %d\n",
+			 status);
+	}
+
+	status = ssam_ctrl_notif_display_off(c);
+	if (status)
+		ssam_err(c, "pm: display-off notification failed: %d\n", status);
+
+	status = ssam_ctrl_notif_d0_exit(c);
+	if (status)
+		ssam_err(c, "pm: D0-exit notification failed: %d\n", status);
+}
+
+#ifdef CONFIG_PM_SLEEP
+
+static int ssam_serial_hub_pm_prepare(struct device *dev)
+{
+	struct ssam_controller *c = dev_get_drvdata(dev);
+	int status;
+
+	/*
+	 * Try to signal display-off, This will quiesce events.
+	 *
+	 * Note: Signaling display-off/display-on should normally be done from
+	 * some sort of display state notifier. As that is not available,
+	 * signal it here.
+	 */
+
+	status = ssam_ctrl_notif_display_off(c);
+	if (status)
+		ssam_err(c, "pm: display-off notification failed: %d\n", status);
+
+	return status;
+}
+
+static void ssam_serial_hub_pm_complete(struct device *dev)
+{
+	struct ssam_controller *c = dev_get_drvdata(dev);
+	int status;
+
+	/*
+	 * Try to signal display-on. This will restore events.
+	 *
+	 * Note: Signaling display-off/display-on should normally be done from
+	 * some sort of display state notifier. As that is not available,
+	 * signal it here.
+	 */
+
+	status = ssam_ctrl_notif_display_on(c);
+	if (status)
+		ssam_err(c, "pm: display-on notification failed: %d\n", status);
+}
+
+static int ssam_serial_hub_pm_suspend(struct device *dev)
+{
+	struct ssam_controller *c = dev_get_drvdata(dev);
+	int status;
+
+	/*
+	 * Try to signal D0-exit, enable IRQ wakeup if specified. Abort on
+	 * error.
+	 */
+
+	status = ssam_ctrl_notif_d0_exit(c);
+	if (status) {
+		ssam_err(c, "pm: D0-exit notification failed: %d\n", status);
+		goto err_notif;
+	}
+
+	status = ssam_irq_arm_for_wakeup(c);
+	if (status)
+		goto err_irq;
+
+	WARN_ON(ssam_controller_suspend(c));
+	return 0;
+
+err_irq:
+	ssam_ctrl_notif_d0_entry(c);
+err_notif:
+	ssam_ctrl_notif_display_on(c);
+	return status;
+}
+
+static int ssam_serial_hub_pm_resume(struct device *dev)
+{
+	struct ssam_controller *c = dev_get_drvdata(dev);
+	int status;
+
+	WARN_ON(ssam_controller_resume(c));
+
+	/*
+	 * Try to disable IRQ wakeup (if specified) and signal D0-entry. In
+	 * case of errors, log them and try to restore normal operation state
+	 * as far as possible.
+	 *
+	 * Note: Signaling display-off/display-on should normally be done from
+	 * some sort of display state notifier. As that is not available,
+	 * signal it here.
+	 */
+
+	ssam_irq_disarm_wakeup(c);
+
+	status = ssam_ctrl_notif_d0_entry(c);
+	if (status)
+		ssam_err(c, "pm: D0-entry notification failed: %d\n", status);
+
+	return 0;
+}
+
+static int ssam_serial_hub_pm_freeze(struct device *dev)
+{
+	struct ssam_controller *c = dev_get_drvdata(dev);
+	int status;
+
+	/*
+	 * During hibernation image creation, we only have to ensure that the
+	 * EC doesn't send us any events. This is done via the display-off
+	 * and D0-exit notifications. Note that this sets up the wakeup IRQ
+	 * on the EC side, however, we have disabled it by default on our side
+	 * and won't enable it here.
+	 *
+	 * See ssam_serial_hub_poweroff() for more details on the hibernation
+	 * process.
+	 */
+
+	status = ssam_ctrl_notif_d0_exit(c);
+	if (status) {
+		ssam_err(c, "pm: D0-exit notification failed: %d\n", status);
+		ssam_ctrl_notif_display_on(c);
+		return status;
+	}
+
+	WARN_ON(ssam_controller_suspend(c));
+	return 0;
+}
+
+static int ssam_serial_hub_pm_thaw(struct device *dev)
+{
+	struct ssam_controller *c = dev_get_drvdata(dev);
+	int status;
+
+	WARN_ON(ssam_controller_resume(c));
+
+	status = ssam_ctrl_notif_d0_entry(c);
+	if (status)
+		ssam_err(c, "pm: D0-exit notification failed: %d\n", status);
+
+	return status;
+}
+
+static int ssam_serial_hub_pm_poweroff(struct device *dev)
+{
+	struct ssam_controller *c = dev_get_drvdata(dev);
+	int status;
+
+	/*
+	 * When entering hibernation and powering off the system, the EC, at
+	 * least on some models, may disable events. Without us taking care of
+	 * that, this leads to events not being enabled/restored when the
+	 * system resumes from hibernation, resulting SAM-HID subsystem devices
+	 * (i.e. keyboard, touchpad) not working, AC-plug/AC-unplug events being
+	 * gone, etc.
+	 *
+	 * To avoid these issues, we disable all registered events here (this is
+	 * likely not actually required) and restore them during the drivers PM
+	 * restore callback.
+	 *
+	 * Wakeup from the EC interrupt is not supported during hibernation,
+	 * so don't arm the IRQ here.
+	 */
+
+	status = ssam_notifier_disable_registered(c);
+	if (status) {
+		ssam_err(c, "pm: failed to disable notifiers for hibernation: %d\n",
+			 status);
+		return status;
+	}
+
+	status = ssam_ctrl_notif_d0_exit(c);
+	if (status) {
+		ssam_err(c, "pm: D0-exit notification failed: %d\n", status);
+		ssam_notifier_restore_registered(c);
+		return status;
+	}
+
+	WARN_ON(ssam_controller_suspend(c));
+	return 0;
+}
+
+static int ssam_serial_hub_pm_restore(struct device *dev)
+{
+	struct ssam_controller *c = dev_get_drvdata(dev);
+	int status;
+
+	/*
+	 * Ignore but log errors, try to restore state as much as possible in
+	 * case of failures. See ssam_serial_hub_poweroff() for more details on
+	 * the hibernation process.
+	 */
+
+	WARN_ON(ssam_controller_resume(c));
+
+	status = ssam_ctrl_notif_d0_entry(c);
+	if (status)
+		ssam_err(c, "pm: D0-entry notification failed: %d\n", status);
+
+	ssam_notifier_restore_registered(c);
+	return 0;
+}
+
+static const struct dev_pm_ops ssam_serial_hub_pm_ops = {
+	.prepare  = ssam_serial_hub_pm_prepare,
+	.complete = ssam_serial_hub_pm_complete,
+	.suspend  = ssam_serial_hub_pm_suspend,
+	.resume   = ssam_serial_hub_pm_resume,
+	.freeze   = ssam_serial_hub_pm_freeze,
+	.thaw     = ssam_serial_hub_pm_thaw,
+	.poweroff = ssam_serial_hub_pm_poweroff,
+	.restore  = ssam_serial_hub_pm_restore,
+};
+
+#else /* CONFIG_PM_SLEEP */
+
+static const struct dev_pm_ops ssam_serial_hub_pm_ops = { };
+
+#endif /* CONFIG_PM_SLEEP */
+
+
+/* -- Device/driver setup. -------------------------------------------------- */
+
+static const struct acpi_gpio_params gpio_ssam_wakeup_int = { 0, 0, false };
+static const struct acpi_gpio_params gpio_ssam_wakeup     = { 1, 0, false };
+
+static const struct acpi_gpio_mapping ssam_acpi_gpios[] = {
+	{ "ssam_wakeup-int-gpio", &gpio_ssam_wakeup_int, 1 },
+	{ "ssam_wakeup-gpio",     &gpio_ssam_wakeup,     1 },
+	{ },
+};
+
+static int ssam_serial_hub_probe(struct serdev_device *serdev)
+{
+	struct ssam_controller *ctrl;
+	acpi_handle *ssh = ACPI_HANDLE(&serdev->dev);
+	acpi_status astatus;
+	int status;
+
+	if (gpiod_count(&serdev->dev, NULL) < 0)
+		return -ENODEV;
+
+	status = devm_acpi_dev_add_driver_gpios(&serdev->dev, ssam_acpi_gpios);
+	if (status)
+		return status;
+
+	/* Allocate controller. */
+	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+	if (!ctrl)
+		return -ENOMEM;
+
+	/* Initialize controller. */
+	status = ssam_controller_init(ctrl, serdev);
+	if (status)
+		goto err_ctrl_init;
+
+	ssam_controller_lock(ctrl);
+
+	/* Set up serdev device. */
+	serdev_device_set_drvdata(serdev, ctrl);
+	serdev_device_set_client_ops(serdev, &ssam_serdev_ops);
+	status = serdev_device_open(serdev);
+	if (status)
+		goto err_devopen;
+
+	astatus = ssam_serdev_setup_via_acpi(ssh, serdev);
+	if (ACPI_FAILURE(astatus)) {
+		status = -ENXIO;
+		goto err_devinit;
+	}
+
+	/* Start controller. */
+	status = ssam_controller_start(ctrl);
+	if (status)
+		goto err_devinit;
+
+	ssam_controller_unlock(ctrl);
+
+	/*
+	 * Initial SAM requests: Log version and notify default/init power
+	 * states.
+	 */
+	status = ssam_log_firmware_version(ctrl);
+	if (status)
+		goto err_initrq;
+
+	status = ssam_ctrl_notif_d0_entry(ctrl);
+	if (status)
+		goto err_initrq;
+
+	status = ssam_ctrl_notif_display_on(ctrl);
+	if (status)
+		goto err_initrq;
+
+	status = sysfs_create_group(&serdev->dev.kobj, &ssam_sam_group);
+	if (status)
+		goto err_initrq;
+
+	/* Set up IRQ. */
+	status = ssam_irq_setup(ctrl);
+	if (status)
+		goto err_irq;
+
+	/* Finally, set main controller reference. */
+	status = ssam_try_set_controller(ctrl);
+	if (WARN_ON(status))	/* Currently, we're the only provider. */
+		goto err_mainref;
+
+	/*
+	 * TODO: The EC can wake up the system via the associated GPIO interrupt
+	 *       in multiple situations. One of which is the remaining battery
+	 *       capacity falling below a certain threshold. Normally, we should
+	 *       use the device_init_wakeup function, however, the EC also seems
+	 *       to have other reasons for waking up the system and it seems
+	 *       that Windows has additional checks whether the system should be
+	 *       resumed. In short, this causes some spurious unwanted wake-ups.
+	 *       For now let's thus default power/wakeup to false.
+	 */
+	device_set_wakeup_capable(&serdev->dev, true);
+	acpi_walk_dep_device_list(ssh);
+
+	return 0;
+
+err_mainref:
+	ssam_irq_free(ctrl);
+err_irq:
+	sysfs_remove_group(&serdev->dev.kobj, &ssam_sam_group);
+err_initrq:
+	ssam_controller_lock(ctrl);
+	ssam_controller_shutdown(ctrl);
+err_devinit:
+	serdev_device_close(serdev);
+err_devopen:
+	ssam_controller_destroy(ctrl);
+	ssam_controller_unlock(ctrl);
+err_ctrl_init:
+	kfree(ctrl);
+	return status;
+}
+
+static void ssam_serial_hub_remove(struct serdev_device *serdev)
+{
+	struct ssam_controller *ctrl = serdev_device_get_drvdata(serdev);
+	int status;
+
+	/* Clear static reference so that no one else can get a new one. */
+	ssam_clear_controller();
+
+	/* Disable and free IRQ. */
+	ssam_irq_free(ctrl);
+
+	sysfs_remove_group(&serdev->dev.kobj, &ssam_sam_group);
+	ssam_controller_lock(ctrl);
+
+	/* Act as if suspending to silence events. */
+	status = ssam_ctrl_notif_display_off(ctrl);
+	if (status) {
+		dev_err(&serdev->dev, "display-off notification failed: %d\n",
+			status);
+	}
+
+	status = ssam_ctrl_notif_d0_exit(ctrl);
+	if (status) {
+		dev_err(&serdev->dev, "D0-exit notification failed: %d\n",
+			status);
+	}
+
+	/* Shut down controller and remove serdev device reference from it. */
+	ssam_controller_shutdown(ctrl);
+
+	/* Shut down actual transport. */
+	serdev_device_wait_until_sent(serdev, 0);
+	serdev_device_close(serdev);
+
+	/* Drop our controller reference. */
+	ssam_controller_unlock(ctrl);
+	ssam_controller_put(ctrl);
+
+	device_set_wakeup_capable(&serdev->dev, false);
+}
+
+static const struct acpi_device_id ssam_serial_hub_match[] = {
+	{ "MSHW0084", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(acpi, ssam_serial_hub_match);
+
+static struct serdev_device_driver ssam_serial_hub = {
+	.probe = ssam_serial_hub_probe,
+	.remove = ssam_serial_hub_remove,
+	.driver = {
+		.name = "surface_serial_hub",
+		.acpi_match_table = ssam_serial_hub_match,
+		.pm = &ssam_serial_hub_pm_ops,
+		.shutdown = ssam_serial_hub_shutdown,
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+	},
+};
+module_serdev_device_driver(ssam_serial_hub);
+
+MODULE_AUTHOR("Maximilian Luz <luzmaximilian@gmail.com>");
+MODULE_DESCRIPTION("Subsystem and Surface Serial Hub driver for Surface System Aggregator Module");
+MODULE_LICENSE("GPL");
diff --git a/drivers/platform/surface/aggregator/ssh_msgb.h b/drivers/platform/surface/aggregator/ssh_msgb.h
new file mode 100644
index 000000000000..1221f642dda1
--- /dev/null
+++ b/drivers/platform/surface/aggregator/ssh_msgb.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SSH message builder functions.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _SURFACE_AGGREGATOR_SSH_MSGB_H
+#define _SURFACE_AGGREGATOR_SSH_MSGB_H
+
+#include <asm/unaligned.h>
+#include <linux/types.h>
+
+#include <linux/surface_aggregator/controller.h>
+#include <linux/surface_aggregator/serial_hub.h>
+
+/**
+ * struct msgbuf - Buffer struct to construct SSH messages.
+ * @begin: Pointer to the beginning of the allocated buffer space.
+ * @end:   Pointer to the end (one past last element) of the allocated buffer
+ *         space.
+ * @ptr:   Pointer to the first free element in the buffer.
+ */
+struct msgbuf {
+	u8 *begin;
+	u8 *end;
+	u8 *ptr;
+};
+
+/**
+ * msgb_init() - Initialize the given message buffer struct.
+ * @msgb: The buffer struct to initialize
+ * @ptr:  Pointer to the underlying memory by which the buffer will be backed.
+ * @cap:  Size of the underlying memory.
+ *
+ * Initialize the given message buffer struct using the provided memory as
+ * backing.
+ */
+static inline void msgb_init(struct msgbuf *msgb, u8 *ptr, size_t cap)
+{
+	msgb->begin = ptr;
+	msgb->end = ptr + cap;
+	msgb->ptr = ptr;
+}
+
+/**
+ * msgb_bytes_used() - Return the current number of bytes used in the buffer.
+ * @msgb: The message buffer.
+ */
+static inline size_t msgb_bytes_used(const struct msgbuf *msgb)
+{
+	return msgb->ptr - msgb->begin;
+}
+
+static inline void __msgb_push_u8(struct msgbuf *msgb, u8 value)
+{
+	*msgb->ptr = value;
+	msgb->ptr += sizeof(u8);
+}
+
+static inline void __msgb_push_u16(struct msgbuf *msgb, u16 value)
+{
+	put_unaligned_le16(value, msgb->ptr);
+	msgb->ptr += sizeof(u16);
+}
+
+/**
+ * msgb_push_u16() - Push a u16 value to the buffer.
+ * @msgb:  The message buffer.
+ * @value: The value to push to the buffer.
+ */
+static inline void msgb_push_u16(struct msgbuf *msgb, u16 value)
+{
+	if (WARN_ON(msgb->ptr + sizeof(u16) > msgb->end))
+		return;
+
+	__msgb_push_u16(msgb, value);
+}
+
+/**
+ * msgb_push_syn() - Push SSH SYN bytes to the buffer.
+ * @msgb: The message buffer.
+ */
+static inline void msgb_push_syn(struct msgbuf *msgb)
+{
+	msgb_push_u16(msgb, SSH_MSG_SYN);
+}
+
+/**
+ * msgb_push_buf() - Push raw data to the buffer.
+ * @msgb: The message buffer.
+ * @buf:  The data to push to the buffer.
+ * @len:  The length of the data to push to the buffer.
+ */
+static inline void msgb_push_buf(struct msgbuf *msgb, const u8 *buf, size_t len)
+{
+	msgb->ptr = memcpy(msgb->ptr, buf, len) + len;
+}
+
+/**
+ * msgb_push_crc() - Compute CRC and push it to the buffer.
+ * @msgb: The message buffer.
+ * @buf:  The data for which the CRC should be computed.
+ * @len:  The length of the data for which the CRC should be computed.
+ */
+static inline void msgb_push_crc(struct msgbuf *msgb, const u8 *buf, size_t len)
+{
+	msgb_push_u16(msgb, ssh_crc(buf, len));
+}
+
+/**
+ * msgb_push_frame() - Push a SSH message frame header to the buffer.
+ * @msgb: The message buffer
+ * @ty:   The type of the frame.
+ * @len:  The length of the payload of the frame.
+ * @seq:  The sequence ID of the frame/packet.
+ */
+static inline void msgb_push_frame(struct msgbuf *msgb, u8 ty, u16 len, u8 seq)
+{
+	u8 *const begin = msgb->ptr;
+
+	if (WARN_ON(msgb->ptr + sizeof(struct ssh_frame) > msgb->end))
+		return;
+
+	__msgb_push_u8(msgb, ty);	/* Frame type. */
+	__msgb_push_u16(msgb, len);	/* Frame payload length. */
+	__msgb_push_u8(msgb, seq);	/* Frame sequence ID. */
+
+	msgb_push_crc(msgb, begin, msgb->ptr - begin);
+}
+
+/**
+ * msgb_push_ack() - Push a SSH ACK frame to the buffer.
+ * @msgb: The message buffer
+ * @seq:  The sequence ID of the frame/packet to be ACKed.
+ */
+static inline void msgb_push_ack(struct msgbuf *msgb, u8 seq)
+{
+	/* SYN. */
+	msgb_push_syn(msgb);
+
+	/* ACK-type frame + CRC. */
+	msgb_push_frame(msgb, SSH_FRAME_TYPE_ACK, 0x00, seq);
+
+	/* Payload CRC (ACK-type frames do not have a payload). */
+	msgb_push_crc(msgb, msgb->ptr, 0);
+}
+
+/**
+ * msgb_push_nak() - Push a SSH NAK frame to the buffer.
+ * @msgb: The message buffer
+ */
+static inline void msgb_push_nak(struct msgbuf *msgb)
+{
+	/* SYN. */
+	msgb_push_syn(msgb);
+
+	/* NAK-type frame + CRC. */
+	msgb_push_frame(msgb, SSH_FRAME_TYPE_NAK, 0x00, 0x00);
+
+	/* Payload CRC (ACK-type frames do not have a payload). */
+	msgb_push_crc(msgb, msgb->ptr, 0);
+}
+
+/**
+ * msgb_push_cmd() - Push a SSH command frame with payload to the buffer.
+ * @msgb: The message buffer.
+ * @seq:  The sequence ID (SEQ) of the frame/packet.
+ * @rqid: The request ID (RQID) of the request contained in the frame.
+ * @rqst: The request to wrap in the frame.
+ */
+static inline void msgb_push_cmd(struct msgbuf *msgb, u8 seq, u16 rqid,
+				 const struct ssam_request *rqst)
+{
+	const u8 type = SSH_FRAME_TYPE_DATA_SEQ;
+	u8 *cmd;
+
+	/* SYN. */
+	msgb_push_syn(msgb);
+
+	/* Command frame + CRC. */
+	msgb_push_frame(msgb, type, sizeof(struct ssh_command) + rqst->length, seq);
+
+	/* Frame payload: Command struct + payload. */
+	if (WARN_ON(msgb->ptr + sizeof(struct ssh_command) > msgb->end))
+		return;
+
+	cmd = msgb->ptr;
+
+	__msgb_push_u8(msgb, SSH_PLD_TYPE_CMD);		/* Payload type. */
+	__msgb_push_u8(msgb, rqst->target_category);	/* Target category. */
+	__msgb_push_u8(msgb, rqst->target_id);		/* Target ID (out). */
+	__msgb_push_u8(msgb, 0x00);			/* Target ID (in). */
+	__msgb_push_u8(msgb, rqst->instance_id);	/* Instance ID. */
+	__msgb_push_u16(msgb, rqid);			/* Request ID. */
+	__msgb_push_u8(msgb, rqst->command_id);		/* Command ID. */
+
+	/* Command payload. */
+	msgb_push_buf(msgb, rqst->payload, rqst->length);
+
+	/* CRC for command struct + payload. */
+	msgb_push_crc(msgb, cmd, msgb->ptr - cmd);
+}
+
+#endif /* _SURFACE_AGGREGATOR_SSH_MSGB_H */
diff --git a/drivers/platform/surface/aggregator/ssh_packet_layer.c b/drivers/platform/surface/aggregator/ssh_packet_layer.c
new file mode 100644
index 000000000000..66e38fdc7963
--- /dev/null
+++ b/drivers/platform/surface/aggregator/ssh_packet_layer.c
@@ -0,0 +1,1710 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * SSH packet transport layer.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#include <asm/unaligned.h>
+#include <linux/atomic.h>
+#include <linux/jiffies.h>
+#include <linux/kfifo.h>
+#include <linux/kref.h>
+#include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/serdev.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+
+#include <linux/surface_aggregator/serial_hub.h>
+
+#include "ssh_msgb.h"
+#include "ssh_packet_layer.h"
+#include "ssh_parser.h"
+
+/*
+ * To simplify reasoning about the code below, we define a few concepts. The
+ * system below is similar to a state-machine for packets, however, there are
+ * too many states to explicitly write them down. To (somewhat) manage the
+ * states and packets we rely on flags, reference counting, and some simple
+ * concepts. State transitions are triggered by actions.
+ *
+ * >> Actions <<
+ *
+ * - submit
+ * - transmission start (process next item in queue)
+ * - transmission finished (guaranteed to never be parallel to transmission
+ *   start)
+ * - ACK received
+ * - NAK received (this is equivalent to issuing re-submit for all pending
+ *   packets)
+ * - timeout (this is equivalent to re-issuing a submit or canceling)
+ * - cancel (non-pending and pending)
+ *
+ * >> Data Structures, Packet Ownership, General Overview <<
+ *
+ * The code below employs two main data structures: The packet queue,
+ * containing all packets scheduled for transmission, and the set of pending
+ * packets, containing all packets awaiting an ACK.
+ *
+ * Shared ownership of a packet is controlled via reference counting. Inside
+ * the transport system are a total of five packet owners:
+ *
+ * - the packet queue,
+ * - the pending set,
+ * - the transmitter thread,
+ * - the receiver thread (via ACKing), and
+ * - the timeout work item.
+ *
+ * Normal operation is as follows: The initial reference of the packet is
+ * obtained by submitting the packet and queuing it. The receiver thread takes
+ * packets from the queue. By doing this, it does not increment the refcount
+ * but takes over the reference (removing it from the queue). If the packet is
+ * sequenced (i.e. needs to be ACKed by the client), the transmitter thread
+ * sets-up the timeout and adds the packet to the pending set before starting
+ * to transmit it. As the timeout is handled by a reaper task, no additional
+ * reference for it is needed. After the transmit is done, the reference held
+ * by the transmitter thread is dropped. If the packet is unsequenced (i.e.
+ * does not need an ACK), the packet is completed by the transmitter thread
+ * before dropping that reference.
+ *
+ * On receival of an ACK, the receiver thread removes and obtains the
+ * reference to the packet from the pending set. The receiver thread will then
+ * complete the packet and drop its reference.
+ *
+ * On receival of a NAK, the receiver thread re-submits all currently pending
+ * packets.
+ *
+ * Packet timeouts are detected by the timeout reaper. This is a task,
+ * scheduled depending on the earliest packet timeout expiration date,
+ * checking all currently pending packets if their timeout has expired. If the
+ * timeout of a packet has expired, it is re-submitted and the number of tries
+ * of this packet is incremented. If this number reaches its limit, the packet
+ * will be completed with a failure.
+ *
+ * On transmission failure (such as repeated packet timeouts), the completion
+ * callback is immediately run by on thread on which the error was detected.
+ *
+ * To ensure that a packet eventually leaves the system it is marked as
+ * "locked" directly before it is going to be completed or when it is
+ * canceled. Marking a packet as "locked" has the effect that passing and
+ * creating new references of the packet is disallowed. This means that the
+ * packet cannot be added to the queue, the pending set, and the timeout, or
+ * be picked up by the transmitter thread or receiver thread. To remove a
+ * packet from the system it has to be marked as locked and subsequently all
+ * references from the data structures (queue, pending) have to be removed.
+ * References held by threads will eventually be dropped automatically as
+ * their execution progresses.
+ *
+ * Note that the packet completion callback is, in case of success and for a
+ * sequenced packet, guaranteed to run on the receiver thread, thus providing
+ * a way to reliably identify responses to the packet. The packet completion
+ * callback is only run once and it does not indicate that the packet has
+ * fully left the system (for this, one should rely on the release method,
+ * triggered when the reference count of the packet reaches zero). In case of
+ * re-submission (and with somewhat unlikely timing), it may be possible that
+ * the packet is being re-transmitted while the completion callback runs.
+ * Completion will occur both on success and internal error, as well as when
+ * the packet is canceled.
+ *
+ * >> Flags <<
+ *
+ * Flags are used to indicate the state and progression of a packet. Some flags
+ * have stricter guarantees than other:
+ *
+ * - locked
+ *   Indicates if the packet is locked. If the packet is locked, passing and/or
+ *   creating additional references to the packet is forbidden. The packet thus
+ *   may not be queued, dequeued, or removed or added to the pending set. Note
+ *   that the packet state flags may still change (e.g. it may be marked as
+ *   ACKed, transmitted, ...).
+ *
+ * - completed
+ *   Indicates if the packet completion callback has been executed or is about
+ *   to be executed. This flag is used to ensure that the packet completion
+ *   callback is only run once.
+ *
+ * - queued
+ *   Indicates if a packet is present in the submission queue or not. This flag
+ *   must only be modified with the queue lock held, and must be coherent to the
+ *   presence of the packet in the queue.
+ *
+ * - pending
+ *   Indicates if a packet is present in the set of pending packets or not.
+ *   This flag must only be modified with the pending lock held, and must be
+ *   coherent to the presence of the packet in the pending set.
+ *
+ * - transmitting
+ *   Indicates if the packet is currently transmitting. In case of
+ *   re-transmissions, it is only safe to wait on the "transmitted" completion
+ *   after this flag has been set. The completion will be set both in success
+ *   and error case.
+ *
+ * - transmitted
+ *   Indicates if the packet has been transmitted. This flag is not cleared by
+ *   the system, thus it indicates the first transmission only.
+ *
+ * - acked
+ *   Indicates if the packet has been acknowledged by the client. There are no
+ *   other guarantees given. For example, the packet may still be canceled
+ *   and/or the completion may be triggered an error even though this bit is
+ *   set. Rely on the status provided to the completion callback instead.
+ *
+ * - canceled
+ *   Indicates if the packet has been canceled from the outside. There are no
+ *   other guarantees given. Specifically, the packet may be completed by
+ *   another part of the system before the cancellation attempts to complete it.
+ *
+ * >> General Notes <<
+ *
+ * - To avoid deadlocks, if both queue and pending locks are required, the
+ *   pending lock must be acquired before the queue lock.
+ *
+ * - The packet priority must be accessed only while holding the queue lock.
+ *
+ * - The packet timestamp must be accessed only while holding the pending
+ *   lock.
+ */
+
+/*
+ * SSH_PTL_MAX_PACKET_TRIES - Maximum transmission attempts for packet.
+ *
+ * Maximum number of transmission attempts per sequenced packet in case of
+ * time-outs. Must be smaller than 16. If the packet times out after this
+ * amount of tries, the packet will be completed with %-ETIMEDOUT as status
+ * code.
+ */
+#define SSH_PTL_MAX_PACKET_TRIES		3
+
+/*
+ * SSH_PTL_TX_TIMEOUT - Packet transmission timeout.
+ *
+ * Timeout in jiffies for packet transmission via the underlying serial
+ * device. If transmitting the packet takes longer than this timeout, the
+ * packet will be completed with -ETIMEDOUT. It will not be re-submitted.
+ */
+#define SSH_PTL_TX_TIMEOUT			HZ
+
+/*
+ * SSH_PTL_PACKET_TIMEOUT - Packet response timeout.
+ *
+ * Timeout as ktime_t delta for ACKs. If we have not received an ACK in this
+ * time-frame after starting transmission, the packet will be re-submitted.
+ */
+#define SSH_PTL_PACKET_TIMEOUT			ms_to_ktime(1000)
+
+/*
+ * SSH_PTL_PACKET_TIMEOUT_RESOLUTION - Packet timeout granularity.
+ *
+ * Time-resolution for timeouts. Should be larger than one jiffy to avoid
+ * direct re-scheduling of reaper work_struct.
+ */
+#define SSH_PTL_PACKET_TIMEOUT_RESOLUTION	ms_to_ktime(max(2000 / HZ, 50))
+
+/*
+ * SSH_PTL_MAX_PENDING - Maximum number of pending packets.
+ *
+ * Maximum number of sequenced packets concurrently waiting for an ACK.
+ * Packets marked as blocking will not be transmitted while this limit is
+ * reached.
+ */
+#define SSH_PTL_MAX_PENDING			1
+
+/*
+ * SSH_PTL_RX_BUF_LEN - Evaluation-buffer size in bytes.
+ */
+#define SSH_PTL_RX_BUF_LEN			4096
+
+/*
+ * SSH_PTL_RX_FIFO_LEN - Fifo input-buffer size in bytes.
+ */
+#define SSH_PTL_RX_FIFO_LEN			4096
+
+static void __ssh_ptl_packet_release(struct kref *kref)
+{
+	struct ssh_packet *p = container_of(kref, struct ssh_packet, refcnt);
+
+	ptl_dbg_cond(p->ptl, "ptl: releasing packet %p\n", p);
+	p->ops->release(p);
+}
+
+/**
+ * ssh_packet_get() - Increment reference count of packet.
+ * @packet: The packet to increment the reference count of.
+ *
+ * Increments the reference count of the given packet. See ssh_packet_put()
+ * for the counter-part of this function.
+ *
+ * Return: Returns the packet provided as input.
+ */
+struct ssh_packet *ssh_packet_get(struct ssh_packet *packet)
+{
+	if (packet)
+		kref_get(&packet->refcnt);
+	return packet;
+}
+EXPORT_SYMBOL_GPL(ssh_packet_get);
+
+/**
+ * ssh_packet_put() - Decrement reference count of packet.
+ * @packet: The packet to decrement the reference count of.
+ *
+ * If the reference count reaches zero, the ``release`` callback specified in
+ * the packet's &struct ssh_packet_ops, i.e. ``packet->ops->release``, will be
+ * called.
+ *
+ * See ssh_packet_get() for the counter-part of this function.
+ */
+void ssh_packet_put(struct ssh_packet *packet)
+{
+	if (packet)
+		kref_put(&packet->refcnt, __ssh_ptl_packet_release);
+}
+EXPORT_SYMBOL_GPL(ssh_packet_put);
+
+static u8 ssh_packet_get_seq(struct ssh_packet *packet)
+{
+	return packet->data.ptr[SSH_MSGOFFSET_FRAME(seq)];
+}
+
+/**
+ * ssh_packet_init() - Initialize SSH packet.
+ * @packet:   The packet to initialize.
+ * @type:     Type-flags of the packet.
+ * @priority: Priority of the packet. See SSH_PACKET_PRIORITY() for details.
+ * @ops:      Packet operations.
+ *
+ * Initializes the given SSH packet. Sets the transmission buffer pointer to
+ * %NULL and the transmission buffer length to zero. For data-type packets,
+ * this buffer has to be set separately via ssh_packet_set_data() before
+ * submission, and must contain a valid SSH message, i.e. frame with optional
+ * payload of any type.
+ */
+void ssh_packet_init(struct ssh_packet *packet, unsigned long type,
+		     u8 priority, const struct ssh_packet_ops *ops)
+{
+	kref_init(&packet->refcnt);
+
+	packet->ptl = NULL;
+	INIT_LIST_HEAD(&packet->queue_node);
+	INIT_LIST_HEAD(&packet->pending_node);
+
+	packet->state = type & SSH_PACKET_FLAGS_TY_MASK;
+	packet->priority = priority;
+	packet->timestamp = KTIME_MAX;
+
+	packet->data.ptr = NULL;
+	packet->data.len = 0;
+
+	packet->ops = ops;
+}
+
+/**
+ * ssh_ctrl_packet_alloc() - Allocate control packet.
+ * @packet: Where the pointer to the newly allocated packet should be stored.
+ * @buffer: The buffer corresponding to this packet.
+ * @flags:  Flags used for allocation.
+ *
+ * Allocates a packet and corresponding transport buffer. Sets the packet's
+ * buffer reference to the allocated buffer. The packet must be freed via
+ * ssh_ctrl_packet_free(), which will also free the corresponding buffer. The
+ * corresponding buffer must not be freed separately. Intended to be used with
+ * %ssh_ptl_ctrl_packet_ops as packet operations.
+ *
+ * Return: Returns zero on success, %-ENOMEM if the allocation failed.
+ */
+static int ssh_ctrl_packet_alloc(struct ssh_packet **packet,
+				 struct ssam_span *buffer, gfp_t flags)
+{
+	*packet = kzalloc(sizeof(**packet) + SSH_MSG_LEN_CTRL, flags);
+	if (!*packet)
+		return -ENOMEM;
+
+	buffer->ptr = (u8 *)(*packet + 1);
+	buffer->len = SSH_MSG_LEN_CTRL;
+
+	return 0;
+}
+
+/**
+ * ssh_ctrl_packet_free() - Free control packet.
+ * @p: The packet to free.
+ */
+static void ssh_ctrl_packet_free(struct ssh_packet *p)
+{
+	kfree(p);
+}
+
+static const struct ssh_packet_ops ssh_ptl_ctrl_packet_ops = {
+	.complete = NULL,
+	.release = ssh_ctrl_packet_free,
+};
+
+static void ssh_ptl_timeout_reaper_mod(struct ssh_ptl *ptl, ktime_t now,
+				       ktime_t expires)
+{
+	unsigned long delta = msecs_to_jiffies(ktime_ms_delta(expires, now));
+	ktime_t aexp = ktime_add(expires, SSH_PTL_PACKET_TIMEOUT_RESOLUTION);
+
+	spin_lock(&ptl->rtx_timeout.lock);
+
+	/* Re-adjust / schedule reaper only if it is above resolution delta. */
+	if (ktime_before(aexp, ptl->rtx_timeout.expires)) {
+		ptl->rtx_timeout.expires = expires;
+		mod_delayed_work(system_wq, &ptl->rtx_timeout.reaper, delta);
+	}
+
+	spin_unlock(&ptl->rtx_timeout.lock);
+}
+
+/* Must be called with queue lock held. */
+static void ssh_packet_next_try(struct ssh_packet *p)
+{
+	u8 base = ssh_packet_priority_get_base(p->priority);
+	u8 try = ssh_packet_priority_get_try(p->priority);
+
+	lockdep_assert_held(&p->ptl->queue.lock);
+
+	p->priority = __SSH_PACKET_PRIORITY(base, try + 1);
+}
+
+/* Must be called with queue lock held. */
+static struct list_head *__ssh_ptl_queue_find_entrypoint(struct ssh_packet *p)
+{
+	struct list_head *head;
+	struct ssh_packet *q;
+
+	lockdep_assert_held(&p->ptl->queue.lock);
+
+	/*
+	 * We generally assume that there are less control (ACK/NAK) packets
+	 * and re-submitted data packets as there are normal data packets (at
+	 * least in situations in which many packets are queued; if there
+	 * aren't many packets queued the decision on how to iterate should be
+	 * basically irrelevant; the number of control/data packets is more or
+	 * less limited via the maximum number of pending packets). Thus, when
+	 * inserting a control or re-submitted data packet, (determined by
+	 * their priority), we search from front to back. Normal data packets
+	 * are, usually queued directly at the tail of the queue, so for those
+	 * search from back to front.
+	 */
+
+	if (p->priority > SSH_PACKET_PRIORITY(DATA, 0)) {
+		list_for_each(head, &p->ptl->queue.head) {
+			q = list_entry(head, struct ssh_packet, queue_node);
+
+			if (q->priority < p->priority)
+				break;
+		}
+	} else {
+		list_for_each_prev(head, &p->ptl->queue.head) {
+			q = list_entry(head, struct ssh_packet, queue_node);
+
+			if (q->priority >= p->priority) {
+				head = head->next;
+				break;
+			}
+		}
+	}
+
+	return head;
+}
+
+/* Must be called with queue lock held. */
+static int __ssh_ptl_queue_push(struct ssh_packet *packet)
+{
+	struct ssh_ptl *ptl = packet->ptl;
+	struct list_head *head;
+
+	lockdep_assert_held(&ptl->queue.lock);
+
+	if (test_bit(SSH_PTL_SF_SHUTDOWN_BIT, &ptl->state))
+		return -ESHUTDOWN;
+
+	/* Avoid further transitions when canceling/completing. */
+	if (test_bit(SSH_PACKET_SF_LOCKED_BIT, &packet->state))
+		return -EINVAL;
+
+	/* If this packet has already been queued, do not add it. */
+	if (test_and_set_bit(SSH_PACKET_SF_QUEUED_BIT, &packet->state))
+		return -EALREADY;
+
+	head = __ssh_ptl_queue_find_entrypoint(packet);
+
+	list_add_tail(&ssh_packet_get(packet)->queue_node, head);
+	return 0;
+}
+
+static int ssh_ptl_queue_push(struct ssh_packet *packet)
+{
+	int status;
+
+	spin_lock(&packet->ptl->queue.lock);
+	status = __ssh_ptl_queue_push(packet);
+	spin_unlock(&packet->ptl->queue.lock);
+
+	return status;
+}
+
+static void ssh_ptl_queue_remove(struct ssh_packet *packet)
+{
+	struct ssh_ptl *ptl = packet->ptl;
+
+	spin_lock(&ptl->queue.lock);
+
+	if (!test_and_clear_bit(SSH_PACKET_SF_QUEUED_BIT, &packet->state)) {
+		spin_unlock(&ptl->queue.lock);
+		return;
+	}
+
+	list_del(&packet->queue_node);
+
+	spin_unlock(&ptl->queue.lock);
+	ssh_packet_put(packet);
+}
+
+static void ssh_ptl_pending_push(struct ssh_packet *p)
+{
+	struct ssh_ptl *ptl = p->ptl;
+	const ktime_t timestamp = ktime_get_coarse_boottime();
+	const ktime_t timeout = ptl->rtx_timeout.timeout;
+
+	/*
+	 * Note: We can get the time for the timestamp before acquiring the
+	 * lock as this is the only place we're setting it and this function
+	 * is called only from the transmitter thread. Thus it is not possible
+	 * to overwrite the timestamp with an outdated value below.
+	 */
+
+	spin_lock(&ptl->pending.lock);
+
+	/* If we are canceling/completing this packet, do not add it. */
+	if (test_bit(SSH_PACKET_SF_LOCKED_BIT, &p->state)) {
+		spin_unlock(&ptl->pending.lock);
+		return;
+	}
+
+	/*
+	 * On re-submission, the packet has already been added the pending
+	 * set. We still need to update the timestamp as the packet timeout is
+	 * reset for each (re-)submission.
+	 */
+	p->timestamp = timestamp;
+
+	/* In case it is already pending (e.g. re-submission), do not add it. */
+	if (!test_and_set_bit(SSH_PACKET_SF_PENDING_BIT, &p->state)) {
+		atomic_inc(&ptl->pending.count);
+		list_add_tail(&ssh_packet_get(p)->pending_node, &ptl->pending.head);
+	}
+
+	spin_unlock(&ptl->pending.lock);
+
+	/* Arm/update timeout reaper. */
+	ssh_ptl_timeout_reaper_mod(ptl, timestamp, timestamp + timeout);
+}
+
+static void ssh_ptl_pending_remove(struct ssh_packet *packet)
+{
+	struct ssh_ptl *ptl = packet->ptl;
+
+	spin_lock(&ptl->pending.lock);
+
+	if (!test_and_clear_bit(SSH_PACKET_SF_PENDING_BIT, &packet->state)) {
+		spin_unlock(&ptl->pending.lock);
+		return;
+	}
+
+	list_del(&packet->pending_node);
+	atomic_dec(&ptl->pending.count);
+
+	spin_unlock(&ptl->pending.lock);
+
+	ssh_packet_put(packet);
+}
+
+/* Warning: Does not check/set "completed" bit. */
+static void __ssh_ptl_complete(struct ssh_packet *p, int status)
+{
+	struct ssh_ptl *ptl = READ_ONCE(p->ptl);
+
+	ptl_dbg_cond(ptl, "ptl: completing packet %p (status: %d)\n", p, status);
+
+	if (p->ops->complete)
+		p->ops->complete(p, status);
+}
+
+static void ssh_ptl_remove_and_complete(struct ssh_packet *p, int status)
+{
+	/*
+	 * A call to this function should in general be preceded by
+	 * set_bit(SSH_PACKET_SF_LOCKED_BIT, &p->flags) to avoid re-adding the
+	 * packet to the structures it's going to be removed from.
+	 *
+	 * The set_bit call does not need explicit memory barriers as the
+	 * implicit barrier of the test_and_set_bit() call below ensure that the
+	 * flag is visible before we actually attempt to remove the packet.
+	 */
+
+	if (test_and_set_bit(SSH_PACKET_SF_COMPLETED_BIT, &p->state))
+		return;
+
+	ssh_ptl_queue_remove(p);
+	ssh_ptl_pending_remove(p);
+
+	__ssh_ptl_complete(p, status);
+}
+
+static bool ssh_ptl_tx_can_process(struct ssh_packet *packet)
+{
+	struct ssh_ptl *ptl = packet->ptl;
+
+	if (test_bit(SSH_PACKET_TY_FLUSH_BIT, &packet->state))
+		return !atomic_read(&ptl->pending.count);
+
+	/* We can always process non-blocking packets. */
+	if (!test_bit(SSH_PACKET_TY_BLOCKING_BIT, &packet->state))
+		return true;
+
+	/* If we are already waiting for this packet, send it again. */
+	if (test_bit(SSH_PACKET_SF_PENDING_BIT, &packet->state))
+		return true;
+
+	/* Otherwise: Check if we have the capacity to send. */
+	return atomic_read(&ptl->pending.count) < SSH_PTL_MAX_PENDING;
+}
+
+static struct ssh_packet *ssh_ptl_tx_pop(struct ssh_ptl *ptl)
+{
+	struct ssh_packet *packet = ERR_PTR(-ENOENT);
+	struct ssh_packet *p, *n;
+
+	spin_lock(&ptl->queue.lock);
+	list_for_each_entry_safe(p, n, &ptl->queue.head, queue_node) {
+		/*
+		 * If we are canceling or completing this packet, ignore it.
+		 * It's going to be removed from this queue shortly.
+		 */
+		if (test_bit(SSH_PACKET_SF_LOCKED_BIT, &p->state))
+			continue;
+
+		/*
+		 * Packets should be ordered non-blocking/to-be-resent first.
+		 * If we cannot process this packet, assume that we can't
+		 * process any following packet either and abort.
+		 */
+		if (!ssh_ptl_tx_can_process(p)) {
+			packet = ERR_PTR(-EBUSY);
+			break;
+		}
+
+		/*
+		 * We are allowed to change the state now. Remove it from the
+		 * queue and mark it as being transmitted.
+		 */
+
+		list_del(&p->queue_node);
+
+		set_bit(SSH_PACKET_SF_TRANSMITTING_BIT, &p->state);
+		/* Ensure that state never gets zero. */
+		smp_mb__before_atomic();
+		clear_bit(SSH_PACKET_SF_QUEUED_BIT, &p->state);
+
+		/*
+		 * Update number of tries. This directly influences the
+		 * priority in case the packet is re-submitted (e.g. via
+		 * timeout/NAK). Note that all reads and writes to the
+		 * priority after the first submission are guarded by the
+		 * queue lock.
+		 */
+		ssh_packet_next_try(p);
+
+		packet = p;
+		break;
+	}
+	spin_unlock(&ptl->queue.lock);
+
+	return packet;
+}
+
+static struct ssh_packet *ssh_ptl_tx_next(struct ssh_ptl *ptl)
+{
+	struct ssh_packet *p;
+
+	p = ssh_ptl_tx_pop(ptl);
+	if (IS_ERR(p))
+		return p;
+
+	if (test_bit(SSH_PACKET_TY_SEQUENCED_BIT, &p->state)) {
+		ptl_dbg(ptl, "ptl: transmitting sequenced packet %p\n", p);
+		ssh_ptl_pending_push(p);
+	} else {
+		ptl_dbg(ptl, "ptl: transmitting non-sequenced packet %p\n", p);
+	}
+
+	return p;
+}
+
+static void ssh_ptl_tx_compl_success(struct ssh_packet *packet)
+{
+	struct ssh_ptl *ptl = packet->ptl;
+
+	ptl_dbg(ptl, "ptl: successfully transmitted packet %p\n", packet);
+
+	/* Transition state to "transmitted". */
+	set_bit(SSH_PACKET_SF_TRANSMITTED_BIT, &packet->state);
+	/* Ensure that state never gets zero. */
+	smp_mb__before_atomic();
+	clear_bit(SSH_PACKET_SF_TRANSMITTING_BIT, &packet->state);
+
+	/* If the packet is unsequenced, we're done: Lock and complete. */
+	if (!test_bit(SSH_PACKET_TY_SEQUENCED_BIT, &packet->state)) {
+		set_bit(SSH_PACKET_SF_LOCKED_BIT, &packet->state);
+		ssh_ptl_remove_and_complete(packet, 0);
+	}
+
+	/*
+	 * Notify that a packet transmission has finished. In general we're only
+	 * waiting for one packet (if any), so wake_up_all should be fine.
+	 */
+	wake_up_all(&ptl->tx.packet_wq);
+}
+
+static void ssh_ptl_tx_compl_error(struct ssh_packet *packet, int status)
+{
+	/* Transmission failure: Lock the packet and try to complete it. */
+	set_bit(SSH_PACKET_SF_LOCKED_BIT, &packet->state);
+	/* Ensure that state never gets zero. */
+	smp_mb__before_atomic();
+	clear_bit(SSH_PACKET_SF_TRANSMITTING_BIT, &packet->state);
+
+	ptl_err(packet->ptl, "ptl: transmission error: %d\n", status);
+	ptl_dbg(packet->ptl, "ptl: failed to transmit packet: %p\n", packet);
+
+	ssh_ptl_remove_and_complete(packet, status);
+
+	/*
+	 * Notify that a packet transmission has finished. In general we're only
+	 * waiting for one packet (if any), so wake_up_all should be fine.
+	 */
+	wake_up_all(&packet->ptl->tx.packet_wq);
+}
+
+static long ssh_ptl_tx_wait_packet(struct ssh_ptl *ptl)
+{
+	int status;
+
+	status = wait_for_completion_interruptible(&ptl->tx.thread_cplt_pkt);
+	reinit_completion(&ptl->tx.thread_cplt_pkt);
+
+	/*
+	 * Ensure completion is cleared before continuing to avoid lost update
+	 * problems.
+	 */
+	smp_mb__after_atomic();
+
+	return status;
+}
+
+static long ssh_ptl_tx_wait_transfer(struct ssh_ptl *ptl, long timeout)
+{
+	long status;
+
+	status = wait_for_completion_interruptible_timeout(&ptl->tx.thread_cplt_tx,
+							   timeout);
+	reinit_completion(&ptl->tx.thread_cplt_tx);
+
+	/*
+	 * Ensure completion is cleared before continuing to avoid lost update
+	 * problems.
+	 */
+	smp_mb__after_atomic();
+
+	return status;
+}
+
+static int ssh_ptl_tx_packet(struct ssh_ptl *ptl, struct ssh_packet *packet)
+{
+	long timeout = SSH_PTL_TX_TIMEOUT;
+	size_t offset = 0;
+
+	/* Note: Flush-packets don't have any data. */
+	if (unlikely(!packet->data.ptr))
+		return 0;
+
+	ptl_dbg(ptl, "tx: sending data (length: %zu)\n", packet->data.len);
+	print_hex_dump_debug("tx: ", DUMP_PREFIX_OFFSET, 16, 1,
+			     packet->data.ptr, packet->data.len, false);
+
+	do {
+		ssize_t status, len;
+		u8 *buf;
+
+		buf = packet->data.ptr + offset;
+		len = packet->data.len - offset;
+
+		status = serdev_device_write_buf(ptl->serdev, buf, len);
+		if (status < 0)
+			return status;
+
+		if (status == len)
+			return 0;
+
+		offset += status;
+
+		timeout = ssh_ptl_tx_wait_transfer(ptl, timeout);
+		if (kthread_should_stop() || !atomic_read(&ptl->tx.running))
+			return -ESHUTDOWN;
+
+		if (timeout < 0)
+			return -EINTR;
+
+		if (timeout == 0)
+			return -ETIMEDOUT;
+	} while (true);
+}
+
+static int ssh_ptl_tx_threadfn(void *data)
+{
+	struct ssh_ptl *ptl = data;
+
+	while (!kthread_should_stop() && atomic_read(&ptl->tx.running)) {
+		struct ssh_packet *packet;
+		int status;
+
+		/* Try to get the next packet. */
+		packet = ssh_ptl_tx_next(ptl);
+
+		/* If no packet can be processed, we are done. */
+		if (IS_ERR(packet)) {
+			ssh_ptl_tx_wait_packet(ptl);
+			continue;
+		}
+
+		/* Transfer and complete packet. */
+		status = ssh_ptl_tx_packet(ptl, packet);
+		if (status)
+			ssh_ptl_tx_compl_error(packet, status);
+		else
+			ssh_ptl_tx_compl_success(packet);
+
+		ssh_packet_put(packet);
+	}
+
+	return 0;
+}
+
+/**
+ * ssh_ptl_tx_wakeup_packet() - Wake up packet transmitter thread for new
+ * packet.
+ * @ptl: The packet transport layer.
+ *
+ * Wakes up the packet transmitter thread, notifying it that a new packet has
+ * arrived and is ready for transfer. If the packet transport layer has been
+ * shut down, calls to this function will be ignored.
+ */
+static void ssh_ptl_tx_wakeup_packet(struct ssh_ptl *ptl)
+{
+	if (test_bit(SSH_PTL_SF_SHUTDOWN_BIT, &ptl->state))
+		return;
+
+	complete(&ptl->tx.thread_cplt_pkt);
+}
+
+/**
+ * ssh_ptl_tx_start() - Start packet transmitter thread.
+ * @ptl: The packet transport layer.
+ *
+ * Return: Returns zero on success, a negative error code on failure.
+ */
+int ssh_ptl_tx_start(struct ssh_ptl *ptl)
+{
+	atomic_set_release(&ptl->tx.running, 1);
+
+	ptl->tx.thread = kthread_run(ssh_ptl_tx_threadfn, ptl, "ssam_serial_hub-tx");
+	if (IS_ERR(ptl->tx.thread))
+		return PTR_ERR(ptl->tx.thread);
+
+	return 0;
+}
+
+/**
+ * ssh_ptl_tx_stop() - Stop packet transmitter thread.
+ * @ptl: The packet transport layer.
+ *
+ * Return: Returns zero on success, a negative error code on failure.
+ */
+int ssh_ptl_tx_stop(struct ssh_ptl *ptl)
+{
+	int status = 0;
+
+	if (!IS_ERR_OR_NULL(ptl->tx.thread)) {
+		/* Tell thread to stop. */
+		atomic_set_release(&ptl->tx.running, 0);
+
+		/*
+		 * Wake up thread in case it is paused. Do not use wakeup
+		 * helpers as this may be called when the shutdown bit has
+		 * already been set.
+		 */
+		complete(&ptl->tx.thread_cplt_pkt);
+		complete(&ptl->tx.thread_cplt_tx);
+
+		/* Finally, wait for thread to stop. */
+		status = kthread_stop(ptl->tx.thread);
+		ptl->tx.thread = NULL;
+	}
+
+	return status;
+}
+
+static struct ssh_packet *ssh_ptl_ack_pop(struct ssh_ptl *ptl, u8 seq_id)
+{
+	struct ssh_packet *packet = ERR_PTR(-ENOENT);
+	struct ssh_packet *p, *n;
+
+	spin_lock(&ptl->pending.lock);
+	list_for_each_entry_safe(p, n, &ptl->pending.head, pending_node) {
+		/*
+		 * We generally expect packets to be in order, so first packet
+		 * to be added to pending is first to be sent, is first to be
+		 * ACKed.
+		 */
+		if (unlikely(ssh_packet_get_seq(p) != seq_id))
+			continue;
+
+		/*
+		 * In case we receive an ACK while handling a transmission
+		 * error completion. The packet will be removed shortly.
+		 */
+		if (unlikely(test_bit(SSH_PACKET_SF_LOCKED_BIT, &p->state))) {
+			packet = ERR_PTR(-EPERM);
+			break;
+		}
+
+		/*
+		 * Mark the packet as ACKed and remove it from pending by
+		 * removing its node and decrementing the pending counter.
+		 */
+		set_bit(SSH_PACKET_SF_ACKED_BIT, &p->state);
+		/* Ensure that state never gets zero. */
+		smp_mb__before_atomic();
+		clear_bit(SSH_PACKET_SF_PENDING_BIT, &p->state);
+
+		atomic_dec(&ptl->pending.count);
+		list_del(&p->pending_node);
+		packet = p;
+
+		break;
+	}
+	spin_unlock(&ptl->pending.lock);
+
+	return packet;
+}
+
+static void ssh_ptl_wait_until_transmitted(struct ssh_packet *packet)
+{
+	wait_event(packet->ptl->tx.packet_wq,
+		   test_bit(SSH_PACKET_SF_TRANSMITTED_BIT, &packet->state) ||
+		   test_bit(SSH_PACKET_SF_LOCKED_BIT, &packet->state));
+}
+
+static void ssh_ptl_acknowledge(struct ssh_ptl *ptl, u8 seq)
+{
+	struct ssh_packet *p;
+
+	p = ssh_ptl_ack_pop(ptl, seq);
+	if (IS_ERR(p)) {
+		if (PTR_ERR(p) == -ENOENT) {
+			/*
+			 * The packet has not been found in the set of pending
+			 * packets.
+			 */
+			ptl_warn(ptl, "ptl: received ACK for non-pending packet\n");
+		} else {
+			/*
+			 * The packet is pending, but we are not allowed to take
+			 * it because it has been locked.
+			 */
+			WARN_ON(PTR_ERR(p) != -EPERM);
+		}
+		return;
+	}
+
+	ptl_dbg(ptl, "ptl: received ACK for packet %p\n", p);
+
+	/*
+	 * It is possible that the packet has been transmitted, but the state
+	 * has not been updated from "transmitting" to "transmitted" yet.
+	 * In that case, we need to wait for this transition to occur in order
+	 * to determine between success or failure.
+	 *
+	 * On transmission failure, the packet will be locked after this call.
+	 * On success, the transmitted bit will be set.
+	 */
+	ssh_ptl_wait_until_transmitted(p);
+
+	/*
+	 * The packet will already be locked in case of a transmission error or
+	 * cancellation. Let the transmitter or cancellation issuer complete the
+	 * packet.
+	 */
+	if (unlikely(test_and_set_bit(SSH_PACKET_SF_LOCKED_BIT, &p->state))) {
+		if (unlikely(!test_bit(SSH_PACKET_SF_TRANSMITTED_BIT, &p->state)))
+			ptl_err(ptl, "ptl: received ACK before packet had been fully transmitted\n");
+
+		ssh_packet_put(p);
+		return;
+	}
+
+	ssh_ptl_remove_and_complete(p, 0);
+	ssh_packet_put(p);
+
+	if (atomic_read(&ptl->pending.count) < SSH_PTL_MAX_PENDING)
+		ssh_ptl_tx_wakeup_packet(ptl);
+}
+
+/**
+ * ssh_ptl_submit() - Submit a packet to the transport layer.
+ * @ptl: The packet transport layer to submit the packet to.
+ * @p:   The packet to submit.
+ *
+ * Submits a new packet to the transport layer, queuing it to be sent. This
+ * function should not be used for re-submission.
+ *
+ * Return: Returns zero on success, %-EINVAL if a packet field is invalid or
+ * the packet has been canceled prior to submission, %-EALREADY if the packet
+ * has already been submitted, or %-ESHUTDOWN if the packet transport layer
+ * has been shut down.
+ */
+int ssh_ptl_submit(struct ssh_ptl *ptl, struct ssh_packet *p)
+{
+	struct ssh_ptl *ptl_old;
+	int status;
+
+	/* Validate packet fields. */
+	if (test_bit(SSH_PACKET_TY_FLUSH_BIT, &p->state)) {
+		if (p->data.ptr || test_bit(SSH_PACKET_TY_SEQUENCED_BIT, &p->state))
+			return -EINVAL;
+	} else if (!p->data.ptr) {
+		return -EINVAL;
+	}
+
+	/*
+	 * The ptl reference only gets set on or before the first submission.
+	 * After the first submission, it has to be read-only.
+	 *
+	 * Note that ptl may already be set from upper-layer request
+	 * submission, thus we cannot expect it to be NULL.
+	 */
+	ptl_old = READ_ONCE(p->ptl);
+	if (!ptl_old)
+		WRITE_ONCE(p->ptl, ptl);
+	else if (WARN_ON(ptl_old != ptl))
+		return -EALREADY;	/* Submitted on different PTL. */
+
+	status = ssh_ptl_queue_push(p);
+	if (status)
+		return status;
+
+	if (!test_bit(SSH_PACKET_TY_BLOCKING_BIT, &p->state) ||
+	    (atomic_read(&ptl->pending.count) < SSH_PTL_MAX_PENDING))
+		ssh_ptl_tx_wakeup_packet(ptl);
+
+	return 0;
+}
+
+/*
+ * __ssh_ptl_resubmit() - Re-submit a packet to the transport layer.
+ * @packet: The packet to re-submit.
+ *
+ * Re-submits the given packet: Checks if it can be re-submitted and queues it
+ * if it can, resetting the packet timestamp in the process. Must be called
+ * with the pending lock held.
+ *
+ * Return: Returns %-ECANCELED if the packet has exceeded its number of tries,
+ * %-EINVAL if the packet has been locked, %-EALREADY if the packet is already
+ * on the queue, and %-ESHUTDOWN if the transmission layer has been shut down.
+ */
+static int __ssh_ptl_resubmit(struct ssh_packet *packet)
+{
+	int status;
+	u8 try;
+
+	lockdep_assert_held(&packet->ptl->pending.lock);
+
+	spin_lock(&packet->ptl->queue.lock);
+
+	/* Check if the packet is out of tries. */
+	try = ssh_packet_priority_get_try(packet->priority);
+	if (try >= SSH_PTL_MAX_PACKET_TRIES) {
+		spin_unlock(&packet->ptl->queue.lock);
+		return -ECANCELED;
+	}
+
+	status = __ssh_ptl_queue_push(packet);
+	if (status) {
+		/*
+		 * An error here indicates that the packet has either already
+		 * been queued, been locked, or the transport layer is being
+		 * shut down. In all cases: Ignore the error.
+		 */
+		spin_unlock(&packet->ptl->queue.lock);
+		return status;
+	}
+
+	packet->timestamp = KTIME_MAX;
+
+	spin_unlock(&packet->ptl->queue.lock);
+	return 0;
+}
+
+static void ssh_ptl_resubmit_pending(struct ssh_ptl *ptl)
+{
+	struct ssh_packet *p;
+	bool resub = false;
+
+	/*
+	 * Note: We deliberately do not remove/attempt to cancel and complete
+	 * packets that are out of tires in this function. The packet will be
+	 * eventually canceled and completed by the timeout. Removing the packet
+	 * here could lead to overly eager cancellation if the packet has not
+	 * been re-transmitted yet but the tries-counter already updated (i.e
+	 * ssh_ptl_tx_next() removed the packet from the queue and updated the
+	 * counter, but re-transmission for the last try has not actually
+	 * started yet).
+	 */
+
+	spin_lock(&ptl->pending.lock);
+
+	/* Re-queue all pending packets. */
+	list_for_each_entry(p, &ptl->pending.head, pending_node) {
+		/*
+		 * Re-submission fails if the packet is out of tries, has been
+		 * locked, is already queued, or the layer is being shut down.
+		 * No need to re-schedule tx-thread in those cases.
+		 */
+		if (!__ssh_ptl_resubmit(p))
+			resub = true;
+	}
+
+	spin_unlock(&ptl->pending.lock);
+
+	if (resub)
+		ssh_ptl_tx_wakeup_packet(ptl);
+}
+
+/**
+ * ssh_ptl_cancel() - Cancel a packet.
+ * @p: The packet to cancel.
+ *
+ * Cancels a packet. There are no guarantees on when completion and release
+ * callbacks will be called. This may occur during execution of this function
+ * or may occur at any point later.
+ *
+ * Note that it is not guaranteed that the packet will actually be canceled if
+ * the packet is concurrently completed by another process. The only guarantee
+ * of this function is that the packet will be completed (with success,
+ * failure, or cancellation) and released from the transport layer in a
+ * reasonable time-frame.
+ *
+ * May be called before the packet has been submitted, in which case any later
+ * packet submission fails.
+ */
+void ssh_ptl_cancel(struct ssh_packet *p)
+{
+	if (test_and_set_bit(SSH_PACKET_SF_CANCELED_BIT, &p->state))
+		return;
+
+	/*
+	 * Lock packet and commit with memory barrier. If this packet has
+	 * already been locked, it's going to be removed and completed by
+	 * another party, which should have precedence.
+	 */
+	if (test_and_set_bit(SSH_PACKET_SF_LOCKED_BIT, &p->state))
+		return;
+
+	/*
+	 * By marking the packet as locked and employing the implicit memory
+	 * barrier of test_and_set_bit, we have guaranteed that, at this point,
+	 * the packet cannot be added to the queue any more.
+	 *
+	 * In case the packet has never been submitted, packet->ptl is NULL. If
+	 * the packet is currently being submitted, packet->ptl may be NULL or
+	 * non-NULL. Due marking the packet as locked above and committing with
+	 * the memory barrier, we have guaranteed that, if packet->ptl is NULL,
+	 * the packet will never be added to the queue. If packet->ptl is
+	 * non-NULL, we don't have any guarantees.
+	 */
+
+	if (READ_ONCE(p->ptl)) {
+		ssh_ptl_remove_and_complete(p, -ECANCELED);
+
+		if (atomic_read(&p->ptl->pending.count) < SSH_PTL_MAX_PENDING)
+			ssh_ptl_tx_wakeup_packet(p->ptl);
+
+	} else if (!test_and_set_bit(SSH_PACKET_SF_COMPLETED_BIT, &p->state)) {
+		__ssh_ptl_complete(p, -ECANCELED);
+	}
+}
+
+/* Must be called with pending lock held */
+static ktime_t ssh_packet_get_expiration(struct ssh_packet *p, ktime_t timeout)
+{
+	lockdep_assert_held(&p->ptl->pending.lock);
+
+	if (p->timestamp != KTIME_MAX)
+		return ktime_add(p->timestamp, timeout);
+	else
+		return KTIME_MAX;
+}
+
+static void ssh_ptl_timeout_reap(struct work_struct *work)
+{
+	struct ssh_ptl *ptl = to_ssh_ptl(work, rtx_timeout.reaper.work);
+	struct ssh_packet *p, *n;
+	LIST_HEAD(claimed);
+	ktime_t now = ktime_get_coarse_boottime();
+	ktime_t timeout = ptl->rtx_timeout.timeout;
+	ktime_t next = KTIME_MAX;
+	bool resub = false;
+	int status;
+
+	/*
+	 * Mark reaper as "not pending". This is done before checking any
+	 * packets to avoid lost-update type problems.
+	 */
+	spin_lock(&ptl->rtx_timeout.lock);
+	ptl->rtx_timeout.expires = KTIME_MAX;
+	spin_unlock(&ptl->rtx_timeout.lock);
+
+	spin_lock(&ptl->pending.lock);
+
+	list_for_each_entry_safe(p, n, &ptl->pending.head, pending_node) {
+		ktime_t expires = ssh_packet_get_expiration(p, timeout);
+
+		/*
+		 * Check if the timeout hasn't expired yet. Find out next
+		 * expiration date to be handled after this run.
+		 */
+		if (ktime_after(expires, now)) {
+			next = ktime_before(expires, next) ? expires : next;
+			continue;
+		}
+
+		status = __ssh_ptl_resubmit(p);
+
+		/*
+		 * Re-submission fails if the packet is out of tries, has been
+		 * locked, is already queued, or the layer is being shut down.
+		 * No need to re-schedule tx-thread in those cases.
+		 */
+		if (!status)
+			resub = true;
+
+		/* Go to next packet if this packet is not out of tries. */
+		if (status != -ECANCELED)
+			continue;
+
+		/* No more tries left: Cancel the packet. */
+
+		/*
+		 * If someone else has locked the packet already, don't use it
+		 * and let the other party complete it.
+		 */
+		if (test_and_set_bit(SSH_PACKET_SF_LOCKED_BIT, &p->state))
+			continue;
+
+		/*
+		 * We have now marked the packet as locked. Thus it cannot be
+		 * added to the pending list again after we've removed it here.
+		 * We can therefore re-use the pending_node of this packet
+		 * temporarily.
+		 */
+
+		clear_bit(SSH_PACKET_SF_PENDING_BIT, &p->state);
+
+		atomic_dec(&ptl->pending.count);
+		list_del(&p->pending_node);
+
+		list_add_tail(&p->pending_node, &claimed);
+	}
+
+	spin_unlock(&ptl->pending.lock);
+
+	/* Cancel and complete the packet. */
+	list_for_each_entry_safe(p, n, &claimed, pending_node) {
+		if (!test_and_set_bit(SSH_PACKET_SF_COMPLETED_BIT, &p->state)) {
+			ssh_ptl_queue_remove(p);
+			__ssh_ptl_complete(p, -ETIMEDOUT);
+		}
+
+		/*
+		 * Drop the reference we've obtained by removing it from
+		 * the pending set.
+		 */
+		list_del(&p->pending_node);
+		ssh_packet_put(p);
+	}
+
+	/* Ensure that reaper doesn't run again immediately. */
+	next = max(next, ktime_add(now, SSH_PTL_PACKET_TIMEOUT_RESOLUTION));
+	if (next != KTIME_MAX)
+		ssh_ptl_timeout_reaper_mod(ptl, now, next);
+
+	if (resub)
+		ssh_ptl_tx_wakeup_packet(ptl);
+}
+
+static bool ssh_ptl_rx_retransmit_check(struct ssh_ptl *ptl, u8 seq)
+{
+	int i;
+
+	/*
+	 * Check if SEQ has been seen recently (i.e. packet was
+	 * re-transmitted and we should ignore it).
+	 */
+	for (i = 0; i < ARRAY_SIZE(ptl->rx.blocked.seqs); i++) {
+		if (likely(ptl->rx.blocked.seqs[i] != seq))
+			continue;
+
+		ptl_dbg(ptl, "ptl: ignoring repeated data packet\n");
+		return true;
+	}
+
+	/* Update list of blocked sequence IDs. */
+	ptl->rx.blocked.seqs[ptl->rx.blocked.offset] = seq;
+	ptl->rx.blocked.offset = (ptl->rx.blocked.offset + 1)
+				  % ARRAY_SIZE(ptl->rx.blocked.seqs);
+
+	return false;
+}
+
+static void ssh_ptl_rx_dataframe(struct ssh_ptl *ptl,
+				 const struct ssh_frame *frame,
+				 const struct ssam_span *payload)
+{
+	if (ssh_ptl_rx_retransmit_check(ptl, frame->seq))
+		return;
+
+	ptl->ops.data_received(ptl, payload);
+}
+
+static void ssh_ptl_send_ack(struct ssh_ptl *ptl, u8 seq)
+{
+	struct ssh_packet *packet;
+	struct ssam_span buf;
+	struct msgbuf msgb;
+	int status;
+
+	status = ssh_ctrl_packet_alloc(&packet, &buf, GFP_KERNEL);
+	if (status) {
+		ptl_err(ptl, "ptl: failed to allocate ACK packet\n");
+		return;
+	}
+
+	ssh_packet_init(packet, 0, SSH_PACKET_PRIORITY(ACK, 0),
+			&ssh_ptl_ctrl_packet_ops);
+
+	msgb_init(&msgb, buf.ptr, buf.len);
+	msgb_push_ack(&msgb, seq);
+	ssh_packet_set_data(packet, msgb.begin, msgb_bytes_used(&msgb));
+
+	ssh_ptl_submit(ptl, packet);
+	ssh_packet_put(packet);
+}
+
+static void ssh_ptl_send_nak(struct ssh_ptl *ptl)
+{
+	struct ssh_packet *packet;
+	struct ssam_span buf;
+	struct msgbuf msgb;
+	int status;
+
+	status = ssh_ctrl_packet_alloc(&packet, &buf, GFP_KERNEL);
+	if (status) {
+		ptl_err(ptl, "ptl: failed to allocate NAK packet\n");
+		return;
+	}
+
+	ssh_packet_init(packet, 0, SSH_PACKET_PRIORITY(NAK, 0),
+			&ssh_ptl_ctrl_packet_ops);
+
+	msgb_init(&msgb, buf.ptr, buf.len);
+	msgb_push_nak(&msgb);
+	ssh_packet_set_data(packet, msgb.begin, msgb_bytes_used(&msgb));
+
+	ssh_ptl_submit(ptl, packet);
+	ssh_packet_put(packet);
+}
+
+static size_t ssh_ptl_rx_eval(struct ssh_ptl *ptl, struct ssam_span *source)
+{
+	struct ssh_frame *frame;
+	struct ssam_span payload;
+	struct ssam_span aligned;
+	bool syn_found;
+	int status;
+
+	/* Find SYN. */
+	syn_found = sshp_find_syn(source, &aligned);
+
+	if (unlikely(aligned.ptr - source->ptr) > 0) {
+		ptl_warn(ptl, "rx: parser: invalid start of frame, skipping\n");
+
+		/*
+		 * Notes:
+		 * - This might send multiple NAKs in case the communication
+		 *   starts with an invalid SYN and is broken down into multiple
+		 *   pieces. This should generally be handled fine, we just
+		 *   might receive duplicate data in this case, which is
+		 *   detected when handling data frames.
+		 * - This path will also be executed on invalid CRCs: When an
+		 *   invalid CRC is encountered, the code below will skip data
+		 *   until directly after the SYN. This causes the search for
+		 *   the next SYN, which is generally not placed directly after
+		 *   the last one.
+		 *
+		 *   Open question: Should we send this in case of invalid
+		 *   payload CRCs if the frame-type is non-sequential (current
+		 *   implementation) or should we drop that frame without
+		 *   telling the EC?
+		 */
+		ssh_ptl_send_nak(ptl);
+	}
+
+	if (unlikely(!syn_found))
+		return aligned.ptr - source->ptr;
+
+	/* Parse and validate frame. */
+	status = sshp_parse_frame(&ptl->serdev->dev, &aligned, &frame, &payload,
+				  SSH_PTL_RX_BUF_LEN);
+	if (status)	/* Invalid frame: skip to next SYN. */
+		return aligned.ptr - source->ptr + sizeof(u16);
+	if (!frame)	/* Not enough data. */
+		return aligned.ptr - source->ptr;
+
+	switch (frame->type) {
+	case SSH_FRAME_TYPE_ACK:
+		ssh_ptl_acknowledge(ptl, frame->seq);
+		break;
+
+	case SSH_FRAME_TYPE_NAK:
+		ssh_ptl_resubmit_pending(ptl);
+		break;
+
+	case SSH_FRAME_TYPE_DATA_SEQ:
+		ssh_ptl_send_ack(ptl, frame->seq);
+		fallthrough;
+
+	case SSH_FRAME_TYPE_DATA_NSQ:
+		ssh_ptl_rx_dataframe(ptl, frame, &payload);
+		break;
+
+	default:
+		ptl_warn(ptl, "ptl: received frame with unknown type %#04x\n",
+			 frame->type);
+		break;
+	}
+
+	return aligned.ptr - source->ptr + SSH_MESSAGE_LENGTH(frame->len);
+}
+
+static int ssh_ptl_rx_threadfn(void *data)
+{
+	struct ssh_ptl *ptl = data;
+
+	while (true) {
+		struct ssam_span span;
+		size_t offs = 0;
+		size_t n;
+
+		wait_event_interruptible(ptl->rx.wq,
+					 !kfifo_is_empty(&ptl->rx.fifo) ||
+					 kthread_should_stop());
+		if (kthread_should_stop())
+			break;
+
+		/* Copy from fifo to evaluation buffer. */
+		n = sshp_buf_read_from_fifo(&ptl->rx.buf, &ptl->rx.fifo);
+
+		ptl_dbg(ptl, "rx: received data (size: %zu)\n", n);
+		print_hex_dump_debug("rx: ", DUMP_PREFIX_OFFSET, 16, 1,
+				     ptl->rx.buf.ptr + ptl->rx.buf.len - n,
+				     n, false);
+
+		/* Parse until we need more bytes or buffer is empty. */
+		while (offs < ptl->rx.buf.len) {
+			sshp_buf_span_from(&ptl->rx.buf, offs, &span);
+			n = ssh_ptl_rx_eval(ptl, &span);
+			if (n == 0)
+				break;	/* Need more bytes. */
+
+			offs += n;
+		}
+
+		/* Throw away the evaluated parts. */
+		sshp_buf_drop(&ptl->rx.buf, offs);
+	}
+
+	return 0;
+}
+
+static void ssh_ptl_rx_wakeup(struct ssh_ptl *ptl)
+{
+	wake_up(&ptl->rx.wq);
+}
+
+/**
+ * ssh_ptl_rx_start() - Start packet transport layer receiver thread.
+ * @ptl: The packet transport layer.
+ *
+ * Return: Returns zero on success, a negative error code on failure.
+ */
+int ssh_ptl_rx_start(struct ssh_ptl *ptl)
+{
+	if (ptl->rx.thread)
+		return 0;
+
+	ptl->rx.thread = kthread_run(ssh_ptl_rx_threadfn, ptl,
+				     "ssam_serial_hub-rx");
+	if (IS_ERR(ptl->rx.thread))
+		return PTR_ERR(ptl->rx.thread);
+
+	return 0;
+}
+
+/**
+ * ssh_ptl_rx_stop() - Stop packet transport layer receiver thread.
+ * @ptl: The packet transport layer.
+ *
+ * Return: Returns zero on success, a negative error code on failure.
+ */
+int ssh_ptl_rx_stop(struct ssh_ptl *ptl)
+{
+	int status = 0;
+
+	if (ptl->rx.thread) {
+		status = kthread_stop(ptl->rx.thread);
+		ptl->rx.thread = NULL;
+	}
+
+	return status;
+}
+
+/**
+ * ssh_ptl_rx_rcvbuf() - Push data from lower-layer transport to the packet
+ * layer.
+ * @ptl: The packet transport layer.
+ * @buf: Pointer to the data to push to the layer.
+ * @n:   Size of the data to push to the layer, in bytes.
+ *
+ * Pushes data from a lower-layer transport to the receiver fifo buffer of the
+ * packet layer and notifies the receiver thread. Calls to this function are
+ * ignored once the packet layer has been shut down.
+ *
+ * Return: Returns the number of bytes transferred (positive or zero) on
+ * success. Returns %-ESHUTDOWN if the packet layer has been shut down.
+ */
+int ssh_ptl_rx_rcvbuf(struct ssh_ptl *ptl, const u8 *buf, size_t n)
+{
+	int used;
+
+	if (test_bit(SSH_PTL_SF_SHUTDOWN_BIT, &ptl->state))
+		return -ESHUTDOWN;
+
+	used = kfifo_in(&ptl->rx.fifo, buf, n);
+	if (used)
+		ssh_ptl_rx_wakeup(ptl);
+
+	return used;
+}
+
+/**
+ * ssh_ptl_shutdown() - Shut down the packet transport layer.
+ * @ptl: The packet transport layer.
+ *
+ * Shuts down the packet transport layer, removing and canceling all queued
+ * and pending packets. Packets canceled by this operation will be completed
+ * with %-ESHUTDOWN as status. Receiver and transmitter threads will be
+ * stopped.
+ *
+ * As a result of this function, the transport layer will be marked as shut
+ * down. Submission of packets after the transport layer has been shut down
+ * will fail with %-ESHUTDOWN.
+ */
+void ssh_ptl_shutdown(struct ssh_ptl *ptl)
+{
+	LIST_HEAD(complete_q);
+	LIST_HEAD(complete_p);
+	struct ssh_packet *p, *n;
+	int status;
+
+	/* Ensure that no new packets (including ACK/NAK) can be submitted. */
+	set_bit(SSH_PTL_SF_SHUTDOWN_BIT, &ptl->state);
+	/*
+	 * Ensure that the layer gets marked as shut-down before actually
+	 * stopping it. In combination with the check in ssh_ptl_queue_push(),
+	 * this guarantees that no new packets can be added and all already
+	 * queued packets are properly canceled. In combination with the check
+	 * in ssh_ptl_rx_rcvbuf(), this guarantees that received data is
+	 * properly cut off.
+	 */
+	smp_mb__after_atomic();
+
+	status = ssh_ptl_rx_stop(ptl);
+	if (status)
+		ptl_err(ptl, "ptl: failed to stop receiver thread\n");
+
+	status = ssh_ptl_tx_stop(ptl);
+	if (status)
+		ptl_err(ptl, "ptl: failed to stop transmitter thread\n");
+
+	cancel_delayed_work_sync(&ptl->rtx_timeout.reaper);
+
+	/*
+	 * At this point, all threads have been stopped. This means that the
+	 * only references to packets from inside the system are in the queue
+	 * and pending set.
+	 *
+	 * Note: We still need locks here because someone could still be
+	 * canceling packets.
+	 *
+	 * Note 2: We can re-use queue_node (or pending_node) if we mark the
+	 * packet as locked an then remove it from the queue (or pending set
+	 * respectively). Marking the packet as locked avoids re-queuing
+	 * (which should already be prevented by having stopped the treads...)
+	 * and not setting QUEUED_BIT (or PENDING_BIT) prevents removal from a
+	 * new list via other threads (e.g. cancellation).
+	 *
+	 * Note 3: There may be overlap between complete_p and complete_q.
+	 * This is handled via test_and_set_bit() on the "completed" flag
+	 * (also handles cancellation).
+	 */
+
+	/* Mark queued packets as locked and move them to complete_q. */
+	spin_lock(&ptl->queue.lock);
+	list_for_each_entry_safe(p, n, &ptl->queue.head, queue_node) {
+		set_bit(SSH_PACKET_SF_LOCKED_BIT, &p->state);
+		/* Ensure that state does not get zero. */
+		smp_mb__before_atomic();
+		clear_bit(SSH_PACKET_SF_QUEUED_BIT, &p->state);
+
+		list_del(&p->queue_node);
+		list_add_tail(&p->queue_node, &complete_q);
+	}
+	spin_unlock(&ptl->queue.lock);
+
+	/* Mark pending packets as locked and move them to complete_p. */
+	spin_lock(&ptl->pending.lock);
+	list_for_each_entry_safe(p, n, &ptl->pending.head, pending_node) {
+		set_bit(SSH_PACKET_SF_LOCKED_BIT, &p->state);
+		/* Ensure that state does not get zero. */
+		smp_mb__before_atomic();
+		clear_bit(SSH_PACKET_SF_PENDING_BIT, &p->state);
+
+		list_del(&p->pending_node);
+		list_add_tail(&p->pending_node, &complete_q);
+	}
+	atomic_set(&ptl->pending.count, 0);
+	spin_unlock(&ptl->pending.lock);
+
+	/* Complete and drop packets on complete_q. */
+	list_for_each_entry(p, &complete_q, queue_node) {
+		if (!test_and_set_bit(SSH_PACKET_SF_COMPLETED_BIT, &p->state))
+			__ssh_ptl_complete(p, -ESHUTDOWN);
+
+		ssh_packet_put(p);
+	}
+
+	/* Complete and drop packets on complete_p. */
+	list_for_each_entry(p, &complete_p, pending_node) {
+		if (!test_and_set_bit(SSH_PACKET_SF_COMPLETED_BIT, &p->state))
+			__ssh_ptl_complete(p, -ESHUTDOWN);
+
+		ssh_packet_put(p);
+	}
+
+	/*
+	 * At this point we have guaranteed that the system doesn't reference
+	 * any packets any more.
+	 */
+}
+
+/**
+ * ssh_ptl_init() - Initialize packet transport layer.
+ * @ptl:    The packet transport layer to initialize.
+ * @serdev: The underlying serial device, i.e. the lower-level transport.
+ * @ops:    Packet layer operations.
+ *
+ * Initializes the given packet transport layer. Transmitter and receiver
+ * threads must be started separately via ssh_ptl_tx_start() and
+ * ssh_ptl_rx_start(), after the packet-layer has been initialized and the
+ * lower-level transport layer has been set up.
+ *
+ * Return: Returns zero on success and a nonzero error code on failure.
+ */
+int ssh_ptl_init(struct ssh_ptl *ptl, struct serdev_device *serdev,
+		 struct ssh_ptl_ops *ops)
+{
+	int i, status;
+
+	ptl->serdev = serdev;
+	ptl->state = 0;
+
+	spin_lock_init(&ptl->queue.lock);
+	INIT_LIST_HEAD(&ptl->queue.head);
+
+	spin_lock_init(&ptl->pending.lock);
+	INIT_LIST_HEAD(&ptl->pending.head);
+	atomic_set_release(&ptl->pending.count, 0);
+
+	ptl->tx.thread = NULL;
+	atomic_set(&ptl->tx.running, 0);
+	init_completion(&ptl->tx.thread_cplt_pkt);
+	init_completion(&ptl->tx.thread_cplt_tx);
+	init_waitqueue_head(&ptl->tx.packet_wq);
+
+	ptl->rx.thread = NULL;
+	init_waitqueue_head(&ptl->rx.wq);
+
+	spin_lock_init(&ptl->rtx_timeout.lock);
+	ptl->rtx_timeout.timeout = SSH_PTL_PACKET_TIMEOUT;
+	ptl->rtx_timeout.expires = KTIME_MAX;
+	INIT_DELAYED_WORK(&ptl->rtx_timeout.reaper, ssh_ptl_timeout_reap);
+
+	ptl->ops = *ops;
+
+	/* Initialize list of recent/blocked SEQs with invalid sequence IDs. */
+	for (i = 0; i < ARRAY_SIZE(ptl->rx.blocked.seqs); i++)
+		ptl->rx.blocked.seqs[i] = U16_MAX;
+	ptl->rx.blocked.offset = 0;
+
+	status = kfifo_alloc(&ptl->rx.fifo, SSH_PTL_RX_FIFO_LEN, GFP_KERNEL);
+	if (status)
+		return status;
+
+	status = sshp_buf_alloc(&ptl->rx.buf, SSH_PTL_RX_BUF_LEN, GFP_KERNEL);
+	if (status)
+		kfifo_free(&ptl->rx.fifo);
+
+	return status;
+}
+
+/**
+ * ssh_ptl_destroy() - Deinitialize packet transport layer.
+ * @ptl: The packet transport layer to deinitialize.
+ *
+ * Deinitializes the given packet transport layer and frees resources
+ * associated with it. If receiver and/or transmitter threads have been
+ * started, the layer must first be shut down via ssh_ptl_shutdown() before
+ * this function can be called.
+ */
+void ssh_ptl_destroy(struct ssh_ptl *ptl)
+{
+	kfifo_free(&ptl->rx.fifo);
+	sshp_buf_free(&ptl->rx.buf);
+}
diff --git a/drivers/platform/surface/aggregator/ssh_packet_layer.h b/drivers/platform/surface/aggregator/ssh_packet_layer.h
new file mode 100644
index 000000000000..058f111292ca
--- /dev/null
+++ b/drivers/platform/surface/aggregator/ssh_packet_layer.h
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SSH packet transport layer.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _SURFACE_AGGREGATOR_SSH_PACKET_LAYER_H
+#define _SURFACE_AGGREGATOR_SSH_PACKET_LAYER_H
+
+#include <linux/atomic.h>
+#include <linux/kfifo.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/serdev.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+
+#include <linux/surface_aggregator/serial_hub.h>
+#include "ssh_parser.h"
+
+/**
+ * enum ssh_ptl_state_flags - State-flags for &struct ssh_ptl.
+ *
+ * @SSH_PTL_SF_SHUTDOWN_BIT:
+ *	Indicates that the packet transport layer has been shut down or is
+ *	being shut down and should not accept any new packets/data.
+ */
+enum ssh_ptl_state_flags {
+	SSH_PTL_SF_SHUTDOWN_BIT,
+};
+
+/**
+ * struct ssh_ptl_ops - Callback operations for packet transport layer.
+ * @data_received: Function called when a data-packet has been received. Both,
+ *                 the packet layer on which the packet has been received and
+ *                 the packet's payload data are provided to this function.
+ */
+struct ssh_ptl_ops {
+	void (*data_received)(struct ssh_ptl *p, const struct ssam_span *data);
+};
+
+/**
+ * struct ssh_ptl - SSH packet transport layer.
+ * @serdev:        Serial device providing the underlying data transport.
+ * @state:         State(-flags) of the transport layer.
+ * @queue:         Packet submission queue.
+ * @queue.lock:    Lock for modifying the packet submission queue.
+ * @queue.head:    List-head of the packet submission queue.
+ * @pending:       Set/list of pending packets.
+ * @pending.lock:  Lock for modifying the pending set.
+ * @pending.head:  List-head of the pending set/list.
+ * @pending.count: Number of currently pending packets.
+ * @tx:            Transmitter subsystem.
+ * @tx.running:    Flag indicating (desired) transmitter thread state.
+ * @tx.thread:     Transmitter thread.
+ * @tx.thread_cplt_tx:  Completion for transmitter thread waiting on transfer.
+ * @tx.thread_cplt_pkt: Completion for transmitter thread waiting on packets.
+ * @tx.packet_wq:  Waitqueue-head for packet transmit completion.
+ * @rx:            Receiver subsystem.
+ * @rx.thread:     Receiver thread.
+ * @rx.wq:         Waitqueue-head for receiver thread.
+ * @rx.fifo:       Buffer for receiving data/pushing data to receiver thread.
+ * @rx.buf:        Buffer for evaluating data on receiver thread.
+ * @rx.blocked:    List of recent/blocked sequence IDs to detect retransmission.
+ * @rx.blocked.seqs:   Array of blocked sequence IDs.
+ * @rx.blocked.offset: Offset indicating where a new ID should be inserted.
+ * @rtx_timeout:   Retransmission timeout subsystem.
+ * @rtx_timeout.lock:    Lock for modifying the retransmission timeout reaper.
+ * @rtx_timeout.timeout: Timeout interval for retransmission.
+ * @rtx_timeout.expires: Time specifying when the reaper work is next scheduled.
+ * @rtx_timeout.reaper:  Work performing timeout checks and subsequent actions.
+ * @ops:           Packet layer operations.
+ */
+struct ssh_ptl {
+	struct serdev_device *serdev;
+	unsigned long state;
+
+	struct {
+		spinlock_t lock;
+		struct list_head head;
+	} queue;
+
+	struct {
+		spinlock_t lock;
+		struct list_head head;
+		atomic_t count;
+	} pending;
+
+	struct {
+		atomic_t running;
+		struct task_struct *thread;
+		struct completion thread_cplt_tx;
+		struct completion thread_cplt_pkt;
+		struct wait_queue_head packet_wq;
+	} tx;
+
+	struct {
+		struct task_struct *thread;
+		struct wait_queue_head wq;
+		struct kfifo fifo;
+		struct sshp_buf buf;
+
+		struct {
+			u16 seqs[8];
+			u16 offset;
+		} blocked;
+	} rx;
+
+	struct {
+		spinlock_t lock;
+		ktime_t timeout;
+		ktime_t expires;
+		struct delayed_work reaper;
+	} rtx_timeout;
+
+	struct ssh_ptl_ops ops;
+};
+
+#define __ssam_prcond(func, p, fmt, ...)		\
+	do {						\
+		typeof(p) __p = (p);			\
+							\
+		if (__p)				\
+			func(__p, fmt, ##__VA_ARGS__);	\
+	} while (0)
+
+#define ptl_dbg(p, fmt, ...)  dev_dbg(&(p)->serdev->dev, fmt, ##__VA_ARGS__)
+#define ptl_info(p, fmt, ...) dev_info(&(p)->serdev->dev, fmt, ##__VA_ARGS__)
+#define ptl_warn(p, fmt, ...) dev_warn(&(p)->serdev->dev, fmt, ##__VA_ARGS__)
+#define ptl_err(p, fmt, ...)  dev_err(&(p)->serdev->dev, fmt, ##__VA_ARGS__)
+#define ptl_dbg_cond(p, fmt, ...) __ssam_prcond(ptl_dbg, p, fmt, ##__VA_ARGS__)
+
+#define to_ssh_ptl(ptr, member) \
+	container_of(ptr, struct ssh_ptl, member)
+
+int ssh_ptl_init(struct ssh_ptl *ptl, struct serdev_device *serdev,
+		 struct ssh_ptl_ops *ops);
+
+void ssh_ptl_destroy(struct ssh_ptl *ptl);
+
+/**
+ * ssh_ptl_get_device() - Get device associated with packet transport layer.
+ * @ptl: The packet transport layer.
+ *
+ * Return: Returns the device on which the given packet transport layer builds
+ * upon.
+ */
+static inline struct device *ssh_ptl_get_device(struct ssh_ptl *ptl)
+{
+	return ptl->serdev ? &ptl->serdev->dev : NULL;
+}
+
+int ssh_ptl_tx_start(struct ssh_ptl *ptl);
+int ssh_ptl_tx_stop(struct ssh_ptl *ptl);
+int ssh_ptl_rx_start(struct ssh_ptl *ptl);
+int ssh_ptl_rx_stop(struct ssh_ptl *ptl);
+void ssh_ptl_shutdown(struct ssh_ptl *ptl);
+
+int ssh_ptl_submit(struct ssh_ptl *ptl, struct ssh_packet *p);
+void ssh_ptl_cancel(struct ssh_packet *p);
+
+int ssh_ptl_rx_rcvbuf(struct ssh_ptl *ptl, const u8 *buf, size_t n);
+
+/**
+ * ssh_ptl_tx_wakeup_transfer() - Wake up packet transmitter thread for
+ * transfer.
+ * @ptl: The packet transport layer.
+ *
+ * Wakes up the packet transmitter thread, notifying it that the underlying
+ * transport has more space for data to be transmitted. If the packet
+ * transport layer has been shut down, calls to this function will be ignored.
+ */
+static inline void ssh_ptl_tx_wakeup_transfer(struct ssh_ptl *ptl)
+{
+	if (test_bit(SSH_PTL_SF_SHUTDOWN_BIT, &ptl->state))
+		return;
+
+	complete(&ptl->tx.thread_cplt_tx);
+}
+
+void ssh_packet_init(struct ssh_packet *packet, unsigned long type,
+		     u8 priority, const struct ssh_packet_ops *ops);
+
+#endif /* _SURFACE_AGGREGATOR_SSH_PACKET_LAYER_H */
diff --git a/drivers/platform/surface/aggregator/ssh_parser.c b/drivers/platform/surface/aggregator/ssh_parser.c
new file mode 100644
index 000000000000..e2dead8de94a
--- /dev/null
+++ b/drivers/platform/surface/aggregator/ssh_parser.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * SSH message parser.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#include <asm/unaligned.h>
+#include <linux/compiler.h>
+#include <linux/device.h>
+#include <linux/types.h>
+
+#include <linux/surface_aggregator/serial_hub.h>
+#include "ssh_parser.h"
+
+/**
+ * sshp_validate_crc() - Validate a CRC in raw message data.
+ * @src: The span of data over which the CRC should be computed.
+ * @crc: The pointer to the expected u16 CRC value.
+ *
+ * Computes the CRC of the provided data span (@src), compares it to the CRC
+ * stored at the given address (@crc), and returns the result of this
+ * comparison, i.e. %true if equal. This function is intended to run on raw
+ * input/message data.
+ *
+ * Return: Returns %true if the computed CRC matches the stored CRC, %false
+ * otherwise.
+ */
+static bool sshp_validate_crc(const struct ssam_span *src, const u8 *crc)
+{
+	u16 actual = ssh_crc(src->ptr, src->len);
+	u16 expected = get_unaligned_le16(crc);
+
+	return actual == expected;
+}
+
+/**
+ * sshp_starts_with_syn() - Check if the given data starts with SSH SYN bytes.
+ * @src: The data span to check the start of.
+ */
+static bool sshp_starts_with_syn(const struct ssam_span *src)
+{
+	return src->len >= 2 && get_unaligned_le16(src->ptr) == SSH_MSG_SYN;
+}
+
+/**
+ * sshp_find_syn() - Find SSH SYN bytes in the given data span.
+ * @src: The data span to search in.
+ * @rem: The span (output) indicating the remaining data, starting with SSH
+ *       SYN bytes, if found.
+ *
+ * Search for SSH SYN bytes in the given source span. If found, set the @rem
+ * span to the remaining data, starting with the first SYN bytes and capped by
+ * the source span length, and return %true. This function does not copy any
+ * data, but rather only sets pointers to the respective start addresses and
+ * length values.
+ *
+ * If no SSH SYN bytes could be found, set the @rem span to the zero-length
+ * span at the end of the source span and return %false.
+ *
+ * If partial SSH SYN bytes could be found at the end of the source span, set
+ * the @rem span to cover these partial SYN bytes, capped by the end of the
+ * source span, and return %false. This function should then be re-run once
+ * more data is available.
+ *
+ * Return: Returns %true if a complete SSH SYN sequence could be found,
+ * %false otherwise.
+ */
+bool sshp_find_syn(const struct ssam_span *src, struct ssam_span *rem)
+{
+	size_t i;
+
+	for (i = 0; i < src->len - 1; i++) {
+		if (likely(get_unaligned_le16(src->ptr + i) == SSH_MSG_SYN)) {
+			rem->ptr = src->ptr + i;
+			rem->len = src->len - i;
+			return true;
+		}
+	}
+
+	if (unlikely(src->ptr[src->len - 1] == (SSH_MSG_SYN & 0xff))) {
+		rem->ptr = src->ptr + src->len - 1;
+		rem->len = 1;
+		return false;
+	}
+
+	rem->ptr = src->ptr + src->len;
+	rem->len = 0;
+	return false;
+}
+
+/**
+ * sshp_parse_frame() - Parse SSH frame.
+ * @dev: The device used for logging.
+ * @source: The source to parse from.
+ * @frame: The parsed frame (output).
+ * @payload: The parsed payload (output).
+ * @maxlen: The maximum supported message length.
+ *
+ * Parses and validates a SSH frame, including its payload, from the given
+ * source. Sets the provided @frame pointer to the start of the frame and
+ * writes the limits of the frame payload to the provided @payload span
+ * pointer.
+ *
+ * This function does not copy any data, but rather only validates the message
+ * data and sets pointers (and length values) to indicate the respective parts.
+ *
+ * If no complete SSH frame could be found, the frame pointer will be set to
+ * the %NULL pointer and the payload span will be set to the null span (start
+ * pointer %NULL, size zero).
+ *
+ * Return: Returns zero on success or if the frame is incomplete, %-ENOMSG if
+ * the start of the message is invalid, %-EBADMSG if any (frame-header or
+ * payload) CRC is invalid, or %-EMSGSIZE if the SSH message is bigger than
+ * the maximum message length specified in the @maxlen parameter.
+ */
+int sshp_parse_frame(const struct device *dev, const struct ssam_span *source,
+		     struct ssh_frame **frame, struct ssam_span *payload,
+		     size_t maxlen)
+{
+	struct ssam_span sf;
+	struct ssam_span sp;
+
+	/* Initialize output. */
+	*frame = NULL;
+	payload->ptr = NULL;
+	payload->len = 0;
+
+	if (!sshp_starts_with_syn(source)) {
+		dev_warn(dev, "rx: parser: invalid start of frame\n");
+		return -ENOMSG;
+	}
+
+	/* Check for minimum packet length. */
+	if (unlikely(source->len < SSH_MESSAGE_LENGTH(0))) {
+		dev_dbg(dev, "rx: parser: not enough data for frame\n");
+		return 0;
+	}
+
+	/* Pin down frame. */
+	sf.ptr = source->ptr + sizeof(u16);
+	sf.len = sizeof(struct ssh_frame);
+
+	/* Validate frame CRC. */
+	if (unlikely(!sshp_validate_crc(&sf, sf.ptr + sf.len))) {
+		dev_warn(dev, "rx: parser: invalid frame CRC\n");
+		return -EBADMSG;
+	}
+
+	/* Ensure packet does not exceed maximum length. */
+	sp.len = get_unaligned_le16(&((struct ssh_frame *)sf.ptr)->len);
+	if (unlikely(SSH_MESSAGE_LENGTH(sp.len) > maxlen)) {
+		dev_warn(dev, "rx: parser: frame too large: %llu bytes\n",
+			 SSH_MESSAGE_LENGTH(sp.len));
+		return -EMSGSIZE;
+	}
+
+	/* Pin down payload. */
+	sp.ptr = sf.ptr + sf.len + sizeof(u16);
+
+	/* Check for frame + payload length. */
+	if (source->len < SSH_MESSAGE_LENGTH(sp.len)) {
+		dev_dbg(dev, "rx: parser: not enough data for payload\n");
+		return 0;
+	}
+
+	/* Validate payload CRC. */
+	if (unlikely(!sshp_validate_crc(&sp, sp.ptr + sp.len))) {
+		dev_warn(dev, "rx: parser: invalid payload CRC\n");
+		return -EBADMSG;
+	}
+
+	*frame = (struct ssh_frame *)sf.ptr;
+	*payload = sp;
+
+	dev_dbg(dev, "rx: parser: valid frame found (type: %#04x, len: %u)\n",
+		(*frame)->type, (*frame)->len);
+
+	return 0;
+}
+
+/**
+ * sshp_parse_command() - Parse SSH command frame payload.
+ * @dev: The device used for logging.
+ * @source: The source to parse from.
+ * @command: The parsed command (output).
+ * @command_data: The parsed command data/payload (output).
+ *
+ * Parses and validates a SSH command frame payload. Sets the @command pointer
+ * to the command header and the @command_data span to the command data (i.e.
+ * payload of the command). This will result in a zero-length span if the
+ * command does not have any associated data/payload. This function does not
+ * check the frame-payload-type field, which should be checked by the caller
+ * before calling this function.
+ *
+ * The @source parameter should be the complete frame payload, e.g. returned
+ * by the sshp_parse_frame() command.
+ *
+ * This function does not copy any data, but rather only validates the frame
+ * payload data and sets pointers (and length values) to indicate the
+ * respective parts.
+ *
+ * Return: Returns zero on success or %-ENOMSG if @source does not represent a
+ * valid command-type frame payload, i.e. is too short.
+ */
+int sshp_parse_command(const struct device *dev, const struct ssam_span *source,
+		       struct ssh_command **command,
+		       struct ssam_span *command_data)
+{
+	/* Check for minimum length. */
+	if (unlikely(source->len < sizeof(struct ssh_command))) {
+		*command = NULL;
+		command_data->ptr = NULL;
+		command_data->len = 0;
+
+		dev_err(dev, "rx: parser: command payload is too short\n");
+		return -ENOMSG;
+	}
+
+	*command = (struct ssh_command *)source->ptr;
+	command_data->ptr = source->ptr + sizeof(struct ssh_command);
+	command_data->len = source->len - sizeof(struct ssh_command);
+
+	dev_dbg(dev, "rx: parser: valid command found (tc: %#04x, cid: %#04x)\n",
+		(*command)->tc, (*command)->cid);
+
+	return 0;
+}
diff --git a/drivers/platform/surface/aggregator/ssh_parser.h b/drivers/platform/surface/aggregator/ssh_parser.h
new file mode 100644
index 000000000000..63c38d350988
--- /dev/null
+++ b/drivers/platform/surface/aggregator/ssh_parser.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SSH message parser.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _SURFACE_AGGREGATOR_SSH_PARSER_H
+#define _SURFACE_AGGREGATOR_SSH_PARSER_H
+
+#include <linux/device.h>
+#include <linux/kfifo.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <linux/surface_aggregator/serial_hub.h>
+
+/**
+ * struct sshp_buf - Parser buffer for SSH messages.
+ * @ptr: Pointer to the beginning of the buffer.
+ * @len: Number of bytes used in the buffer.
+ * @cap: Maximum capacity of the buffer.
+ */
+struct sshp_buf {
+	u8    *ptr;
+	size_t len;
+	size_t cap;
+};
+
+/**
+ * sshp_buf_init() - Initialize a SSH parser buffer.
+ * @buf: The buffer to initialize.
+ * @ptr: The memory backing the buffer.
+ * @cap: The length of the memory backing the buffer, i.e. its capacity.
+ *
+ * Initializes the buffer with the given memory as backing and set its used
+ * length to zero.
+ */
+static inline void sshp_buf_init(struct sshp_buf *buf, u8 *ptr, size_t cap)
+{
+	buf->ptr = ptr;
+	buf->len = 0;
+	buf->cap = cap;
+}
+
+/**
+ * sshp_buf_alloc() - Allocate and initialize a SSH parser buffer.
+ * @buf:   The buffer to initialize/allocate to.
+ * @cap:   The desired capacity of the buffer.
+ * @flags: The flags used for allocating the memory.
+ *
+ * Allocates @cap bytes and initializes the provided buffer struct with the
+ * allocated memory.
+ *
+ * Return: Returns zero on success and %-ENOMEM if allocation failed.
+ */
+static inline int sshp_buf_alloc(struct sshp_buf *buf, size_t cap, gfp_t flags)
+{
+	u8 *ptr;
+
+	ptr = kzalloc(cap, flags);
+	if (!ptr)
+		return -ENOMEM;
+
+	sshp_buf_init(buf, ptr, cap);
+	return 0;
+}
+
+/**
+ * sshp_buf_free() - Free a SSH parser buffer.
+ * @buf: The buffer to free.
+ *
+ * Frees a SSH parser buffer by freeing the memory backing it and then
+ * resetting its pointer to %NULL and length and capacity to zero. Intended to
+ * free a buffer previously allocated with sshp_buf_alloc().
+ */
+static inline void sshp_buf_free(struct sshp_buf *buf)
+{
+	kfree(buf->ptr);
+	buf->ptr = NULL;
+	buf->len = 0;
+	buf->cap = 0;
+}
+
+/**
+ * sshp_buf_drop() - Drop data from the beginning of the buffer.
+ * @buf: The buffer to drop data from.
+ * @n:   The number of bytes to drop.
+ *
+ * Drops the first @n bytes from the buffer. Re-aligns any remaining data to
+ * the beginning of the buffer.
+ */
+static inline void sshp_buf_drop(struct sshp_buf *buf, size_t n)
+{
+	memmove(buf->ptr, buf->ptr + n, buf->len - n);
+	buf->len -= n;
+}
+
+/**
+ * sshp_buf_read_from_fifo() - Transfer data from a fifo to the buffer.
+ * @buf:  The buffer to write the data into.
+ * @fifo: The fifo to read the data from.
+ *
+ * Transfers the data contained in the fifo to the buffer, removing it from
+ * the fifo. This function will try to transfer as much data as possible,
+ * limited either by the remaining space in the buffer or by the number of
+ * bytes available in the fifo.
+ *
+ * Return: Returns the number of bytes transferred.
+ */
+static inline size_t sshp_buf_read_from_fifo(struct sshp_buf *buf,
+					     struct kfifo *fifo)
+{
+	size_t n;
+
+	n =  kfifo_out(fifo, buf->ptr + buf->len, buf->cap - buf->len);
+	buf->len += n;
+
+	return n;
+}
+
+/**
+ * sshp_buf_span_from() - Initialize a span from the given buffer and offset.
+ * @buf:    The buffer to create the span from.
+ * @offset: The offset in the buffer at which the span should start.
+ * @span:   The span to initialize (output).
+ *
+ * Initializes the provided span to point to the memory at the given offset in
+ * the buffer, with the length of the span being capped by the number of bytes
+ * used in the buffer after the offset (i.e. bytes remaining after the
+ * offset).
+ *
+ * Warning: This function does not validate that @offset is less than or equal
+ * to the number of bytes used in the buffer or the buffer capacity. This must
+ * be guaranteed by the caller.
+ */
+static inline void sshp_buf_span_from(struct sshp_buf *buf, size_t offset,
+				      struct ssam_span *span)
+{
+	span->ptr = buf->ptr + offset;
+	span->len = buf->len - offset;
+}
+
+bool sshp_find_syn(const struct ssam_span *src, struct ssam_span *rem);
+
+int sshp_parse_frame(const struct device *dev, const struct ssam_span *source,
+		     struct ssh_frame **frame, struct ssam_span *payload,
+		     size_t maxlen);
+
+int sshp_parse_command(const struct device *dev, const struct ssam_span *source,
+		       struct ssh_command **command,
+		       struct ssam_span *command_data);
+
+#endif /* _SURFACE_AGGREGATOR_SSH_PARSER_h */
diff --git a/drivers/platform/surface/aggregator/ssh_request_layer.c b/drivers/platform/surface/aggregator/ssh_request_layer.c
new file mode 100644
index 000000000000..66c839a995f3
--- /dev/null
+++ b/drivers/platform/surface/aggregator/ssh_request_layer.c
@@ -0,0 +1,1211 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * SSH request transport layer.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#include <asm/unaligned.h>
+#include <linux/atomic.h>
+#include <linux/completion.h>
+#include <linux/ktime.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include <linux/surface_aggregator/serial_hub.h>
+#include <linux/surface_aggregator/controller.h>
+
+#include "ssh_packet_layer.h"
+#include "ssh_request_layer.h"
+
+/*
+ * SSH_RTL_REQUEST_TIMEOUT - Request timeout.
+ *
+ * Timeout as ktime_t delta for request responses. If we have not received a
+ * response in this time-frame after finishing the underlying packet
+ * transmission, the request will be completed with %-ETIMEDOUT as status
+ * code.
+ */
+#define SSH_RTL_REQUEST_TIMEOUT			ms_to_ktime(3000)
+
+/*
+ * SSH_RTL_REQUEST_TIMEOUT_RESOLUTION - Request timeout granularity.
+ *
+ * Time-resolution for timeouts. Should be larger than one jiffy to avoid
+ * direct re-scheduling of reaper work_struct.
+ */
+#define SSH_RTL_REQUEST_TIMEOUT_RESOLUTION	ms_to_ktime(max(2000 / HZ, 50))
+
+/*
+ * SSH_RTL_MAX_PENDING - Maximum number of pending requests.
+ *
+ * Maximum number of requests concurrently waiting to be completed (i.e.
+ * waiting for the corresponding packet transmission to finish if they don't
+ * have a response or waiting for a response if they have one).
+ */
+#define SSH_RTL_MAX_PENDING		3
+
+/*
+ * SSH_RTL_TX_BATCH - Maximum number of requests processed per work execution.
+ * Used to prevent livelocking of the workqueue. Value chosen via educated
+ * guess, may be adjusted.
+ */
+#define SSH_RTL_TX_BATCH		10
+
+static u16 ssh_request_get_rqid(struct ssh_request *rqst)
+{
+	return get_unaligned_le16(rqst->packet.data.ptr
+				  + SSH_MSGOFFSET_COMMAND(rqid));
+}
+
+static u32 ssh_request_get_rqid_safe(struct ssh_request *rqst)
+{
+	if (!rqst->packet.data.ptr)
+		return U32_MAX;
+
+	return ssh_request_get_rqid(rqst);
+}
+
+static void ssh_rtl_queue_remove(struct ssh_request *rqst)
+{
+	struct ssh_rtl *rtl = ssh_request_rtl(rqst);
+
+	spin_lock(&rtl->queue.lock);
+
+	if (!test_and_clear_bit(SSH_REQUEST_SF_QUEUED_BIT, &rqst->state)) {
+		spin_unlock(&rtl->queue.lock);
+		return;
+	}
+
+	list_del(&rqst->node);
+
+	spin_unlock(&rtl->queue.lock);
+	ssh_request_put(rqst);
+}
+
+static bool ssh_rtl_queue_empty(struct ssh_rtl *rtl)
+{
+	bool empty;
+
+	spin_lock(&rtl->queue.lock);
+	empty = list_empty(&rtl->queue.head);
+	spin_unlock(&rtl->queue.lock);
+
+	return empty;
+}
+
+static void ssh_rtl_pending_remove(struct ssh_request *rqst)
+{
+	struct ssh_rtl *rtl = ssh_request_rtl(rqst);
+
+	spin_lock(&rtl->pending.lock);
+
+	if (!test_and_clear_bit(SSH_REQUEST_SF_PENDING_BIT, &rqst->state)) {
+		spin_unlock(&rtl->pending.lock);
+		return;
+	}
+
+	atomic_dec(&rtl->pending.count);
+	list_del(&rqst->node);
+
+	spin_unlock(&rtl->pending.lock);
+
+	ssh_request_put(rqst);
+}
+
+static int ssh_rtl_tx_pending_push(struct ssh_request *rqst)
+{
+	struct ssh_rtl *rtl = ssh_request_rtl(rqst);
+
+	spin_lock(&rtl->pending.lock);
+
+	if (test_bit(SSH_REQUEST_SF_LOCKED_BIT, &rqst->state)) {
+		spin_unlock(&rtl->pending.lock);
+		return -EINVAL;
+	}
+
+	if (test_and_set_bit(SSH_REQUEST_SF_PENDING_BIT, &rqst->state)) {
+		spin_unlock(&rtl->pending.lock);
+		return -EALREADY;
+	}
+
+	atomic_inc(&rtl->pending.count);
+	list_add_tail(&ssh_request_get(rqst)->node, &rtl->pending.head);
+
+	spin_unlock(&rtl->pending.lock);
+	return 0;
+}
+
+static void ssh_rtl_complete_with_status(struct ssh_request *rqst, int status)
+{
+	struct ssh_rtl *rtl = ssh_request_rtl(rqst);
+
+	/* rtl/ptl may not be set if we're canceling before submitting. */
+	rtl_dbg_cond(rtl, "rtl: completing request (rqid: %#06x, status: %d)\n",
+		     ssh_request_get_rqid_safe(rqst), status);
+
+	rqst->ops->complete(rqst, NULL, NULL, status);
+}
+
+static void ssh_rtl_complete_with_rsp(struct ssh_request *rqst,
+				      const struct ssh_command *cmd,
+				      const struct ssam_span *data)
+{
+	struct ssh_rtl *rtl = ssh_request_rtl(rqst);
+
+	rtl_dbg(rtl, "rtl: completing request with response (rqid: %#06x)\n",
+		ssh_request_get_rqid(rqst));
+
+	rqst->ops->complete(rqst, cmd, data, 0);
+}
+
+static bool ssh_rtl_tx_can_process(struct ssh_request *rqst)
+{
+	struct ssh_rtl *rtl = ssh_request_rtl(rqst);
+
+	if (test_bit(SSH_REQUEST_TY_FLUSH_BIT, &rqst->state))
+		return !atomic_read(&rtl->pending.count);
+
+	return atomic_read(&rtl->pending.count) < SSH_RTL_MAX_PENDING;
+}
+
+static struct ssh_request *ssh_rtl_tx_next(struct ssh_rtl *rtl)
+{
+	struct ssh_request *rqst = ERR_PTR(-ENOENT);
+	struct ssh_request *p, *n;
+
+	spin_lock(&rtl->queue.lock);
+
+	/* Find first non-locked request and remove it. */
+	list_for_each_entry_safe(p, n, &rtl->queue.head, node) {
+		if (unlikely(test_bit(SSH_REQUEST_SF_LOCKED_BIT, &p->state)))
+			continue;
+
+		if (!ssh_rtl_tx_can_process(p)) {
+			rqst = ERR_PTR(-EBUSY);
+			break;
+		}
+
+		/* Remove from queue and mark as transmitting. */
+		set_bit(SSH_REQUEST_SF_TRANSMITTING_BIT, &p->state);
+		/* Ensure state never gets zero. */
+		smp_mb__before_atomic();
+		clear_bit(SSH_REQUEST_SF_QUEUED_BIT, &p->state);
+
+		list_del(&p->node);
+
+		rqst = p;
+		break;
+	}
+
+	spin_unlock(&rtl->queue.lock);
+	return rqst;
+}
+
+static int ssh_rtl_tx_try_process_one(struct ssh_rtl *rtl)
+{
+	struct ssh_request *rqst;
+	int status;
+
+	/* Get and prepare next request for transmit. */
+	rqst = ssh_rtl_tx_next(rtl);
+	if (IS_ERR(rqst))
+		return PTR_ERR(rqst);
+
+	/* Add it to/mark it as pending. */
+	status = ssh_rtl_tx_pending_push(rqst);
+	if (status) {
+		ssh_request_put(rqst);
+		return -EAGAIN;
+	}
+
+	/* Submit packet. */
+	status = ssh_ptl_submit(&rtl->ptl, &rqst->packet);
+	if (status == -ESHUTDOWN) {
+		/*
+		 * Packet has been refused due to the packet layer shutting
+		 * down. Complete it here.
+		 */
+		set_bit(SSH_REQUEST_SF_LOCKED_BIT, &rqst->state);
+		/*
+		 * Note: A barrier is not required here, as there are only two
+		 * references in the system at this point: The one that we have,
+		 * and the other one that belongs to the pending set. Due to the
+		 * request being marked as "transmitting", our process is the
+		 * only one allowed to remove the pending node and change the
+		 * state. Normally, the task would fall to the packet callback,
+		 * but as this is a path where submission failed, this callback
+		 * will never be executed.
+		 */
+
+		ssh_rtl_pending_remove(rqst);
+		ssh_rtl_complete_with_status(rqst, -ESHUTDOWN);
+
+		ssh_request_put(rqst);
+		return -ESHUTDOWN;
+
+	} else if (status) {
+		/*
+		 * If submitting the packet failed and the packet layer isn't
+		 * shutting down, the packet has either been submitted/queued
+		 * before (-EALREADY, which cannot happen as we have
+		 * guaranteed that requests cannot be re-submitted), or the
+		 * packet was marked as locked (-EINVAL). To mark the packet
+		 * locked at this stage, the request, and thus the packets
+		 * itself, had to have been canceled. Simply drop the
+		 * reference. Cancellation itself will remove it from the set
+		 * of pending requests.
+		 */
+
+		WARN_ON(status != -EINVAL);
+
+		ssh_request_put(rqst);
+		return -EAGAIN;
+	}
+
+	ssh_request_put(rqst);
+	return 0;
+}
+
+static bool ssh_rtl_tx_schedule(struct ssh_rtl *rtl)
+{
+	if (atomic_read(&rtl->pending.count) >= SSH_RTL_MAX_PENDING)
+		return false;
+
+	if (ssh_rtl_queue_empty(rtl))
+		return false;
+
+	return schedule_work(&rtl->tx.work);
+}
+
+static void ssh_rtl_tx_work_fn(struct work_struct *work)
+{
+	struct ssh_rtl *rtl = to_ssh_rtl(work, tx.work);
+	unsigned int iterations = SSH_RTL_TX_BATCH;
+	int status;
+
+	/*
+	 * Try to be nice and not block/live-lock the workqueue: Run a maximum
+	 * of 10 tries, then re-submit if necessary. This should not be
+	 * necessary for normal execution, but guarantee it anyway.
+	 */
+	do {
+		status = ssh_rtl_tx_try_process_one(rtl);
+		if (status == -ENOENT || status == -EBUSY)
+			return;		/* No more requests to process. */
+
+		if (status == -ESHUTDOWN) {
+			/*
+			 * Packet system shutting down. No new packets can be
+			 * transmitted. Return silently, the party initiating
+			 * the shutdown should handle the rest.
+			 */
+			return;
+		}
+
+		WARN_ON(status != 0 && status != -EAGAIN);
+	} while (--iterations);
+
+	/* Out of tries, reschedule. */
+	ssh_rtl_tx_schedule(rtl);
+}
+
+/**
+ * ssh_rtl_submit() - Submit a request to the transport layer.
+ * @rtl:  The request transport layer.
+ * @rqst: The request to submit.
+ *
+ * Submits a request to the transport layer. A single request may not be
+ * submitted multiple times without reinitializing it.
+ *
+ * Return: Returns zero on success, %-EINVAL if the request type is invalid or
+ * the request has been canceled prior to submission, %-EALREADY if the
+ * request has already been submitted, or %-ESHUTDOWN in case the request
+ * transport layer has been shut down.
+ */
+int ssh_rtl_submit(struct ssh_rtl *rtl, struct ssh_request *rqst)
+{
+	/*
+	 * Ensure that requests expecting a response are sequenced. If this
+	 * invariant ever changes, see the comment in ssh_rtl_complete() on what
+	 * is required to be changed in the code.
+	 */
+	if (test_bit(SSH_REQUEST_TY_HAS_RESPONSE_BIT, &rqst->state))
+		if (!test_bit(SSH_PACKET_TY_SEQUENCED_BIT, &rqst->packet.state))
+			return -EINVAL;
+
+	spin_lock(&rtl->queue.lock);
+
+	/*
+	 * Try to set ptl and check if this request has already been submitted.
+	 *
+	 * Must be inside lock as we might run into a lost update problem
+	 * otherwise: If this were outside of the lock, cancellation in
+	 * ssh_rtl_cancel_nonpending() may run after we've set the ptl
+	 * reference but before we enter the lock. In that case, we'd detect
+	 * that the request is being added to the queue and would try to remove
+	 * it from that, but removal might fail because it hasn't actually been
+	 * added yet. By putting this cmpxchg in the critical section, we
+	 * ensure that the queuing detection only triggers when we are already
+	 * in the critical section and the remove process will wait until the
+	 * push operation has been completed (via lock) due to that. Only then,
+	 * we can safely try to remove it.
+	 */
+	if (cmpxchg(&rqst->packet.ptl, NULL, &rtl->ptl)) {
+		spin_unlock(&rtl->queue.lock);
+		return -EALREADY;
+	}
+
+	/*
+	 * Ensure that we set ptl reference before we continue modifying state.
+	 * This is required for non-pending cancellation. This barrier is paired
+	 * with the one in ssh_rtl_cancel_nonpending().
+	 *
+	 * By setting the ptl reference before we test for "locked", we can
+	 * check if the "locked" test may have already run. See comments in
+	 * ssh_rtl_cancel_nonpending() for more detail.
+	 */
+	smp_mb__after_atomic();
+
+	if (test_bit(SSH_RTL_SF_SHUTDOWN_BIT, &rtl->state)) {
+		spin_unlock(&rtl->queue.lock);
+		return -ESHUTDOWN;
+	}
+
+	if (test_bit(SSH_REQUEST_SF_LOCKED_BIT, &rqst->state)) {
+		spin_unlock(&rtl->queue.lock);
+		return -EINVAL;
+	}
+
+	set_bit(SSH_REQUEST_SF_QUEUED_BIT, &rqst->state);
+	list_add_tail(&ssh_request_get(rqst)->node, &rtl->queue.head);
+
+	spin_unlock(&rtl->queue.lock);
+
+	ssh_rtl_tx_schedule(rtl);
+	return 0;
+}
+
+static void ssh_rtl_timeout_reaper_mod(struct ssh_rtl *rtl, ktime_t now,
+				       ktime_t expires)
+{
+	unsigned long delta = msecs_to_jiffies(ktime_ms_delta(expires, now));
+	ktime_t aexp = ktime_add(expires, SSH_RTL_REQUEST_TIMEOUT_RESOLUTION);
+
+	spin_lock(&rtl->rtx_timeout.lock);
+
+	/* Re-adjust / schedule reaper only if it is above resolution delta. */
+	if (ktime_before(aexp, rtl->rtx_timeout.expires)) {
+		rtl->rtx_timeout.expires = expires;
+		mod_delayed_work(system_wq, &rtl->rtx_timeout.reaper, delta);
+	}
+
+	spin_unlock(&rtl->rtx_timeout.lock);
+}
+
+static void ssh_rtl_timeout_start(struct ssh_request *rqst)
+{
+	struct ssh_rtl *rtl = ssh_request_rtl(rqst);
+	ktime_t timestamp = ktime_get_coarse_boottime();
+	ktime_t timeout = rtl->rtx_timeout.timeout;
+
+	if (test_bit(SSH_REQUEST_SF_LOCKED_BIT, &rqst->state))
+		return;
+
+	/*
+	 * Note: The timestamp gets set only once. This happens on the packet
+	 * callback. All other access to it is read-only.
+	 */
+	WRITE_ONCE(rqst->timestamp, timestamp);
+	/*
+	 * Ensure timestamp is set before starting the reaper. Paired with
+	 * implicit barrier following check on ssh_request_get_expiration() in
+	 * ssh_rtl_timeout_reap.
+	 */
+	smp_mb__after_atomic();
+
+	ssh_rtl_timeout_reaper_mod(rtl, timestamp, timestamp + timeout);
+}
+
+static void ssh_rtl_complete(struct ssh_rtl *rtl,
+			     const struct ssh_command *command,
+			     const struct ssam_span *command_data)
+{
+	struct ssh_request *r = NULL;
+	struct ssh_request *p, *n;
+	u16 rqid = get_unaligned_le16(&command->rqid);
+
+	/*
+	 * Get request from pending based on request ID and mark it as response
+	 * received and locked.
+	 */
+	spin_lock(&rtl->pending.lock);
+	list_for_each_entry_safe(p, n, &rtl->pending.head, node) {
+		/* We generally expect requests to be processed in order. */
+		if (unlikely(ssh_request_get_rqid(p) != rqid))
+			continue;
+
+		/*
+		 * Mark as "response received" and "locked" as we're going to
+		 * complete it.
+		 */
+		set_bit(SSH_REQUEST_SF_LOCKED_BIT, &p->state);
+		set_bit(SSH_REQUEST_SF_RSPRCVD_BIT, &p->state);
+		/* Ensure state never gets zero. */
+		smp_mb__before_atomic();
+		clear_bit(SSH_REQUEST_SF_PENDING_BIT, &p->state);
+
+		atomic_dec(&rtl->pending.count);
+		list_del(&p->node);
+
+		r = p;
+		break;
+	}
+	spin_unlock(&rtl->pending.lock);
+
+	if (!r) {
+		rtl_warn(rtl, "rtl: dropping unexpected command message (rqid = %#06x)\n",
+			 rqid);
+		return;
+	}
+
+	/* If the request hasn't been completed yet, we will do this now. */
+	if (test_and_set_bit(SSH_REQUEST_SF_COMPLETED_BIT, &r->state)) {
+		ssh_request_put(r);
+		ssh_rtl_tx_schedule(rtl);
+		return;
+	}
+
+	/*
+	 * Make sure the request has been transmitted. In case of a sequenced
+	 * request, we are guaranteed that the completion callback will run on
+	 * the receiver thread directly when the ACK for the packet has been
+	 * received. Similarly, this function is guaranteed to run on the
+	 * receiver thread. Thus we are guaranteed that if the packet has been
+	 * successfully transmitted and received an ACK, the transmitted flag
+	 * has been set and is visible here.
+	 *
+	 * We are currently not handling unsequenced packets here, as those
+	 * should never expect a response as ensured in ssh_rtl_submit. If this
+	 * ever changes, one would have to test for
+	 *
+	 *	(r->state & (transmitting | transmitted))
+	 *
+	 * on unsequenced packets to determine if they could have been
+	 * transmitted. There are no synchronization guarantees as in the
+	 * sequenced case, since, in this case, the callback function will not
+	 * run on the same thread. Thus an exact determination is impossible.
+	 */
+	if (!test_bit(SSH_REQUEST_SF_TRANSMITTED_BIT, &r->state)) {
+		rtl_err(rtl, "rtl: received response before ACK for request (rqid = %#06x)\n",
+			rqid);
+
+		/*
+		 * NB: Timeout has already been canceled, request already been
+		 * removed from pending and marked as locked and completed. As
+		 * we receive a "false" response, the packet might still be
+		 * queued though.
+		 */
+		ssh_rtl_queue_remove(r);
+
+		ssh_rtl_complete_with_status(r, -EREMOTEIO);
+		ssh_request_put(r);
+
+		ssh_rtl_tx_schedule(rtl);
+		return;
+	}
+
+	/*
+	 * NB: Timeout has already been canceled, request already been
+	 * removed from pending and marked as locked and completed. The request
+	 * can also not be queued any more, as it has been marked as
+	 * transmitting and later transmitted. Thus no need to remove it from
+	 * anywhere.
+	 */
+
+	ssh_rtl_complete_with_rsp(r, command, command_data);
+	ssh_request_put(r);
+
+	ssh_rtl_tx_schedule(rtl);
+}
+
+static bool ssh_rtl_cancel_nonpending(struct ssh_request *r)
+{
+	struct ssh_rtl *rtl;
+	unsigned long flags, fixed;
+	bool remove;
+
+	/*
+	 * Handle unsubmitted request: Try to mark the packet as locked,
+	 * expecting the state to be zero (i.e. unsubmitted). Note that, if
+	 * setting the state worked, we might still be adding the packet to the
+	 * queue in a currently executing submit call. In that case, however,
+	 * ptl reference must have been set previously, as locked is checked
+	 * after setting ptl. Furthermore, when the ptl reference is set, the
+	 * submission process is guaranteed to have entered the critical
+	 * section. Thus only if we successfully locked this request and ptl is
+	 * NULL, we have successfully removed the request, i.e. we are
+	 * guaranteed that, due to the "locked" check in ssh_rtl_submit(), the
+	 * packet will never be added. Otherwise, we need to try and grab it
+	 * from the queue, where we are now guaranteed that the packet is or has
+	 * been due to the critical section.
+	 *
+	 * Note that if the cmpxchg() fails, we are guaranteed that ptl has
+	 * been set and is non-NULL, as states can only be nonzero after this
+	 * has been set. Also note that we need to fetch the static (type)
+	 * flags to ensure that they don't cause the cmpxchg() to fail.
+	 */
+	fixed = READ_ONCE(r->state) & SSH_REQUEST_FLAGS_TY_MASK;
+	flags = cmpxchg(&r->state, fixed, SSH_REQUEST_SF_LOCKED_BIT);
+
+	/*
+	 * Force correct ordering with regards to state and ptl reference access
+	 * to safe-guard cancellation to concurrent submission against a
+	 * lost-update problem. First try to exchange state, then also check
+	 * ptl if that worked. This barrier is paired with the
+	 * one in ssh_rtl_submit().
+	 */
+	smp_mb__after_atomic();
+
+	if (flags == fixed && !READ_ONCE(r->packet.ptl)) {
+		if (test_and_set_bit(SSH_REQUEST_SF_COMPLETED_BIT, &r->state))
+			return true;
+
+		ssh_rtl_complete_with_status(r, -ECANCELED);
+		return true;
+	}
+
+	rtl = ssh_request_rtl(r);
+	spin_lock(&rtl->queue.lock);
+
+	/*
+	 * Note: 1) Requests cannot be re-submitted. 2) If a request is
+	 * queued, it cannot be "transmitting"/"pending" yet. Thus, if we
+	 * successfully remove the request here, we have removed all its
+	 * occurrences in the system.
+	 */
+
+	remove = test_and_clear_bit(SSH_REQUEST_SF_QUEUED_BIT, &r->state);
+	if (!remove) {
+		spin_unlock(&rtl->queue.lock);
+		return false;
+	}
+
+	set_bit(SSH_REQUEST_SF_LOCKED_BIT, &r->state);
+	list_del(&r->node);
+
+	spin_unlock(&rtl->queue.lock);
+
+	ssh_request_put(r);	/* Drop reference obtained from queue. */
+
+	if (test_and_set_bit(SSH_REQUEST_SF_COMPLETED_BIT, &r->state))
+		return true;
+
+	ssh_rtl_complete_with_status(r, -ECANCELED);
+	return true;
+}
+
+static bool ssh_rtl_cancel_pending(struct ssh_request *r)
+{
+	/* If the packet is already locked, it's going to be removed shortly. */
+	if (test_and_set_bit(SSH_REQUEST_SF_LOCKED_BIT, &r->state))
+		return true;
+
+	/*
+	 * Now that we have locked the packet, we have guaranteed that it can't
+	 * be added to the system any more. If ptl is NULL, the locked
+	 * check in ssh_rtl_submit() has not been run and any submission,
+	 * currently in progress or called later, won't add the packet. Thus we
+	 * can directly complete it.
+	 *
+	 * The implicit memory barrier of test_and_set_bit() should be enough
+	 * to ensure that the correct order (first lock, then check ptl) is
+	 * ensured. This is paired with the barrier in ssh_rtl_submit().
+	 */
+	if (!READ_ONCE(r->packet.ptl)) {
+		if (test_and_set_bit(SSH_REQUEST_SF_COMPLETED_BIT, &r->state))
+			return true;
+
+		ssh_rtl_complete_with_status(r, -ECANCELED);
+		return true;
+	}
+
+	/*
+	 * Try to cancel the packet. If the packet has not been completed yet,
+	 * this will subsequently (and synchronously) call the completion
+	 * callback of the packet, which will complete the request.
+	 */
+	ssh_ptl_cancel(&r->packet);
+
+	/*
+	 * If the packet has been completed with success, i.e. has not been
+	 * canceled by the above call, the request may not have been completed
+	 * yet (may be waiting for a response). Check if we need to do this
+	 * here.
+	 */
+	if (test_and_set_bit(SSH_REQUEST_SF_COMPLETED_BIT, &r->state))
+		return true;
+
+	ssh_rtl_queue_remove(r);
+	ssh_rtl_pending_remove(r);
+	ssh_rtl_complete_with_status(r, -ECANCELED);
+
+	return true;
+}
+
+/**
+ * ssh_rtl_cancel() - Cancel request.
+ * @rqst:    The request to cancel.
+ * @pending: Whether to also cancel pending requests.
+ *
+ * Cancels the given request. If @pending is %false, this will not cancel
+ * pending requests, i.e. requests that have already been submitted to the
+ * packet layer but not been completed yet. If @pending is %true, this will
+ * cancel the given request regardless of the state it is in.
+ *
+ * If the request has been canceled by calling this function, both completion
+ * and release callbacks of the request will be executed in a reasonable
+ * time-frame. This may happen during execution of this function, however,
+ * there is no guarantee for this. For example, a request currently
+ * transmitting will be canceled/completed only after transmission has
+ * completed, and the respective callbacks will be executed on the transmitter
+ * thread, which may happen during, but also some time after execution of the
+ * cancel function.
+ *
+ * Return: Returns %true if the given request has been canceled or completed,
+ * either by this function or prior to calling this function, %false
+ * otherwise. If @pending is %true, this function will always return %true.
+ */
+bool ssh_rtl_cancel(struct ssh_request *rqst, bool pending)
+{
+	struct ssh_rtl *rtl;
+	bool canceled;
+
+	if (test_and_set_bit(SSH_REQUEST_SF_CANCELED_BIT, &rqst->state))
+		return true;
+
+	if (pending)
+		canceled = ssh_rtl_cancel_pending(rqst);
+	else
+		canceled = ssh_rtl_cancel_nonpending(rqst);
+
+	/* Note: rtl may be NULL if request has not been submitted yet. */
+	rtl = ssh_request_rtl(rqst);
+	if (canceled && rtl)
+		ssh_rtl_tx_schedule(rtl);
+
+	return canceled;
+}
+
+static void ssh_rtl_packet_callback(struct ssh_packet *p, int status)
+{
+	struct ssh_request *r = to_ssh_request(p);
+
+	if (unlikely(status)) {
+		set_bit(SSH_REQUEST_SF_LOCKED_BIT, &r->state);
+
+		if (test_and_set_bit(SSH_REQUEST_SF_COMPLETED_BIT, &r->state))
+			return;
+
+		/*
+		 * The packet may get canceled even though it has not been
+		 * submitted yet. The request may still be queued. Check the
+		 * queue and remove it if necessary. As the timeout would have
+		 * been started in this function on success, there's no need
+		 * to cancel it here.
+		 */
+		ssh_rtl_queue_remove(r);
+		ssh_rtl_pending_remove(r);
+		ssh_rtl_complete_with_status(r, status);
+
+		ssh_rtl_tx_schedule(ssh_request_rtl(r));
+		return;
+	}
+
+	/* Update state: Mark as transmitted and clear transmitting. */
+	set_bit(SSH_REQUEST_SF_TRANSMITTED_BIT, &r->state);
+	/* Ensure state never gets zero. */
+	smp_mb__before_atomic();
+	clear_bit(SSH_REQUEST_SF_TRANSMITTING_BIT, &r->state);
+
+	/* If we expect a response, we just need to start the timeout. */
+	if (test_bit(SSH_REQUEST_TY_HAS_RESPONSE_BIT, &r->state)) {
+		/*
+		 * Note: This is the only place where the timestamp gets set,
+		 * all other access to it is read-only.
+		 */
+		ssh_rtl_timeout_start(r);
+		return;
+	}
+
+	/*
+	 * If we don't expect a response, lock, remove, and complete the
+	 * request. Note that, at this point, the request is guaranteed to have
+	 * left the queue and no timeout has been started. Thus we only need to
+	 * remove it from pending. If the request has already been completed (it
+	 * may have been canceled) return.
+	 */
+
+	set_bit(SSH_REQUEST_SF_LOCKED_BIT, &r->state);
+	if (test_and_set_bit(SSH_REQUEST_SF_COMPLETED_BIT, &r->state))
+		return;
+
+	ssh_rtl_pending_remove(r);
+	ssh_rtl_complete_with_status(r, 0);
+
+	ssh_rtl_tx_schedule(ssh_request_rtl(r));
+}
+
+static ktime_t ssh_request_get_expiration(struct ssh_request *r, ktime_t timeout)
+{
+	ktime_t timestamp = READ_ONCE(r->timestamp);
+
+	if (timestamp != KTIME_MAX)
+		return ktime_add(timestamp, timeout);
+	else
+		return KTIME_MAX;
+}
+
+static void ssh_rtl_timeout_reap(struct work_struct *work)
+{
+	struct ssh_rtl *rtl = to_ssh_rtl(work, rtx_timeout.reaper.work);
+	struct ssh_request *r, *n;
+	LIST_HEAD(claimed);
+	ktime_t now = ktime_get_coarse_boottime();
+	ktime_t timeout = rtl->rtx_timeout.timeout;
+	ktime_t next = KTIME_MAX;
+
+	/*
+	 * Mark reaper as "not pending". This is done before checking any
+	 * requests to avoid lost-update type problems.
+	 */
+	spin_lock(&rtl->rtx_timeout.lock);
+	rtl->rtx_timeout.expires = KTIME_MAX;
+	spin_unlock(&rtl->rtx_timeout.lock);
+
+	spin_lock(&rtl->pending.lock);
+	list_for_each_entry_safe(r, n, &rtl->pending.head, node) {
+		ktime_t expires = ssh_request_get_expiration(r, timeout);
+
+		/*
+		 * Check if the timeout hasn't expired yet. Find out next
+		 * expiration date to be handled after this run.
+		 */
+		if (ktime_after(expires, now)) {
+			next = ktime_before(expires, next) ? expires : next;
+			continue;
+		}
+
+		/* Avoid further transitions if locked. */
+		if (test_and_set_bit(SSH_REQUEST_SF_LOCKED_BIT, &r->state))
+			continue;
+
+		/*
+		 * We have now marked the packet as locked. Thus it cannot be
+		 * added to the pending or queued lists again after we've
+		 * removed it here. We can therefore re-use the node of this
+		 * packet temporarily.
+		 */
+
+		clear_bit(SSH_REQUEST_SF_PENDING_BIT, &r->state);
+
+		atomic_dec(&rtl->pending.count);
+		list_del(&r->node);
+
+		list_add_tail(&r->node, &claimed);
+	}
+	spin_unlock(&rtl->pending.lock);
+
+	/* Cancel and complete the request. */
+	list_for_each_entry_safe(r, n, &claimed, node) {
+		/*
+		 * At this point we've removed the packet from pending. This
+		 * means that we've obtained the last (only) reference of the
+		 * system to it. Thus we can just complete it.
+		 */
+		if (!test_and_set_bit(SSH_REQUEST_SF_COMPLETED_BIT, &r->state))
+			ssh_rtl_complete_with_status(r, -ETIMEDOUT);
+
+		/*
+		 * Drop the reference we've obtained by removing it from the
+		 * pending set.
+		 */
+		list_del(&r->node);
+		ssh_request_put(r);
+	}
+
+	/* Ensure that the reaper doesn't run again immediately. */
+	next = max(next, ktime_add(now, SSH_RTL_REQUEST_TIMEOUT_RESOLUTION));
+	if (next != KTIME_MAX)
+		ssh_rtl_timeout_reaper_mod(rtl, now, next);
+
+	ssh_rtl_tx_schedule(rtl);
+}
+
+static void ssh_rtl_rx_event(struct ssh_rtl *rtl, const struct ssh_command *cmd,
+			     const struct ssam_span *data)
+{
+	rtl_dbg(rtl, "rtl: handling event (rqid: %#06x)\n",
+		get_unaligned_le16(&cmd->rqid));
+
+	rtl->ops.handle_event(rtl, cmd, data);
+}
+
+static void ssh_rtl_rx_command(struct ssh_ptl *p, const struct ssam_span *data)
+{
+	struct ssh_rtl *rtl = to_ssh_rtl(p, ptl);
+	struct device *dev = &p->serdev->dev;
+	struct ssh_command *command;
+	struct ssam_span command_data;
+
+	if (sshp_parse_command(dev, data, &command, &command_data))
+		return;
+
+	if (ssh_rqid_is_event(get_unaligned_le16(&command->rqid)))
+		ssh_rtl_rx_event(rtl, command, &command_data);
+	else
+		ssh_rtl_complete(rtl, command, &command_data);
+}
+
+static void ssh_rtl_rx_data(struct ssh_ptl *p, const struct ssam_span *data)
+{
+	if (!data->len) {
+		ptl_err(p, "rtl: rx: no data frame payload\n");
+		return;
+	}
+
+	switch (data->ptr[0]) {
+	case SSH_PLD_TYPE_CMD:
+		ssh_rtl_rx_command(p, data);
+		break;
+
+	default:
+		ptl_err(p, "rtl: rx: unknown frame payload type (type: %#04x)\n",
+			data->ptr[0]);
+		break;
+	}
+}
+
+static void ssh_rtl_packet_release(struct ssh_packet *p)
+{
+	struct ssh_request *rqst;
+
+	rqst = to_ssh_request(p);
+	rqst->ops->release(rqst);
+}
+
+static const struct ssh_packet_ops ssh_rtl_packet_ops = {
+	.complete = ssh_rtl_packet_callback,
+	.release = ssh_rtl_packet_release,
+};
+
+/**
+ * ssh_request_init() - Initialize SSH request.
+ * @rqst:  The request to initialize.
+ * @flags: Request flags, determining the type of the request.
+ * @ops:   Request operations.
+ *
+ * Initializes the given SSH request and underlying packet. Sets the message
+ * buffer pointer to %NULL and the message buffer length to zero. This buffer
+ * has to be set separately via ssh_request_set_data() before submission and
+ * must contain a valid SSH request message.
+ *
+ * Return: Returns zero on success or %-EINVAL if the given flags are invalid.
+ */
+int ssh_request_init(struct ssh_request *rqst, enum ssam_request_flags flags,
+		     const struct ssh_request_ops *ops)
+{
+	unsigned long type = BIT(SSH_PACKET_TY_BLOCKING_BIT);
+
+	/* Unsequenced requests cannot have a response. */
+	if (flags & SSAM_REQUEST_UNSEQUENCED && flags & SSAM_REQUEST_HAS_RESPONSE)
+		return -EINVAL;
+
+	if (!(flags & SSAM_REQUEST_UNSEQUENCED))
+		type |= BIT(SSH_PACKET_TY_SEQUENCED_BIT);
+
+	ssh_packet_init(&rqst->packet, type, SSH_PACKET_PRIORITY(DATA, 0),
+			&ssh_rtl_packet_ops);
+
+	INIT_LIST_HEAD(&rqst->node);
+
+	rqst->state = 0;
+	if (flags & SSAM_REQUEST_HAS_RESPONSE)
+		rqst->state |= BIT(SSH_REQUEST_TY_HAS_RESPONSE_BIT);
+
+	rqst->timestamp = KTIME_MAX;
+	rqst->ops = ops;
+
+	return 0;
+}
+
+/**
+ * ssh_rtl_init() - Initialize request transport layer.
+ * @rtl:    The request transport layer to initialize.
+ * @serdev: The underlying serial device, i.e. the lower-level transport.
+ * @ops:    Request transport layer operations.
+ *
+ * Initializes the given request transport layer and associated packet
+ * transport layer. Transmitter and receiver threads must be started
+ * separately via ssh_rtl_tx_start() and ssh_rtl_rx_start(), after the
+ * request-layer has been initialized and the lower-level serial device layer
+ * has been set up.
+ *
+ * Return: Returns zero on success and a nonzero error code on failure.
+ */
+int ssh_rtl_init(struct ssh_rtl *rtl, struct serdev_device *serdev,
+		 const struct ssh_rtl_ops *ops)
+{
+	struct ssh_ptl_ops ptl_ops;
+	int status;
+
+	ptl_ops.data_received = ssh_rtl_rx_data;
+
+	status = ssh_ptl_init(&rtl->ptl, serdev, &ptl_ops);
+	if (status)
+		return status;
+
+	spin_lock_init(&rtl->queue.lock);
+	INIT_LIST_HEAD(&rtl->queue.head);
+
+	spin_lock_init(&rtl->pending.lock);
+	INIT_LIST_HEAD(&rtl->pending.head);
+	atomic_set_release(&rtl->pending.count, 0);
+
+	INIT_WORK(&rtl->tx.work, ssh_rtl_tx_work_fn);
+
+	spin_lock_init(&rtl->rtx_timeout.lock);
+	rtl->rtx_timeout.timeout = SSH_RTL_REQUEST_TIMEOUT;
+	rtl->rtx_timeout.expires = KTIME_MAX;
+	INIT_DELAYED_WORK(&rtl->rtx_timeout.reaper, ssh_rtl_timeout_reap);
+
+	rtl->ops = *ops;
+
+	return 0;
+}
+
+/**
+ * ssh_rtl_destroy() - Deinitialize request transport layer.
+ * @rtl: The request transport layer to deinitialize.
+ *
+ * Deinitializes the given request transport layer and frees resources
+ * associated with it. If receiver and/or transmitter threads have been
+ * started, the layer must first be shut down via ssh_rtl_shutdown() before
+ * this function can be called.
+ */
+void ssh_rtl_destroy(struct ssh_rtl *rtl)
+{
+	ssh_ptl_destroy(&rtl->ptl);
+}
+
+/**
+ * ssh_rtl_tx_start() - Start request transmitter and receiver.
+ * @rtl: The request transport layer.
+ *
+ * Return: Returns zero on success, a negative error code on failure.
+ */
+int ssh_rtl_start(struct ssh_rtl *rtl)
+{
+	int status;
+
+	status = ssh_ptl_tx_start(&rtl->ptl);
+	if (status)
+		return status;
+
+	ssh_rtl_tx_schedule(rtl);
+
+	status = ssh_ptl_rx_start(&rtl->ptl);
+	if (status) {
+		ssh_rtl_flush(rtl, msecs_to_jiffies(5000));
+		ssh_ptl_tx_stop(&rtl->ptl);
+		return status;
+	}
+
+	return 0;
+}
+
+struct ssh_flush_request {
+	struct ssh_request base;
+	struct completion completion;
+	int status;
+};
+
+static void ssh_rtl_flush_request_complete(struct ssh_request *r,
+					   const struct ssh_command *cmd,
+					   const struct ssam_span *data,
+					   int status)
+{
+	struct ssh_flush_request *rqst;
+
+	rqst = container_of(r, struct ssh_flush_request, base);
+	rqst->status = status;
+}
+
+static void ssh_rtl_flush_request_release(struct ssh_request *r)
+{
+	struct ssh_flush_request *rqst;
+
+	rqst = container_of(r, struct ssh_flush_request, base);
+	complete_all(&rqst->completion);
+}
+
+static const struct ssh_request_ops ssh_rtl_flush_request_ops = {
+	.complete = ssh_rtl_flush_request_complete,
+	.release = ssh_rtl_flush_request_release,
+};
+
+/**
+ * ssh_rtl_flush() - Flush the request transport layer.
+ * @rtl:     request transport layer
+ * @timeout: timeout for the flush operation in jiffies
+ *
+ * Queue a special flush request and wait for its completion. This request
+ * will be completed after all other currently queued and pending requests
+ * have been completed. Instead of a normal data packet, this request submits
+ * a special flush packet, meaning that upon completion, also the underlying
+ * packet transport layer has been flushed.
+ *
+ * Flushing the request layer guarantees that all previously submitted
+ * requests have been fully completed before this call returns. Additionally,
+ * flushing blocks execution of all later submitted requests until the flush
+ * has been completed.
+ *
+ * If the caller ensures that no new requests are submitted after a call to
+ * this function, the request transport layer is guaranteed to have no
+ * remaining requests when this call returns. The same guarantee does not hold
+ * for the packet layer, on which control packets may still be queued after
+ * this call.
+ *
+ * Return: Returns zero on success, %-ETIMEDOUT if the flush timed out and has
+ * been canceled as a result of the timeout, or %-ESHUTDOWN if the packet
+ * and/or request transport layer has been shut down before this call. May
+ * also return %-EINTR if the underlying packet transmission has been
+ * interrupted.
+ */
+int ssh_rtl_flush(struct ssh_rtl *rtl, unsigned long timeout)
+{
+	const unsigned int init_flags = SSAM_REQUEST_UNSEQUENCED;
+	struct ssh_flush_request rqst;
+	int status;
+
+	ssh_request_init(&rqst.base, init_flags, &ssh_rtl_flush_request_ops);
+	rqst.base.packet.state |= BIT(SSH_PACKET_TY_FLUSH_BIT);
+	rqst.base.packet.priority = SSH_PACKET_PRIORITY(FLUSH, 0);
+	rqst.base.state |= BIT(SSH_REQUEST_TY_FLUSH_BIT);
+
+	init_completion(&rqst.completion);
+
+	status = ssh_rtl_submit(rtl, &rqst.base);
+	if (status)
+		return status;
+
+	ssh_request_put(&rqst.base);
+
+	if (!wait_for_completion_timeout(&rqst.completion, timeout)) {
+		ssh_rtl_cancel(&rqst.base, true);
+		wait_for_completion(&rqst.completion);
+	}
+
+	WARN_ON(rqst.status != 0 && rqst.status != -ECANCELED &&
+		rqst.status != -ESHUTDOWN && rqst.status != -EINTR);
+
+	return rqst.status == -ECANCELED ? -ETIMEDOUT : rqst.status;
+}
+
+/**
+ * ssh_rtl_shutdown() - Shut down request transport layer.
+ * @rtl: The request transport layer.
+ *
+ * Shuts down the request transport layer, removing and canceling all queued
+ * and pending requests. Requests canceled by this operation will be completed
+ * with %-ESHUTDOWN as status. Receiver and transmitter threads will be
+ * stopped, the lower-level packet layer will be shutdown.
+ *
+ * As a result of this function, the transport layer will be marked as shut
+ * down. Submission of requests after the transport layer has been shut down
+ * will fail with %-ESHUTDOWN.
+ */
+void ssh_rtl_shutdown(struct ssh_rtl *rtl)
+{
+	struct ssh_request *r, *n;
+	LIST_HEAD(claimed);
+	int pending;
+
+	set_bit(SSH_RTL_SF_SHUTDOWN_BIT, &rtl->state);
+	/*
+	 * Ensure that the layer gets marked as shut-down before actually
+	 * stopping it. In combination with the check in ssh_rtl_submit(),
+	 * this guarantees that no new requests can be added and all already
+	 * queued requests are properly canceled.
+	 */
+	smp_mb__after_atomic();
+
+	/* Remove requests from queue. */
+	spin_lock(&rtl->queue.lock);
+	list_for_each_entry_safe(r, n, &rtl->queue.head, node) {
+		set_bit(SSH_REQUEST_SF_LOCKED_BIT, &r->state);
+		/* Ensure state never gets zero. */
+		smp_mb__before_atomic();
+		clear_bit(SSH_REQUEST_SF_QUEUED_BIT, &r->state);
+
+		list_del(&r->node);
+		list_add_tail(&r->node, &claimed);
+	}
+	spin_unlock(&rtl->queue.lock);
+
+	/*
+	 * We have now guaranteed that the queue is empty and no more new
+	 * requests can be submitted (i.e. it will stay empty). This means that
+	 * calling ssh_rtl_tx_schedule() will not schedule tx.work any more. So
+	 * we can simply call cancel_work_sync() on tx.work here and when that
+	 * returns, we've locked it down. This also means that after this call,
+	 * we don't submit any more packets to the underlying packet layer, so
+	 * we can also shut that down.
+	 */
+
+	cancel_work_sync(&rtl->tx.work);
+	ssh_ptl_shutdown(&rtl->ptl);
+	cancel_delayed_work_sync(&rtl->rtx_timeout.reaper);
+
+	/*
+	 * Shutting down the packet layer should also have canceled all
+	 * requests. Thus the pending set should be empty. Attempt to handle
+	 * this gracefully anyways, even though this should be dead code.
+	 */
+
+	pending = atomic_read(&rtl->pending.count);
+	if (WARN_ON(pending)) {
+		spin_lock(&rtl->pending.lock);
+		list_for_each_entry_safe(r, n, &rtl->pending.head, node) {
+			set_bit(SSH_REQUEST_SF_LOCKED_BIT, &r->state);
+			/* Ensure state never gets zero. */
+			smp_mb__before_atomic();
+			clear_bit(SSH_REQUEST_SF_PENDING_BIT, &r->state);
+
+			list_del(&r->node);
+			list_add_tail(&r->node, &claimed);
+		}
+		spin_unlock(&rtl->pending.lock);
+	}
+
+	/* Finally, cancel and complete the requests we claimed before. */
+	list_for_each_entry_safe(r, n, &claimed, node) {
+		/*
+		 * We need test_and_set() because we still might compete with
+		 * cancellation.
+		 */
+		if (!test_and_set_bit(SSH_REQUEST_SF_COMPLETED_BIT, &r->state))
+			ssh_rtl_complete_with_status(r, -ESHUTDOWN);
+
+		/*
+		 * Drop the reference we've obtained by removing it from the
+		 * lists.
+		 */
+		list_del(&r->node);
+		ssh_request_put(r);
+	}
+}
diff --git a/drivers/platform/surface/aggregator/ssh_request_layer.h b/drivers/platform/surface/aggregator/ssh_request_layer.h
new file mode 100644
index 000000000000..cb35815858d1
--- /dev/null
+++ b/drivers/platform/surface/aggregator/ssh_request_layer.h
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SSH request transport layer.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _SURFACE_AGGREGATOR_SSH_REQUEST_LAYER_H
+#define _SURFACE_AGGREGATOR_SSH_REQUEST_LAYER_H
+
+#include <linux/atomic.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+
+#include <linux/surface_aggregator/serial_hub.h>
+#include <linux/surface_aggregator/controller.h>
+
+#include "ssh_packet_layer.h"
+
+/**
+ * enum ssh_rtl_state_flags - State-flags for &struct ssh_rtl.
+ *
+ * @SSH_RTL_SF_SHUTDOWN_BIT:
+ *	Indicates that the request transport layer has been shut down or is
+ *	being shut down and should not accept any new requests.
+ */
+enum ssh_rtl_state_flags {
+	SSH_RTL_SF_SHUTDOWN_BIT,
+};
+
+/**
+ * struct ssh_rtl_ops - Callback operations for request transport layer.
+ * @handle_event: Function called when a SSH event has been received. The
+ *                specified function takes the request layer, received command
+ *                struct, and corresponding payload as arguments. If the event
+ *                has no payload, the payload span is empty (not %NULL).
+ */
+struct ssh_rtl_ops {
+	void (*handle_event)(struct ssh_rtl *rtl, const struct ssh_command *cmd,
+			     const struct ssam_span *data);
+};
+
+/**
+ * struct ssh_rtl - SSH request transport layer.
+ * @ptl:           Underlying packet transport layer.
+ * @state:         State(-flags) of the transport layer.
+ * @queue:         Request submission queue.
+ * @queue.lock:    Lock for modifying the request submission queue.
+ * @queue.head:    List-head of the request submission queue.
+ * @pending:       Set/list of pending requests.
+ * @pending.lock:  Lock for modifying the request set.
+ * @pending.head:  List-head of the pending set/list.
+ * @pending.count: Number of currently pending requests.
+ * @tx:            Transmitter subsystem.
+ * @tx.work:       Transmitter work item.
+ * @rtx_timeout:   Retransmission timeout subsystem.
+ * @rtx_timeout.lock:    Lock for modifying the retransmission timeout reaper.
+ * @rtx_timeout.timeout: Timeout interval for retransmission.
+ * @rtx_timeout.expires: Time specifying when the reaper work is next scheduled.
+ * @rtx_timeout.reaper:  Work performing timeout checks and subsequent actions.
+ * @ops:           Request layer operations.
+ */
+struct ssh_rtl {
+	struct ssh_ptl ptl;
+	unsigned long state;
+
+	struct {
+		spinlock_t lock;
+		struct list_head head;
+	} queue;
+
+	struct {
+		spinlock_t lock;
+		struct list_head head;
+		atomic_t count;
+	} pending;
+
+	struct {
+		struct work_struct work;
+	} tx;
+
+	struct {
+		spinlock_t lock;
+		ktime_t timeout;
+		ktime_t expires;
+		struct delayed_work reaper;
+	} rtx_timeout;
+
+	struct ssh_rtl_ops ops;
+};
+
+#define rtl_dbg(r, fmt, ...)  ptl_dbg(&(r)->ptl, fmt, ##__VA_ARGS__)
+#define rtl_info(p, fmt, ...) ptl_info(&(p)->ptl, fmt, ##__VA_ARGS__)
+#define rtl_warn(r, fmt, ...) ptl_warn(&(r)->ptl, fmt, ##__VA_ARGS__)
+#define rtl_err(r, fmt, ...)  ptl_err(&(r)->ptl, fmt, ##__VA_ARGS__)
+#define rtl_dbg_cond(r, fmt, ...) __ssam_prcond(rtl_dbg, r, fmt, ##__VA_ARGS__)
+
+#define to_ssh_rtl(ptr, member) \
+	container_of(ptr, struct ssh_rtl, member)
+
+/**
+ * ssh_rtl_get_device() - Get device associated with request transport layer.
+ * @rtl: The request transport layer.
+ *
+ * Return: Returns the device on which the given request transport layer
+ * builds upon.
+ */
+static inline struct device *ssh_rtl_get_device(struct ssh_rtl *rtl)
+{
+	return ssh_ptl_get_device(&rtl->ptl);
+}
+
+/**
+ * ssh_request_rtl() - Get request transport layer associated with request.
+ * @rqst: The request to get the request transport layer reference for.
+ *
+ * Return: Returns the &struct ssh_rtl associated with the given SSH request.
+ */
+static inline struct ssh_rtl *ssh_request_rtl(struct ssh_request *rqst)
+{
+	struct ssh_ptl *ptl;
+
+	ptl = READ_ONCE(rqst->packet.ptl);
+	return likely(ptl) ? to_ssh_rtl(ptl, ptl) : NULL;
+}
+
+int ssh_rtl_submit(struct ssh_rtl *rtl, struct ssh_request *rqst);
+bool ssh_rtl_cancel(struct ssh_request *rqst, bool pending);
+
+int ssh_rtl_init(struct ssh_rtl *rtl, struct serdev_device *serdev,
+		 const struct ssh_rtl_ops *ops);
+
+int ssh_rtl_start(struct ssh_rtl *rtl);
+int ssh_rtl_flush(struct ssh_rtl *rtl, unsigned long timeout);
+void ssh_rtl_shutdown(struct ssh_rtl *rtl);
+void ssh_rtl_destroy(struct ssh_rtl *rtl);
+
+int ssh_request_init(struct ssh_request *rqst, enum ssam_request_flags flags,
+		     const struct ssh_request_ops *ops);
+
+#endif /* _SURFACE_AGGREGATOR_SSH_REQUEST_LAYER_H */
diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
new file mode 100644
index 000000000000..f4b1ba887384
--- /dev/null
+++ b/include/linux/surface_aggregator/controller.h
@@ -0,0 +1,824 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Surface System Aggregator Module (SSAM) controller interface.
+ *
+ * Main communication interface for the SSAM EC. Provides a controller
+ * managing access and communication to and from the SSAM EC, as well as main
+ * communication structures and definitions.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _LINUX_SURFACE_AGGREGATOR_CONTROLLER_H
+#define _LINUX_SURFACE_AGGREGATOR_CONTROLLER_H
+
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/types.h>
+
+#include <linux/surface_aggregator/serial_hub.h>
+
+
+/* -- Main data types and definitions --------------------------------------- */
+
+/**
+ * enum ssam_event_flags - Flags for enabling/disabling SAM events
+ * @SSAM_EVENT_SEQUENCED: The event will be sent via a sequenced data frame.
+ */
+enum ssam_event_flags {
+	SSAM_EVENT_SEQUENCED = BIT(0),
+};
+
+/**
+ * struct ssam_event - SAM event sent from the EC to the host.
+ * @target_category: Target category of the event source. See &enum ssam_ssh_tc.
+ * @target_id:       Target ID of the event source.
+ * @command_id:      Command ID of the event.
+ * @instance_id:     Instance ID of the event source.
+ * @length:          Length of the event payload in bytes.
+ * @data:            Event payload data.
+ */
+struct ssam_event {
+	u8 target_category;
+	u8 target_id;
+	u8 command_id;
+	u8 instance_id;
+	u16 length;
+	u8 data[];
+};
+
+/**
+ * enum ssam_request_flags - Flags for SAM requests.
+ *
+ * @SSAM_REQUEST_HAS_RESPONSE:
+ *	Specifies that the request expects a response. If not set, the request
+ *	will be directly completed after its underlying packet has been
+ *	transmitted. If set, the request transport system waits for a response
+ *	of the request.
+ *
+ * @SSAM_REQUEST_UNSEQUENCED:
+ *	Specifies that the request should be transmitted via an unsequenced
+ *	packet. If set, the request must not have a response, meaning that this
+ *	flag and the %SSAM_REQUEST_HAS_RESPONSE flag are mutually exclusive.
+ */
+enum ssam_request_flags {
+	SSAM_REQUEST_HAS_RESPONSE = BIT(0),
+	SSAM_REQUEST_UNSEQUENCED  = BIT(1),
+};
+
+/**
+ * struct ssam_request - SAM request description.
+ * @target_category: Category of the request's target. See &enum ssam_ssh_tc.
+ * @target_id:       ID of the request's target.
+ * @command_id:      Command ID of the request.
+ * @instance_id:     Instance ID of the request's target.
+ * @flags:           Flags for the request. See &enum ssam_request_flags.
+ * @length:          Length of the request payload in bytes.
+ * @payload:         Request payload data.
+ *
+ * This struct fully describes a SAM request with payload. It is intended to
+ * help set up the actual transport struct, e.g. &struct ssam_request_sync,
+ * and specifically its raw message data via ssam_request_write_data().
+ */
+struct ssam_request {
+	u8 target_category;
+	u8 target_id;
+	u8 command_id;
+	u8 instance_id;
+	u16 flags;
+	u16 length;
+	const u8 *payload;
+};
+
+/**
+ * struct ssam_response - Response buffer for SAM request.
+ * @capacity: Capacity of the buffer, in bytes.
+ * @length:   Length of the actual data stored in the memory pointed to by
+ *            @pointer, in bytes. Set by the transport system.
+ * @pointer:  Pointer to the buffer's memory, storing the response payload data.
+ */
+struct ssam_response {
+	size_t capacity;
+	size_t length;
+	u8 *pointer;
+};
+
+struct ssam_controller;
+
+struct ssam_controller *ssam_get_controller(void);
+struct ssam_controller *ssam_client_bind(struct device *client);
+int ssam_client_link(struct ssam_controller *ctrl, struct device *client);
+
+struct device *ssam_controller_device(struct ssam_controller *c);
+
+struct ssam_controller *ssam_controller_get(struct ssam_controller *c);
+void ssam_controller_put(struct ssam_controller *c);
+
+void ssam_controller_statelock(struct ssam_controller *c);
+void ssam_controller_stateunlock(struct ssam_controller *c);
+
+ssize_t ssam_request_write_data(struct ssam_span *buf,
+				struct ssam_controller *ctrl,
+				const struct ssam_request *spec);
+
+
+/* -- Synchronous request interface. ---------------------------------------- */
+
+/**
+ * struct ssam_request_sync - Synchronous SAM request struct.
+ * @base:   Underlying SSH request.
+ * @comp:   Completion used to signal full completion of the request. After the
+ *          request has been submitted, this struct may only be modified or
+ *          deallocated after the completion has been signaled.
+ *          request has been submitted,
+ * @resp:   Buffer to store the response.
+ * @status: Status of the request, set after the base request has been
+ *          completed or has failed.
+ */
+struct ssam_request_sync {
+	struct ssh_request base;
+	struct completion comp;
+	struct ssam_response *resp;
+	int status;
+};
+
+int ssam_request_sync_alloc(size_t payload_len, gfp_t flags,
+			    struct ssam_request_sync **rqst,
+			    struct ssam_span *buffer);
+
+void ssam_request_sync_free(struct ssam_request_sync *rqst);
+
+int ssam_request_sync_init(struct ssam_request_sync *rqst,
+			   enum ssam_request_flags flags);
+
+/**
+ * ssam_request_sync_set_data - Set message data of a synchronous request.
+ * @rqst: The request.
+ * @ptr:  Pointer to the request message data.
+ * @len:  Length of the request message data.
+ *
+ * Set the request message data of a synchronous request. The provided buffer
+ * needs to live until the request has been completed.
+ */
+static inline void ssam_request_sync_set_data(struct ssam_request_sync *rqst,
+					      u8 *ptr, size_t len)
+{
+	ssh_request_set_data(&rqst->base, ptr, len);
+}
+
+/**
+ * ssam_request_sync_set_resp - Set response buffer of a synchronous request.
+ * @rqst: The request.
+ * @resp: The response buffer.
+ *
+ * Sets the response buffer of a synchronous request. This buffer will store
+ * the response of the request after it has been completed. May be %NULL if no
+ * response is expected.
+ */
+static inline void ssam_request_sync_set_resp(struct ssam_request_sync *rqst,
+					      struct ssam_response *resp)
+{
+	rqst->resp = resp;
+}
+
+int ssam_request_sync_submit(struct ssam_controller *ctrl,
+			     struct ssam_request_sync *rqst);
+
+/**
+ * ssam_request_sync_wait - Wait for completion of a synchronous request.
+ * @rqst: The request to wait for.
+ *
+ * Wait for completion and release of a synchronous request. After this
+ * function terminates, the request is guaranteed to have left the transport
+ * system. After successful submission of a request, this function must be
+ * called before accessing the response of the request, freeing the request,
+ * or freeing any of the buffers associated with the request.
+ *
+ * This function must not be called if the request has not been submitted yet
+ * and may lead to a deadlock/infinite wait if a subsequent request submission
+ * fails in that case, due to the completion never triggering.
+ *
+ * Return: Returns the status of the given request, which is set on completion
+ * of the packet. This value is zero on success and negative on failure.
+ */
+static inline int ssam_request_sync_wait(struct ssam_request_sync *rqst)
+{
+	wait_for_completion(&rqst->comp);
+	return rqst->status;
+}
+
+int ssam_request_sync(struct ssam_controller *ctrl,
+		      const struct ssam_request *spec,
+		      struct ssam_response *rsp);
+
+int ssam_request_sync_with_buffer(struct ssam_controller *ctrl,
+				  const struct ssam_request *spec,
+				  struct ssam_response *rsp,
+				  struct ssam_span *buf);
+
+/**
+ * ssam_request_sync_onstack - Execute a synchronous request on the stack.
+ * @ctrl: The controller via which the request is submitted.
+ * @rqst: The request specification.
+ * @rsp:  The response buffer.
+ * @payload_len: The (maximum) request payload length.
+ *
+ * Allocates a synchronous request with specified payload length on the stack,
+ * fully initializes it via the provided request specification, submits it,
+ * and finally waits for its completion before returning its status. This
+ * helper macro essentially allocates the request message buffer on the stack
+ * and then calls ssam_request_sync_with_buffer().
+ *
+ * Note: The @payload_len parameter specifies the maximum payload length, used
+ * for buffer allocation. The actual payload length may be smaller.
+ *
+ * Return: Returns the status of the request or any failure during setup, i.e.
+ * zero on success and a negative value on failure.
+ */
+#define ssam_request_sync_onstack(ctrl, rqst, rsp, payload_len)			\
+	({									\
+		u8 __data[SSH_COMMAND_MESSAGE_LENGTH(payload_len)];		\
+		struct ssam_span __buf = { &__data[0], ARRAY_SIZE(__data) };	\
+										\
+		ssam_request_sync_with_buffer(ctrl, rqst, rsp, &__buf);		\
+	})
+
+/**
+ * __ssam_retry - Retry request in case of I/O errors or timeouts.
+ * @request: The request function to execute. Must return an integer.
+ * @n:       Number of tries.
+ * @args:    Arguments for the request function.
+ *
+ * Executes the given request function, i.e. calls @request. In case the
+ * request returns %-EREMOTEIO (indicates I/O error) or %-ETIMEDOUT (request
+ * or underlying packet timed out), @request will be re-executed again, up to
+ * @n times in total.
+ *
+ * Return: Returns the return value of the last execution of @request.
+ */
+#define __ssam_retry(request, n, args...)				\
+	({								\
+		int __i, __s = 0;					\
+									\
+		for (__i = (n); __i > 0; __i--) {			\
+			__s = request(args);				\
+			if (__s != -ETIMEDOUT && __s != -EREMOTEIO)	\
+				break;					\
+		}							\
+		__s;							\
+	})
+
+/**
+ * ssam_retry - Retry request in case of I/O errors or timeouts up to three
+ * times in total.
+ * @request: The request function to execute. Must return an integer.
+ * @args:    Arguments for the request function.
+ *
+ * Executes the given request function, i.e. calls @request. In case the
+ * request returns %-EREMOTEIO (indicates I/O error) or -%ETIMEDOUT (request
+ * or underlying packet timed out), @request will be re-executed again, up to
+ * three times in total.
+ *
+ * See __ssam_retry() for a more generic macro for this purpose.
+ *
+ * Return: Returns the return value of the last execution of @request.
+ */
+#define ssam_retry(request, args...) \
+	__ssam_retry(request, 3, args)
+
+/**
+ * struct ssam_request_spec - Blue-print specification of SAM request.
+ * @target_category: Category of the request's target. See &enum ssam_ssh_tc.
+ * @target_id:       ID of the request's target.
+ * @command_id:      Command ID of the request.
+ * @instance_id:     Instance ID of the request's target.
+ * @flags:           Flags for the request. See &enum ssam_request_flags.
+ *
+ * Blue-print specification for a SAM request. This struct describes the
+ * unique static parameters of a request (i.e. type) without specifying any of
+ * its instance-specific data (e.g. payload). It is intended to be used as base
+ * for defining simple request functions via the
+ * ``SSAM_DEFINE_SYNC_REQUEST_x()`` family of macros.
+ */
+struct ssam_request_spec {
+	u8 target_category;
+	u8 target_id;
+	u8 command_id;
+	u8 instance_id;
+	u8 flags;
+};
+
+/**
+ * struct ssam_request_spec_md - Blue-print specification for multi-device SAM
+ * request.
+ * @target_category: Category of the request's target. See &enum ssam_ssh_tc.
+ * @command_id:      Command ID of the request.
+ * @flags:           Flags for the request. See &enum ssam_request_flags.
+ *
+ * Blue-print specification for a multi-device SAM request, i.e. a request
+ * that is applicable to multiple device instances, described by their
+ * individual target and instance IDs. This struct describes the unique static
+ * parameters of a request (i.e. type) without specifying any of its
+ * instance-specific data (e.g. payload) and without specifying any of its
+ * device specific IDs (i.e. target and instance ID). It is intended to be
+ * used as base for defining simple multi-device request functions via the
+ * ``SSAM_DEFINE_SYNC_REQUEST_MD_x()`` and ``SSAM_DEFINE_SYNC_REQUEST_CL_x()``
+ * families of macros.
+ */
+struct ssam_request_spec_md {
+	u8 target_category;
+	u8 command_id;
+	u8 flags;
+};
+
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_N() - Define synchronous SAM request function
+ * with neither argument nor return value.
+ * @name: Name of the generated function.
+ * @spec: Specification (&struct ssam_request_spec) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by
+ * @spec, with the request having neither argument nor return value. The
+ * generated function takes care of setting up the request struct and buffer
+ * allocation, as well as execution of the request itself, returning once the
+ * request has been fully completed. The required transport buffer will be
+ * allocated on the stack.
+ *
+ * The generated function is defined as ``int name(struct ssam_controller
+ * *ctrl)``, returning the status of the request, which is zero on success and
+ * negative on failure. The ``ctrl`` parameter is the controller via which the
+ * request is being sent.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_N(name, spec...)				\
+	int name(struct ssam_controller *ctrl)					\
+	{									\
+		struct ssam_request_spec s = (struct ssam_request_spec)spec;	\
+		struct ssam_request rqst;					\
+										\
+		rqst.target_category = s.target_category;			\
+		rqst.target_id = s.target_id;					\
+		rqst.command_id = s.command_id;					\
+		rqst.instance_id = s.instance_id;				\
+		rqst.flags = s.flags;						\
+		rqst.length = 0;						\
+		rqst.payload = NULL;						\
+										\
+		return ssam_request_sync_onstack(ctrl, &rqst, NULL, 0);		\
+	}
+
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_W() - Define synchronous SAM request function with
+ * argument.
+ * @name:  Name of the generated function.
+ * @atype: Type of the request's argument.
+ * @spec:  Specification (&struct ssam_request_spec) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by
+ * @spec, with the request taking an argument of type @atype and having no
+ * return value. The generated function takes care of setting up the request
+ * struct, buffer allocation, as well as execution of the request itself,
+ * returning once the request has been fully completed. The required transport
+ * buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``int name(struct ssam_controller
+ * *ctrl, const atype *arg)``, returning the status of the request, which is
+ * zero on success and negative on failure. The ``ctrl`` parameter is the
+ * controller via which the request is sent. The request argument is specified
+ * via the ``arg`` pointer.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_W(name, atype, spec...)			\
+	int name(struct ssam_controller *ctrl, const atype *arg)		\
+	{									\
+		struct ssam_request_spec s = (struct ssam_request_spec)spec;	\
+		struct ssam_request rqst;					\
+										\
+		rqst.target_category = s.target_category;			\
+		rqst.target_id = s.target_id;					\
+		rqst.command_id = s.command_id;					\
+		rqst.instance_id = s.instance_id;				\
+		rqst.flags = s.flags;						\
+		rqst.length = sizeof(atype);					\
+		rqst.payload = (u8 *)arg;					\
+										\
+		return ssam_request_sync_onstack(ctrl, &rqst, NULL,		\
+						 sizeof(atype));		\
+	}
+
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_R() - Define synchronous SAM request function with
+ * return value.
+ * @name:  Name of the generated function.
+ * @rtype: Type of the request's return value.
+ * @spec:  Specification (&struct ssam_request_spec) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by
+ * @spec, with the request taking no argument but having a return value of
+ * type @rtype. The generated function takes care of setting up the request
+ * and response structs, buffer allocation, as well as execution of the
+ * request itself, returning once the request has been fully completed. The
+ * required transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``int name(struct ssam_controller
+ * *ctrl, rtype *ret)``, returning the status of the request, which is zero on
+ * success and negative on failure. The ``ctrl`` parameter is the controller
+ * via which the request is sent. The request's return value is written to the
+ * memory pointed to by the ``ret`` parameter.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_R(name, rtype, spec...)			\
+	int name(struct ssam_controller *ctrl, rtype *ret)			\
+	{									\
+		struct ssam_request_spec s = (struct ssam_request_spec)spec;	\
+		struct ssam_request rqst;					\
+		struct ssam_response rsp;					\
+		int status;							\
+										\
+		rqst.target_category = s.target_category;			\
+		rqst.target_id = s.target_id;					\
+		rqst.command_id = s.command_id;					\
+		rqst.instance_id = s.instance_id;				\
+		rqst.flags = s.flags | SSAM_REQUEST_HAS_RESPONSE;		\
+		rqst.length = 0;						\
+		rqst.payload = NULL;						\
+										\
+		rsp.capacity = sizeof(rtype);					\
+		rsp.length = 0;							\
+		rsp.pointer = (u8 *)ret;					\
+										\
+		status = ssam_request_sync_onstack(ctrl, &rqst, &rsp, 0);	\
+		if (status)							\
+			return status;						\
+										\
+		if (rsp.length != sizeof(rtype)) {				\
+			struct device *dev = ssam_controller_device(ctrl);	\
+			dev_err(dev,						\
+				"rqst: invalid response length, expected %zu, got %zu (tc: %#04x, cid: %#04x)", \
+				sizeof(rtype), rsp.length, rqst.target_category,\
+				rqst.command_id);				\
+			return -EIO;						\
+		}								\
+										\
+		return 0;							\
+	}
+
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_MD_N() - Define synchronous multi-device SAM
+ * request function with neither argument nor return value.
+ * @name: Name of the generated function.
+ * @spec: Specification (&struct ssam_request_spec_md) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by
+ * @spec, with the request having neither argument nor return value. Device
+ * specifying parameters are not hard-coded, but instead must be provided to
+ * the function. The generated function takes care of setting up the request
+ * struct, buffer allocation, as well as execution of the request itself,
+ * returning once the request has been fully completed. The required transport
+ * buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``int name(struct ssam_controller
+ * *ctrl, u8 tid, u8 iid)``, returning the status of the request, which is
+ * zero on success and negative on failure. The ``ctrl`` parameter is the
+ * controller via which the request is sent, ``tid`` the target ID for the
+ * request, and ``iid`` the instance ID.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_MD_N(name, spec...)				\
+	int name(struct ssam_controller *ctrl, u8 tid, u8 iid)			\
+	{									\
+		struct ssam_request_spec_md s = (struct ssam_request_spec_md)spec; \
+		struct ssam_request rqst;					\
+										\
+		rqst.target_category = s.target_category;			\
+		rqst.target_id = tid;						\
+		rqst.command_id = s.command_id;					\
+		rqst.instance_id = iid;						\
+		rqst.flags = s.flags;						\
+		rqst.length = 0;						\
+		rqst.payload = NULL;						\
+										\
+		return ssam_request_sync_onstack(ctrl, &rqst, NULL, 0);		\
+	}
+
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_MD_W() - Define synchronous multi-device SAM
+ * request function with argument.
+ * @name:  Name of the generated function.
+ * @atype: Type of the request's argument.
+ * @spec:  Specification (&struct ssam_request_spec_md) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by
+ * @spec, with the request taking an argument of type @atype and having no
+ * return value. Device specifying parameters are not hard-coded, but instead
+ * must be provided to the function. The generated function takes care of
+ * setting up the request struct, buffer allocation, as well as execution of
+ * the request itself, returning once the request has been fully completed.
+ * The required transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``int name(struct ssam_controller
+ * *ctrl, u8 tid, u8 iid, const atype *arg)``, returning the status of the
+ * request, which is zero on success and negative on failure. The ``ctrl``
+ * parameter is the controller via which the request is sent, ``tid`` the
+ * target ID for the request, and ``iid`` the instance ID. The request argument
+ * is specified via the ``arg`` pointer.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_MD_W(name, atype, spec...)			\
+	int name(struct ssam_controller *ctrl, u8 tid, u8 iid, const atype *arg)\
+	{									\
+		struct ssam_request_spec_md s = (struct ssam_request_spec_md)spec; \
+		struct ssam_request rqst;					\
+										\
+		rqst.target_category = s.target_category;			\
+		rqst.target_id = tid;						\
+		rqst.command_id = s.command_id;					\
+		rqst.instance_id = iid;						\
+		rqst.flags = s.flags;						\
+		rqst.length = sizeof(atype);					\
+		rqst.payload = (u8 *)arg;					\
+										\
+		return ssam_request_sync_onstack(ctrl, &rqst, NULL,		\
+						 sizeof(atype));		\
+	}
+
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_MD_R() - Define synchronous multi-device SAM
+ * request function with return value.
+ * @name:  Name of the generated function.
+ * @rtype: Type of the request's return value.
+ * @spec:  Specification (&struct ssam_request_spec_md) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by
+ * @spec, with the request taking no argument but having a return value of
+ * type @rtype. Device specifying parameters are not hard-coded, but instead
+ * must be provided to the function. The generated function takes care of
+ * setting up the request and response structs, buffer allocation, as well as
+ * execution of the request itself, returning once the request has been fully
+ * completed. The required transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``int name(struct ssam_controller
+ * *ctrl, u8 tid, u8 iid, rtype *ret)``, returning the status of the request,
+ * which is zero on success and negative on failure. The ``ctrl`` parameter is
+ * the controller via which the request is sent, ``tid`` the target ID for the
+ * request, and ``iid`` the instance ID. The request's return value is written
+ * to the memory pointed to by the ``ret`` parameter.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_MD_R(name, rtype, spec...)			\
+	int name(struct ssam_controller *ctrl, u8 tid, u8 iid, rtype *ret)	\
+	{									\
+		struct ssam_request_spec_md s = (struct ssam_request_spec_md)spec; \
+		struct ssam_request rqst;					\
+		struct ssam_response rsp;					\
+		int status;							\
+										\
+		rqst.target_category = s.target_category;			\
+		rqst.target_id = tid;						\
+		rqst.command_id = s.command_id;					\
+		rqst.instance_id = iid;						\
+		rqst.flags = s.flags | SSAM_REQUEST_HAS_RESPONSE;		\
+		rqst.length = 0;						\
+		rqst.payload = NULL;						\
+										\
+		rsp.capacity = sizeof(rtype);					\
+		rsp.length = 0;							\
+		rsp.pointer = (u8 *)ret;					\
+										\
+		status = ssam_request_sync_onstack(ctrl, &rqst, &rsp, 0);	\
+		if (status)							\
+			return status;						\
+										\
+		if (rsp.length != sizeof(rtype)) {				\
+			struct device *dev = ssam_controller_device(ctrl);	\
+			dev_err(dev,						\
+				"rqst: invalid response length, expected %zu, got %zu (tc: %#04x, cid: %#04x)", \
+				sizeof(rtype), rsp.length, rqst.target_category,\
+				rqst.command_id);				\
+			return -EIO;						\
+		}								\
+										\
+		return 0;							\
+	}
+
+
+/* -- Event notifier/callbacks. --------------------------------------------- */
+
+#define SSAM_NOTIF_STATE_SHIFT		2
+#define SSAM_NOTIF_STATE_MASK		((1 << SSAM_NOTIF_STATE_SHIFT) - 1)
+
+/**
+ * enum ssam_notif_flags - Flags used in return values from SSAM notifier
+ * callback functions.
+ *
+ * @SSAM_NOTIF_HANDLED:
+ *	Indicates that the notification has been handled. This flag should be
+ *	set by the handler if the handler can act/has acted upon the event
+ *	provided to it. This flag should not be set if the handler is not a
+ *	primary handler intended for the provided event.
+ *
+ *	If this flag has not been set by any handler after the notifier chain
+ *	has been traversed, a warning will be emitted, stating that the event
+ *	has not been handled.
+ *
+ * @SSAM_NOTIF_STOP:
+ *	Indicates that the notifier traversal should stop. If this flag is
+ *	returned from a notifier callback, notifier chain traversal will
+ *	immediately stop and any remaining notifiers will not be called. This
+ *	flag is automatically set when ssam_notifier_from_errno() is called
+ *	with a negative error value.
+ */
+enum ssam_notif_flags {
+	SSAM_NOTIF_HANDLED = BIT(0),
+	SSAM_NOTIF_STOP    = BIT(1),
+};
+
+struct ssam_event_notifier;
+
+typedef u32 (*ssam_notifier_fn_t)(struct ssam_event_notifier *nf,
+				  const struct ssam_event *event);
+
+/**
+ * struct ssam_notifier_block - Base notifier block for SSAM event
+ * notifications.
+ * @node:     The node for the list of notifiers.
+ * @fn:       The callback function of this notifier. This function takes the
+ *            respective notifier block and event as input and should return
+ *            a notifier value, which can either be obtained from the flags
+ *            provided in &enum ssam_notif_flags, converted from a standard
+ *            error value via ssam_notifier_from_errno(), or a combination of
+ *            both (e.g. ``ssam_notifier_from_errno(e) | SSAM_NOTIF_HANDLED``).
+ * @priority: Priority value determining the order in which notifier callbacks
+ *            will be called. A higher value means higher priority, i.e. the
+ *            associated callback will be executed earlier than other (lower
+ *            priority) callbacks.
+ */
+struct ssam_notifier_block {
+	struct list_head node;
+	ssam_notifier_fn_t fn;
+	int priority;
+};
+
+/**
+ * ssam_notifier_from_errno() - Convert standard error value to notifier
+ * return code.
+ * @err: The error code to convert, must be negative (in case of failure) or
+ *       zero (in case of success).
+ *
+ * Return: Returns the notifier return value obtained by converting the
+ * specified @err value. In case @err is negative, the %SSAM_NOTIF_STOP flag
+ * will be set, causing notifier call chain traversal to abort.
+ */
+static inline u32 ssam_notifier_from_errno(int err)
+{
+	if (WARN_ON(err > 0) || err == 0)
+		return 0;
+	else
+		return ((-err) << SSAM_NOTIF_STATE_SHIFT) | SSAM_NOTIF_STOP;
+}
+
+/**
+ * ssam_notifier_to_errno() - Convert notifier return code to standard error
+ * value.
+ * @ret: The notifier return value to convert.
+ *
+ * Return: Returns the negative error value encoded in @ret or zero if @ret
+ * indicates success.
+ */
+static inline int ssam_notifier_to_errno(u32 ret)
+{
+	return -(ret >> SSAM_NOTIF_STATE_SHIFT);
+}
+
+
+/* -- Event/notification registry. ------------------------------------------ */
+
+/**
+ * struct ssam_event_registry - Registry specification used for enabling events.
+ * @target_category: Target category for the event registry requests.
+ * @target_id:       Target ID for the event registry requests.
+ * @cid_enable:      Command ID for the event-enable request.
+ * @cid_disable:     Command ID for the event-disable request.
+ *
+ * This struct describes a SAM event registry via the minimal collection of
+ * SAM IDs specifying the requests to use for enabling and disabling an event.
+ * The individual event to be enabled/disabled itself is specified via &struct
+ * ssam_event_id.
+ */
+struct ssam_event_registry {
+	u8 target_category;
+	u8 target_id;
+	u8 cid_enable;
+	u8 cid_disable;
+};
+
+/**
+ * struct ssam_event_id - Unique event ID used for enabling events.
+ * @target_category: Target category of the event source.
+ * @instance:        Instance ID of the event source.
+ *
+ * This struct specifies the event to be enabled/disabled via an externally
+ * provided registry. It does not specify the registry to be used itself, this
+ * is done via &struct ssam_event_registry.
+ */
+struct ssam_event_id {
+	u8 target_category;
+	u8 instance;
+};
+
+/**
+ * enum ssam_event_mask - Flags specifying how events are matched to notifiers.
+ *
+ * @SSAM_EVENT_MASK_NONE:
+ *	Run the callback for any event with matching target category. Do not
+ *	do any additional filtering.
+ *
+ * @SSAM_EVENT_MASK_TARGET:
+ *	In addition to filtering by target category, only execute the notifier
+ *	callback for events with a target ID matching to the one of the
+ *	registry used for enabling/disabling the event.
+ *
+ * @SSAM_EVENT_MASK_INSTANCE:
+ *	In addition to filtering by target category, only execute the notifier
+ *	callback for events with an instance ID matching to the instance ID
+ *	used when enabling the event.
+ *
+ * @SSAM_EVENT_MASK_STRICT:
+ *	Do all the filtering above.
+ */
+enum ssam_event_mask {
+	SSAM_EVENT_MASK_TARGET   = BIT(0),
+	SSAM_EVENT_MASK_INSTANCE = BIT(1),
+
+	SSAM_EVENT_MASK_NONE = 0,
+	SSAM_EVENT_MASK_STRICT =
+		  SSAM_EVENT_MASK_TARGET
+		| SSAM_EVENT_MASK_INSTANCE,
+};
+
+/**
+ * SSAM_EVENT_REGISTRY() - Define a new event registry.
+ * @tc:      Target category for the event registry requests.
+ * @tid:     Target ID for the event registry requests.
+ * @cid_en:  Command ID for the event-enable request.
+ * @cid_dis: Command ID for the event-disable request.
+ *
+ * Return: Returns the &struct ssam_event_registry specified by the given
+ * parameters.
+ */
+#define SSAM_EVENT_REGISTRY(tc, tid, cid_en, cid_dis)	\
+	((struct ssam_event_registry) {			\
+		.target_category = (tc),		\
+		.target_id = (tid),			\
+		.cid_enable = (cid_en),			\
+		.cid_disable = (cid_dis),		\
+	})
+
+#define SSAM_EVENT_REGISTRY_SAM	\
+	SSAM_EVENT_REGISTRY(SSAM_SSH_TC_SAM, 0x01, 0x0b, 0x0c)
+
+#define SSAM_EVENT_REGISTRY_KIP	\
+	SSAM_EVENT_REGISTRY(SSAM_SSH_TC_KIP, 0x02, 0x27, 0x28)
+
+#define SSAM_EVENT_REGISTRY_REG \
+	SSAM_EVENT_REGISTRY(SSAM_SSH_TC_REG, 0x02, 0x01, 0x02)
+
+/**
+ * struct ssam_event_notifier - Notifier block for SSAM events.
+ * @base:        The base notifier block with callback function and priority.
+ * @event:       The event for which this block will receive notifications.
+ * @event.reg:   Registry via which the event will be enabled/disabled.
+ * @event.id:    ID specifying the event.
+ * @event.mask:  Flags determining how events are matched to the notifier.
+ * @event.flags: Flags used for enabling the event.
+ */
+struct ssam_event_notifier {
+	struct ssam_notifier_block base;
+
+	struct {
+		struct ssam_event_registry reg;
+		struct ssam_event_id id;
+		enum ssam_event_mask mask;
+		u8 flags;
+	} event;
+};
+
+int ssam_notifier_register(struct ssam_controller *ctrl,
+			   struct ssam_event_notifier *n);
+
+int ssam_notifier_unregister(struct ssam_controller *ctrl,
+			     struct ssam_event_notifier *n);
+
+#endif /* _LINUX_SURFACE_AGGREGATOR_CONTROLLER_H */
diff --git a/include/linux/surface_aggregator/serial_hub.h b/include/linux/surface_aggregator/serial_hub.h
new file mode 100644
index 000000000000..64276fbfa1d5
--- /dev/null
+++ b/include/linux/surface_aggregator/serial_hub.h
@@ -0,0 +1,672 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Surface Serial Hub (SSH) protocol and communication interface.
+ *
+ * Lower-level communication layers and SSH protocol definitions for the
+ * Surface System Aggregator Module (SSAM). Provides the interface for basic
+ * packet- and request-based communication with the SSAM EC via SSH.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H
+#define _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H
+
+#include <linux/crc-ccitt.h>
+#include <linux/kref.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+
+/* -- Data structures for SAM-over-SSH communication. ----------------------- */
+
+/**
+ * enum ssh_frame_type - Frame types for SSH frames.
+ *
+ * @SSH_FRAME_TYPE_DATA_SEQ:
+ *	Indicates a data frame, followed by a payload with the length specified
+ *	in the ``struct ssh_frame.len`` field. This frame is sequenced, meaning
+ *	that an ACK is required.
+ *
+ * @SSH_FRAME_TYPE_DATA_NSQ:
+ *	Same as %SSH_FRAME_TYPE_DATA_SEQ, but unsequenced, meaning that the
+ *	message does not have to be ACKed.
+ *
+ * @SSH_FRAME_TYPE_ACK:
+ *	Indicates an ACK message.
+ *
+ * @SSH_FRAME_TYPE_NAK:
+ *	Indicates an error response for previously sent frame. In general, this
+ *	means that the frame and/or payload is malformed, e.g. a CRC is wrong.
+ *	For command-type payloads, this can also mean that the command is
+ *	invalid.
+ */
+enum ssh_frame_type {
+	SSH_FRAME_TYPE_DATA_SEQ = 0x80,
+	SSH_FRAME_TYPE_DATA_NSQ = 0x00,
+	SSH_FRAME_TYPE_ACK      = 0x40,
+	SSH_FRAME_TYPE_NAK      = 0x04,
+};
+
+/**
+ * struct ssh_frame - SSH communication frame.
+ * @type: The type of the frame. See &enum ssh_frame_type.
+ * @len:  The length of the frame payload directly following the CRC for this
+ *        frame. Does not include the final CRC for that payload.
+ * @seq:  The sequence number for this message/exchange.
+ */
+struct ssh_frame {
+	u8 type;
+	__le16 len;
+	u8 seq;
+} __packed;
+
+static_assert(sizeof(struct ssh_frame) == 4);
+
+/*
+ * SSH_FRAME_MAX_PAYLOAD_SIZE - Maximum SSH frame payload length in bytes.
+ *
+ * This is the physical maximum length of the protocol. Implementations may
+ * set a more constrained limit.
+ */
+#define SSH_FRAME_MAX_PAYLOAD_SIZE	U16_MAX
+
+/**
+ * enum ssh_payload_type - Type indicator for the SSH payload.
+ * @SSH_PLD_TYPE_CMD: The payload is a command structure with optional command
+ *                    payload.
+ */
+enum ssh_payload_type {
+	SSH_PLD_TYPE_CMD = 0x80,
+};
+
+/**
+ * struct ssh_command - Payload of a command-type frame.
+ * @type:    The type of the payload. See &enum ssh_payload_type. Should be
+ *           SSH_PLD_TYPE_CMD for this struct.
+ * @tc:      Command target category.
+ * @tid_out: Output target ID. Should be zero if this an incoming (EC to host)
+ *           message.
+ * @tid_in:  Input target ID. Should be zero if this is an outgoing (host to
+ *           EC) message.
+ * @iid:     Instance ID.
+ * @rqid:    Request ID. Used to match requests with responses and differentiate
+ *           between responses and events.
+ * @cid:     Command ID.
+ */
+struct ssh_command {
+	u8 type;
+	u8 tc;
+	u8 tid_out;
+	u8 tid_in;
+	u8 iid;
+	__le16 rqid;
+	u8 cid;
+} __packed;
+
+static_assert(sizeof(struct ssh_command) == 8);
+
+/*
+ * SSH_COMMAND_MAX_PAYLOAD_SIZE - Maximum SSH command payload length in bytes.
+ *
+ * This is the physical maximum length of the protocol. Implementations may
+ * set a more constrained limit.
+ */
+#define SSH_COMMAND_MAX_PAYLOAD_SIZE \
+	(SSH_FRAME_MAX_PAYLOAD_SIZE - sizeof(struct ssh_command))
+
+/*
+ * SSH_MSG_LEN_BASE - Base-length of a SSH message.
+ *
+ * This is the minimum number of bytes required to form a message. The actual
+ * message length is SSH_MSG_LEN_BASE plus the length of the frame payload.
+ */
+#define SSH_MSG_LEN_BASE	(sizeof(struct ssh_frame) + 3ull * sizeof(u16))
+
+/*
+ * SSH_MSG_LEN_CTRL - Length of a SSH control message.
+ *
+ * This is the length of a SSH control message, which is equal to a SSH
+ * message without any payload.
+ */
+#define SSH_MSG_LEN_CTRL	SSH_MSG_LEN_BASE
+
+/**
+ * SSH_MESSAGE_LENGTH() - Compute length of SSH message.
+ * @payload_size: Length of the payload inside the SSH frame.
+ *
+ * Return: Returns the length of a SSH message with payload of specified size.
+ */
+#define SSH_MESSAGE_LENGTH(payload_size) (SSH_MSG_LEN_BASE + (payload_size))
+
+/**
+ * SSH_COMMAND_MESSAGE_LENGTH() - Compute length of SSH command message.
+ * @payload_size: Length of the command payload.
+ *
+ * Return: Returns the length of a SSH command message with command payload of
+ * specified size.
+ */
+#define SSH_COMMAND_MESSAGE_LENGTH(payload_size) \
+	SSH_MESSAGE_LENGTH(sizeof(struct ssh_command) + (payload_size))
+
+/**
+ * SSH_MSGOFFSET_FRAME() - Compute offset in SSH message to specified field in
+ * frame.
+ * @field: The field for which the offset should be computed.
+ *
+ * Return: Returns the offset of the specified &struct ssh_frame field in the
+ * raw SSH message data as. Takes SYN bytes (u16) preceding the frame into
+ * account.
+ */
+#define SSH_MSGOFFSET_FRAME(field) \
+	(sizeof(u16) + offsetof(struct ssh_frame, field))
+
+/**
+ * SSH_MSGOFFSET_COMMAND() - Compute offset in SSH message to specified field
+ * in command.
+ * @field: The field for which the offset should be computed.
+ *
+ * Return: Returns the offset of the specified &struct ssh_command field in
+ * the raw SSH message data. Takes SYN bytes (u16) preceding the frame and the
+ * frame CRC (u16) between frame and command into account.
+ */
+#define SSH_MSGOFFSET_COMMAND(field) \
+	(2ull * sizeof(u16) + sizeof(struct ssh_frame) \
+		+ offsetof(struct ssh_command, field))
+
+/*
+ * SSH_MSG_SYN - SSH message synchronization (SYN) bytes as u16.
+ */
+#define SSH_MSG_SYN		((u16)0x55aa)
+
+/**
+ * ssh_crc() - Compute CRC for SSH messages.
+ * @buf: The pointer pointing to the data for which the CRC should be computed.
+ * @len: The length of the data for which the CRC should be computed.
+ *
+ * Return: Returns the CRC computed on the provided data, as used for SSH
+ * messages.
+ */
+static inline u16 ssh_crc(const u8 *buf, size_t len)
+{
+	return crc_ccitt_false(0xffff, buf, len);
+}
+
+/*
+ * SSH_NUM_EVENTS - The number of reserved event IDs.
+ *
+ * The number of reserved event IDs, used for registering an SSH event
+ * handler. Valid event IDs are numbers below or equal to this value, with
+ * exception of zero, which is not an event ID. Thus, this is also the
+ * absolute maximum number of event handlers that can be registered.
+ */
+#define SSH_NUM_EVENTS		34
+
+/*
+ * SSH_NUM_TARGETS - The number of communication targets used in the protocol.
+ */
+#define SSH_NUM_TARGETS		2
+
+/**
+ * ssh_rqid_next_valid() - Return the next valid request ID.
+ * @rqid: The current request ID.
+ *
+ * Return: Returns the next valid request ID, following the current request ID
+ * provided to this function. This function skips any request IDs reserved for
+ * events.
+ */
+static inline u16 ssh_rqid_next_valid(u16 rqid)
+{
+	return rqid > 0 ? rqid + 1u : rqid + SSH_NUM_EVENTS + 1u;
+}
+
+/**
+ * ssh_rqid_to_event() - Convert request ID to its corresponding event ID.
+ * @rqid: The request ID to convert.
+ */
+static inline u16 ssh_rqid_to_event(u16 rqid)
+{
+	return rqid - 1u;
+}
+
+/**
+ * ssh_rqid_is_event() - Check if given request ID is a valid event ID.
+ * @rqid: The request ID to check.
+ */
+static inline bool ssh_rqid_is_event(u16 rqid)
+{
+	return ssh_rqid_to_event(rqid) < SSH_NUM_EVENTS;
+}
+
+/**
+ * ssh_tc_to_rqid() - Convert target category to its corresponding request ID.
+ * @tc: The target category to convert.
+ */
+static inline u16 ssh_tc_to_rqid(u8 tc)
+{
+	return tc;
+}
+
+/**
+ * ssh_tid_to_index() - Convert target ID to its corresponding target index.
+ * @tid: The target ID to convert.
+ */
+static inline u8 ssh_tid_to_index(u8 tid)
+{
+	return tid - 1u;
+}
+
+/**
+ * ssh_tid_is_valid() - Check if target ID is valid/supported.
+ * @tid: The target ID to check.
+ */
+static inline bool ssh_tid_is_valid(u8 tid)
+{
+	return ssh_tid_to_index(tid) < SSH_NUM_TARGETS;
+}
+
+/**
+ * struct ssam_span - Reference to a buffer region.
+ * @ptr: Pointer to the buffer region.
+ * @len: Length of the buffer region.
+ *
+ * A reference to a (non-owned) buffer segment, consisting of pointer and
+ * length. Use of this struct indicates non-owned data, i.e. data of which the
+ * life-time is managed (i.e. it is allocated/freed) via another pointer.
+ */
+struct ssam_span {
+	u8    *ptr;
+	size_t len;
+};
+
+/*
+ * Known SSH/EC target categories.
+ *
+ * List of currently known target category values; "Known" as in we know they
+ * exist and are valid on at least some device/model. Detailed functionality
+ * or the full category name is only known for some of these categories and
+ * is detailed in the respective comment below.
+ *
+ * These values and abbreviations have been extracted from strings inside the
+ * Windows driver.
+ */
+enum ssam_ssh_tc {
+				/* Category 0x00 is invalid for EC use. */
+	SSAM_SSH_TC_SAM = 0x01,	/* Generic system functionality, real-time clock. */
+	SSAM_SSH_TC_BAT = 0x02,	/* Battery/power subsystem. */
+	SSAM_SSH_TC_TMP = 0x03,	/* Thermal subsystem. */
+	SSAM_SSH_TC_PMC = 0x04,
+	SSAM_SSH_TC_FAN = 0x05,
+	SSAM_SSH_TC_PoM = 0x06,
+	SSAM_SSH_TC_DBG = 0x07,
+	SSAM_SSH_TC_KBD = 0x08,	/* Legacy keyboard (Laptop 1/2). */
+	SSAM_SSH_TC_FWU = 0x09,
+	SSAM_SSH_TC_UNI = 0x0a,
+	SSAM_SSH_TC_LPC = 0x0b,
+	SSAM_SSH_TC_TCL = 0x0c,
+	SSAM_SSH_TC_SFL = 0x0d,
+	SSAM_SSH_TC_KIP = 0x0e,
+	SSAM_SSH_TC_EXT = 0x0f,
+	SSAM_SSH_TC_BLD = 0x10,
+	SSAM_SSH_TC_BAS = 0x11,	/* Detachment system (Surface Book 2/3). */
+	SSAM_SSH_TC_SEN = 0x12,
+	SSAM_SSH_TC_SRQ = 0x13,
+	SSAM_SSH_TC_MCU = 0x14,
+	SSAM_SSH_TC_HID = 0x15,	/* Generic HID input subsystem. */
+	SSAM_SSH_TC_TCH = 0x16,
+	SSAM_SSH_TC_BKL = 0x17,
+	SSAM_SSH_TC_TAM = 0x18,
+	SSAM_SSH_TC_ACC = 0x19,
+	SSAM_SSH_TC_UFI = 0x1a,
+	SSAM_SSH_TC_USC = 0x1b,
+	SSAM_SSH_TC_PEN = 0x1c,
+	SSAM_SSH_TC_VID = 0x1d,
+	SSAM_SSH_TC_AUD = 0x1e,
+	SSAM_SSH_TC_SMC = 0x1f,
+	SSAM_SSH_TC_KPD = 0x20,
+	SSAM_SSH_TC_REG = 0x21,	/* Extended event registry. */
+};
+
+
+/* -- Packet transport layer (ptl). ----------------------------------------- */
+
+/**
+ * enum ssh_packet_base_priority - Base priorities for &struct ssh_packet.
+ * @SSH_PACKET_PRIORITY_FLUSH: Base priority for flush packets.
+ * @SSH_PACKET_PRIORITY_DATA:  Base priority for normal data packets.
+ * @SSH_PACKET_PRIORITY_NAK:   Base priority for NAK packets.
+ * @SSH_PACKET_PRIORITY_ACK:   Base priority for ACK packets.
+ */
+enum ssh_packet_base_priority {
+	SSH_PACKET_PRIORITY_FLUSH = 0,	/* same as DATA to sequence flush */
+	SSH_PACKET_PRIORITY_DATA  = 0,
+	SSH_PACKET_PRIORITY_NAK   = 1,
+	SSH_PACKET_PRIORITY_ACK   = 2,
+};
+
+/*
+ * Same as SSH_PACKET_PRIORITY() below, only with actual values.
+ */
+#define __SSH_PACKET_PRIORITY(base, try) \
+	(((base) << 4) | ((try) & 0x0f))
+
+/**
+ * SSH_PACKET_PRIORITY() - Compute packet priority from base priority and
+ * number of tries.
+ * @base: The base priority as suffix of &enum ssh_packet_base_priority, e.g.
+ *        ``FLUSH``, ``DATA``, ``ACK``, or ``NAK``.
+ * @try:  The number of tries (must be less than 16).
+ *
+ * Compute the combined packet priority. The combined priority is dominated by
+ * the base priority, whereas the number of (re-)tries decides the precedence
+ * of packets with the same base priority, giving higher priority to packets
+ * that already have more tries.
+ *
+ * Return: Returns the computed priority as value fitting inside a &u8. A
+ * higher number means a higher priority.
+ */
+#define SSH_PACKET_PRIORITY(base, try) \
+	__SSH_PACKET_PRIORITY(SSH_PACKET_PRIORITY_##base, (try))
+
+/**
+ * ssh_packet_priority_get_try() - Get number of tries from packet priority.
+ * @priority: The packet priority.
+ *
+ * Return: Returns the number of tries encoded in the specified packet
+ * priority.
+ */
+static inline u8 ssh_packet_priority_get_try(u8 priority)
+{
+	return priority & 0x0f;
+}
+
+/**
+ * ssh_packet_priority_get_base - Get base priority from packet priority.
+ * @priority: The packet priority.
+ *
+ * Return: Returns the base priority encoded in the given packet priority.
+ */
+static inline u8 ssh_packet_priority_get_base(u8 priority)
+{
+	return (priority & 0xf0) >> 4;
+}
+
+enum ssh_packet_flags {
+	/* state flags */
+	SSH_PACKET_SF_LOCKED_BIT,
+	SSH_PACKET_SF_QUEUED_BIT,
+	SSH_PACKET_SF_PENDING_BIT,
+	SSH_PACKET_SF_TRANSMITTING_BIT,
+	SSH_PACKET_SF_TRANSMITTED_BIT,
+	SSH_PACKET_SF_ACKED_BIT,
+	SSH_PACKET_SF_CANCELED_BIT,
+	SSH_PACKET_SF_COMPLETED_BIT,
+
+	/* type flags */
+	SSH_PACKET_TY_FLUSH_BIT,
+	SSH_PACKET_TY_SEQUENCED_BIT,
+	SSH_PACKET_TY_BLOCKING_BIT,
+
+	/* mask for state flags */
+	SSH_PACKET_FLAGS_SF_MASK =
+		  BIT(SSH_PACKET_SF_LOCKED_BIT)
+		| BIT(SSH_PACKET_SF_QUEUED_BIT)
+		| BIT(SSH_PACKET_SF_PENDING_BIT)
+		| BIT(SSH_PACKET_SF_TRANSMITTING_BIT)
+		| BIT(SSH_PACKET_SF_TRANSMITTED_BIT)
+		| BIT(SSH_PACKET_SF_ACKED_BIT)
+		| BIT(SSH_PACKET_SF_CANCELED_BIT)
+		| BIT(SSH_PACKET_SF_COMPLETED_BIT),
+
+	/* mask for type flags */
+	SSH_PACKET_FLAGS_TY_MASK =
+		  BIT(SSH_PACKET_TY_FLUSH_BIT)
+		| BIT(SSH_PACKET_TY_SEQUENCED_BIT)
+		| BIT(SSH_PACKET_TY_BLOCKING_BIT),
+};
+
+struct ssh_ptl;
+struct ssh_packet;
+
+/**
+ * struct ssh_packet_ops - Callback operations for a SSH packet.
+ * @release:  Function called when the packet reference count reaches zero.
+ *            This callback must be relied upon to ensure that the packet has
+ *            left the transport system(s).
+ * @complete: Function called when the packet is completed, either with
+ *            success or failure. In case of failure, the reason for the
+ *            failure is indicated by the value of the provided status code
+ *            argument. This value will be zero in case of success. Note that
+ *            a call to this callback does not guarantee that the packet is
+ *            not in use by the transport system any more.
+ */
+struct ssh_packet_ops {
+	void (*release)(struct ssh_packet *p);
+	void (*complete)(struct ssh_packet *p, int status);
+};
+
+/**
+ * struct ssh_packet - SSH transport packet.
+ * @ptl:      Pointer to the packet transport layer. May be %NULL if the packet
+ *            (or enclosing request) has not been submitted yet.
+ * @refcnt:   Reference count of the packet.
+ * @priority: Priority of the packet. Must be computed via
+ *            SSH_PACKET_PRIORITY(). Must only be accessed while holding the
+ *            queue lock after first submission.
+ * @data:     Raw message data.
+ * @data.len: Length of the raw message data.
+ * @data.ptr: Pointer to the raw message data buffer.
+ * @state:    State and type flags describing current packet state (dynamic)
+ *            and type (static). See &enum ssh_packet_flags for possible
+ *            options.
+ * @timestamp: Timestamp specifying when the latest transmission of a
+ *            currently pending packet has been started. May be %KTIME_MAX
+ *            before or in-between transmission attempts. Used for the packet
+ *            timeout implementation. Must only be accessed while holding the
+ *            pending lock after first submission.
+ * @queue_node:	The list node for the packet queue.
+ * @pending_node: The list node for the set of pending packets.
+ * @ops:      Packet operations.
+ */
+struct ssh_packet {
+	struct ssh_ptl *ptl;
+	struct kref refcnt;
+
+	u8 priority;
+
+	struct {
+		size_t len;
+		u8 *ptr;
+	} data;
+
+	unsigned long state;
+	ktime_t timestamp;
+
+	struct list_head queue_node;
+	struct list_head pending_node;
+
+	const struct ssh_packet_ops *ops;
+};
+
+struct ssh_packet *ssh_packet_get(struct ssh_packet *p);
+void ssh_packet_put(struct ssh_packet *p);
+
+/**
+ * ssh_packet_set_data() - Set raw message data of packet.
+ * @p:   The packet for which the message data should be set.
+ * @ptr: Pointer to the memory holding the message data.
+ * @len: Length of the message data.
+ *
+ * Sets the raw message data buffer of the packet to the provided memory. The
+ * memory is not copied. Instead, the caller is responsible for management
+ * (i.e. allocation and deallocation) of the memory. The caller must ensure
+ * that the provided memory is valid and contains a valid SSH message,
+ * starting from the time of submission of the packet until the ``release``
+ * callback has been called. During this time, the memory may not be altered
+ * in any way.
+ */
+static inline void ssh_packet_set_data(struct ssh_packet *p, u8 *ptr, size_t len)
+{
+	p->data.ptr = ptr;
+	p->data.len = len;
+}
+
+
+/* -- Request transport layer (rtl). ---------------------------------------- */
+
+enum ssh_request_flags {
+	/* state flags */
+	SSH_REQUEST_SF_LOCKED_BIT,
+	SSH_REQUEST_SF_QUEUED_BIT,
+	SSH_REQUEST_SF_PENDING_BIT,
+	SSH_REQUEST_SF_TRANSMITTING_BIT,
+	SSH_REQUEST_SF_TRANSMITTED_BIT,
+	SSH_REQUEST_SF_RSPRCVD_BIT,
+	SSH_REQUEST_SF_CANCELED_BIT,
+	SSH_REQUEST_SF_COMPLETED_BIT,
+
+	/* type flags */
+	SSH_REQUEST_TY_FLUSH_BIT,
+	SSH_REQUEST_TY_HAS_RESPONSE_BIT,
+
+	/* mask for state flags */
+	SSH_REQUEST_FLAGS_SF_MASK =
+		  BIT(SSH_REQUEST_SF_LOCKED_BIT)
+		| BIT(SSH_REQUEST_SF_QUEUED_BIT)
+		| BIT(SSH_REQUEST_SF_PENDING_BIT)
+		| BIT(SSH_REQUEST_SF_TRANSMITTING_BIT)
+		| BIT(SSH_REQUEST_SF_TRANSMITTED_BIT)
+		| BIT(SSH_REQUEST_SF_RSPRCVD_BIT)
+		| BIT(SSH_REQUEST_SF_CANCELED_BIT)
+		| BIT(SSH_REQUEST_SF_COMPLETED_BIT),
+
+	/* mask for type flags */
+	SSH_REQUEST_FLAGS_TY_MASK =
+		  BIT(SSH_REQUEST_TY_FLUSH_BIT)
+		| BIT(SSH_REQUEST_TY_HAS_RESPONSE_BIT),
+};
+
+struct ssh_rtl;
+struct ssh_request;
+
+/**
+ * struct ssh_request_ops - Callback operations for a SSH request.
+ * @release:  Function called when the request's reference count reaches zero.
+ *            This callback must be relied upon to ensure that the request has
+ *            left the transport systems (both, packet an request systems).
+ * @complete: Function called when the request is completed, either with
+ *            success or failure. The command data for the request response
+ *            is provided via the &struct ssh_command parameter (``cmd``),
+ *            the command payload of the request response via the &struct
+ *            ssh_span parameter (``data``).
+ *
+ *            If the request does not have any response or has not been
+ *            completed with success, both ``cmd`` and ``data`` parameters will
+ *            be NULL. If the request response does not have any command
+ *            payload, the ``data`` span will be an empty (zero-length) span.
+ *
+ *            In case of failure, the reason for the failure is indicated by
+ *            the value of the provided status code argument (``status``). This
+ *            value will be zero in case of success and a regular errno
+ *            otherwise.
+ *
+ *            Note that a call to this callback does not guarantee that the
+ *            request is not in use by the transport systems any more.
+ */
+struct ssh_request_ops {
+	void (*release)(struct ssh_request *rqst);
+	void (*complete)(struct ssh_request *rqst,
+			 const struct ssh_command *cmd,
+			 const struct ssam_span *data, int status);
+};
+
+/**
+ * struct ssh_request - SSH transport request.
+ * @packet: The underlying SSH transport packet.
+ * @node:   List node for the request queue and pending set.
+ * @state:  State and type flags describing current request state (dynamic)
+ *          and type (static). See &enum ssh_request_flags for possible
+ *          options.
+ * @timestamp: Timestamp specifying when we start waiting on the response of
+ *          the request. This is set once the underlying packet has been
+ *          completed and may be %KTIME_MAX before that, or when the request
+ *          does not expect a response. Used for the request timeout
+ *          implementation.
+ * @ops:    Request Operations.
+ */
+struct ssh_request {
+	struct ssh_packet packet;
+	struct list_head node;
+
+	unsigned long state;
+	ktime_t timestamp;
+
+	const struct ssh_request_ops *ops;
+};
+
+/**
+ * to_ssh_request() - Cast a SSH packet to its enclosing SSH request.
+ * @p: The packet to cast.
+ *
+ * Casts the given &struct ssh_packet to its enclosing &struct ssh_request.
+ * The caller is responsible for making sure that the packet is actually
+ * wrapped in a &struct ssh_request.
+ *
+ * Return: Returns the &struct ssh_request wrapping the provided packet.
+ */
+static inline struct ssh_request *to_ssh_request(struct ssh_packet *p)
+{
+	return container_of(p, struct ssh_request, packet);
+}
+
+/**
+ * ssh_request_get() - Increment reference count of request.
+ * @r: The request to increment the reference count of.
+ *
+ * Increments the reference count of the given request by incrementing the
+ * reference count of the underlying &struct ssh_packet, enclosed in it.
+ *
+ * See also ssh_request_put(), ssh_packet_get().
+ *
+ * Return: Returns the request provided as input.
+ */
+static inline struct ssh_request *ssh_request_get(struct ssh_request *r)
+{
+	return r ? to_ssh_request(ssh_packet_get(&r->packet)) : NULL;
+}
+
+/**
+ * ssh_request_put() - Decrement reference count of request.
+ * @r: The request to decrement the reference count of.
+ *
+ * Decrements the reference count of the given request by decrementing the
+ * reference count of the underlying &struct ssh_packet, enclosed in it. If
+ * the reference count reaches zero, the ``release`` callback specified in the
+ * request's &struct ssh_request_ops, i.e. ``r->ops->release``, will be
+ * called.
+ *
+ * See also ssh_request_get(), ssh_packet_put().
+ */
+static inline void ssh_request_put(struct ssh_request *r)
+{
+	if (r)
+		ssh_packet_put(&r->packet);
+}
+
+/**
+ * ssh_request_set_data() - Set raw message data of request.
+ * @r:   The request for which the message data should be set.
+ * @ptr: Pointer to the memory holding the message data.
+ * @len: Length of the message data.
+ *
+ * Sets the raw message data buffer of the underlying packet to the specified
+ * buffer. Does not copy the actual message data, just sets the buffer pointer
+ * and length. Refer to ssh_packet_set_data() for more details.
+ */
+static inline void ssh_request_set_data(struct ssh_request *r, u8 *ptr, size_t len)
+{
+	ssh_packet_set_data(&r->packet, ptr, len);
+}
+
+#endif /* _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H */
-- 
cgit v1.2.3


From eb0e90a82098d4a48308abb87d2087578a83987f Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Mon, 21 Dec 2020 19:39:56 +0100
Subject: platform/surface: aggregator: Add dedicated bus and device type

The Surface Aggregator EC provides varying functionality, depending on
the Surface device. To manage this functionality, we use dedicated
client devices for each subsystem or virtual device of the EC. While
some of these clients are described as standard devices in ACPI and the
corresponding client drivers can be implemented as platform drivers in
the kernel (making use of the controller API already present), many
devices, especially on newer Surface models, cannot be found there.

To simplify management of these devices, we introduce a new bus and
client device type for the Surface Aggregator subsystem. The new device
type takes care of managing the controller reference, essentially
guaranteeing its validity for as long as the client device exists, thus
alleviating the need to manually establish device links for that purpose
in the client driver (as has to be done with the platform devices).

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20201221183959.1186143-7-luzmaximilian@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/surface/aggregator/Kconfig  |  12 +
 drivers/platform/surface/aggregator/Makefile |   4 +
 drivers/platform/surface/aggregator/bus.c    | 415 ++++++++++++++++++++++++++
 drivers/platform/surface/aggregator/bus.h    |  27 ++
 drivers/platform/surface/aggregator/core.c   |  12 +
 include/linux/mod_devicetable.h              |  18 ++
 include/linux/surface_aggregator/device.h    | 423 +++++++++++++++++++++++++++
 scripts/mod/devicetable-offsets.c            |   8 +
 scripts/mod/file2alias.c                     |  23 ++
 9 files changed, 942 insertions(+)
 create mode 100644 drivers/platform/surface/aggregator/bus.c
 create mode 100644 drivers/platform/surface/aggregator/bus.h
 create mode 100644 include/linux/surface_aggregator/device.h

(limited to 'include/linux')

diff --git a/drivers/platform/surface/aggregator/Kconfig b/drivers/platform/surface/aggregator/Kconfig
index e417bac67088..3aaeea9f0433 100644
--- a/drivers/platform/surface/aggregator/Kconfig
+++ b/drivers/platform/surface/aggregator/Kconfig
@@ -41,6 +41,18 @@ menuconfig SURFACE_AGGREGATOR
 	  module, y if you want to build it into the kernel and n if you don't
 	  want it at all.
 
+config SURFACE_AGGREGATOR_BUS
+	bool "Surface System Aggregator Module Bus"
+	depends on SURFACE_AGGREGATOR
+	default y
+	help
+	  Expands the Surface System Aggregator Module (SSAM) core driver by
+	  providing a dedicated bus and client-device type.
+
+	  This bus and device type are intended to provide and simplify support
+	  for non-platform and non-ACPI SSAM devices, i.e. SSAM devices that are
+	  not auto-detectable via the conventional means (e.g. ACPI).
+
 config SURFACE_AGGREGATOR_ERROR_INJECTION
 	bool "Surface System Aggregator Module Error Injection Capabilities"
 	depends on SURFACE_AGGREGATOR
diff --git a/drivers/platform/surface/aggregator/Makefile b/drivers/platform/surface/aggregator/Makefile
index b8b24c8ec310..c112e2c7112b 100644
--- a/drivers/platform/surface/aggregator/Makefile
+++ b/drivers/platform/surface/aggregator/Makefile
@@ -11,3 +11,7 @@ surface_aggregator-objs += ssh_parser.o
 surface_aggregator-objs += ssh_packet_layer.o
 surface_aggregator-objs += ssh_request_layer.o
 surface_aggregator-objs += controller.o
+
+ifeq ($(CONFIG_SURFACE_AGGREGATOR_BUS),y)
+surface_aggregator-objs += bus.o
+endif
diff --git a/drivers/platform/surface/aggregator/bus.c b/drivers/platform/surface/aggregator/bus.c
new file mode 100644
index 000000000000..a9b660af0917
--- /dev/null
+++ b/drivers/platform/surface/aggregator/bus.c
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Surface System Aggregator Module bus and device integration.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#include <linux/device.h>
+#include <linux/slab.h>
+
+#include <linux/surface_aggregator/controller.h>
+#include <linux/surface_aggregator/device.h>
+
+#include "bus.h"
+#include "controller.h"
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct ssam_device *sdev = to_ssam_device(dev);
+
+	return sysfs_emit(buf, "ssam:d%02Xc%02Xt%02Xi%02Xf%02X\n",
+			sdev->uid.domain, sdev->uid.category, sdev->uid.target,
+			sdev->uid.instance, sdev->uid.function);
+}
+static DEVICE_ATTR_RO(modalias);
+
+static struct attribute *ssam_device_attrs[] = {
+	&dev_attr_modalias.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(ssam_device);
+
+static int ssam_device_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	struct ssam_device *sdev = to_ssam_device(dev);
+
+	return add_uevent_var(env, "MODALIAS=ssam:d%02Xc%02Xt%02Xi%02Xf%02X",
+			      sdev->uid.domain, sdev->uid.category,
+			      sdev->uid.target, sdev->uid.instance,
+			      sdev->uid.function);
+}
+
+static void ssam_device_release(struct device *dev)
+{
+	struct ssam_device *sdev = to_ssam_device(dev);
+
+	ssam_controller_put(sdev->ctrl);
+	kfree(sdev);
+}
+
+const struct device_type ssam_device_type = {
+	.name    = "surface_aggregator_device",
+	.groups  = ssam_device_groups,
+	.uevent  = ssam_device_uevent,
+	.release = ssam_device_release,
+};
+EXPORT_SYMBOL_GPL(ssam_device_type);
+
+/**
+ * ssam_device_alloc() - Allocate and initialize a SSAM client device.
+ * @ctrl: The controller under which the device should be added.
+ * @uid:  The UID of the device to be added.
+ *
+ * Allocates and initializes a new client device. The parent of the device
+ * will be set to the controller device and the name will be set based on the
+ * UID. Note that the device still has to be added via ssam_device_add().
+ * Refer to that function for more details.
+ *
+ * Return: Returns the newly allocated and initialized SSAM client device, or
+ * %NULL if it could not be allocated.
+ */
+struct ssam_device *ssam_device_alloc(struct ssam_controller *ctrl,
+				      struct ssam_device_uid uid)
+{
+	struct ssam_device *sdev;
+
+	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+	if (!sdev)
+		return NULL;
+
+	device_initialize(&sdev->dev);
+	sdev->dev.bus = &ssam_bus_type;
+	sdev->dev.type = &ssam_device_type;
+	sdev->dev.parent = ssam_controller_device(ctrl);
+	sdev->ctrl = ssam_controller_get(ctrl);
+	sdev->uid = uid;
+
+	dev_set_name(&sdev->dev, "%02x:%02x:%02x:%02x:%02x",
+		     sdev->uid.domain, sdev->uid.category, sdev->uid.target,
+		     sdev->uid.instance, sdev->uid.function);
+
+	return sdev;
+}
+EXPORT_SYMBOL_GPL(ssam_device_alloc);
+
+/**
+ * ssam_device_add() - Add a SSAM client device.
+ * @sdev: The SSAM client device to be added.
+ *
+ * Added client devices must be guaranteed to always have a valid and active
+ * controller. Thus, this function will fail with %-ENODEV if the controller
+ * of the device has not been initialized yet, has been suspended, or has been
+ * shut down.
+ *
+ * The caller of this function should ensure that the corresponding call to
+ * ssam_device_remove() is issued before the controller is shut down. If the
+ * added device is a direct child of the controller device (default), it will
+ * be automatically removed when the controller is shut down.
+ *
+ * By default, the controller device will become the parent of the newly
+ * created client device. The parent may be changed before ssam_device_add is
+ * called, but care must be taken that a) the correct suspend/resume ordering
+ * is guaranteed and b) the client device does not outlive the controller,
+ * i.e. that the device is removed before the controller is being shut down.
+ * In case these guarantees have to be manually enforced, please refer to the
+ * ssam_client_link() and ssam_client_bind() functions, which are intended to
+ * set up device-links for this purpose.
+ *
+ * Return: Returns zero on success, a negative error code on failure.
+ */
+int ssam_device_add(struct ssam_device *sdev)
+{
+	int status;
+
+	/*
+	 * Ensure that we can only add new devices to a controller if it has
+	 * been started and is not going away soon. This works in combination
+	 * with ssam_controller_remove_clients to ensure driver presence for the
+	 * controller device, i.e. it ensures that the controller (sdev->ctrl)
+	 * is always valid and can be used for requests as long as the client
+	 * device we add here is registered as child under it. This essentially
+	 * guarantees that the client driver can always expect the preconditions
+	 * for functions like ssam_request_sync (controller has to be started
+	 * and is not suspended) to hold and thus does not have to check for
+	 * them.
+	 *
+	 * Note that for this to work, the controller has to be a parent device.
+	 * If it is not a direct parent, care has to be taken that the device is
+	 * removed via ssam_device_remove(), as device_unregister does not
+	 * remove child devices recursively.
+	 */
+	ssam_controller_statelock(sdev->ctrl);
+
+	if (sdev->ctrl->state != SSAM_CONTROLLER_STARTED) {
+		ssam_controller_stateunlock(sdev->ctrl);
+		return -ENODEV;
+	}
+
+	status = device_add(&sdev->dev);
+
+	ssam_controller_stateunlock(sdev->ctrl);
+	return status;
+}
+EXPORT_SYMBOL_GPL(ssam_device_add);
+
+/**
+ * ssam_device_remove() - Remove a SSAM client device.
+ * @sdev: The device to remove.
+ *
+ * Removes and unregisters the provided SSAM client device.
+ */
+void ssam_device_remove(struct ssam_device *sdev)
+{
+	device_unregister(&sdev->dev);
+}
+EXPORT_SYMBOL_GPL(ssam_device_remove);
+
+/**
+ * ssam_device_id_compatible() - Check if a device ID matches a UID.
+ * @id:  The device ID as potential match.
+ * @uid: The device UID matching against.
+ *
+ * Check if the given ID is a match for the given UID, i.e. if a device with
+ * the provided UID is compatible to the given ID following the match rules
+ * described in its &ssam_device_id.match_flags member.
+ *
+ * Return: Returns %true if the given UID is compatible to the match rule
+ * described by the given ID, %false otherwise.
+ */
+static bool ssam_device_id_compatible(const struct ssam_device_id *id,
+				      struct ssam_device_uid uid)
+{
+	if (id->domain != uid.domain || id->category != uid.category)
+		return false;
+
+	if ((id->match_flags & SSAM_MATCH_TARGET) && id->target != uid.target)
+		return false;
+
+	if ((id->match_flags & SSAM_MATCH_INSTANCE) && id->instance != uid.instance)
+		return false;
+
+	if ((id->match_flags & SSAM_MATCH_FUNCTION) && id->function != uid.function)
+		return false;
+
+	return true;
+}
+
+/**
+ * ssam_device_id_is_null() - Check if a device ID is null.
+ * @id: The device ID to check.
+ *
+ * Check if a given device ID is null, i.e. all zeros. Used to check for the
+ * end of ``MODULE_DEVICE_TABLE(ssam, ...)`` or similar lists.
+ *
+ * Return: Returns %true if the given ID represents a null ID, %false
+ * otherwise.
+ */
+static bool ssam_device_id_is_null(const struct ssam_device_id *id)
+{
+	return id->match_flags == 0 &&
+		id->domain == 0 &&
+		id->category == 0 &&
+		id->target == 0 &&
+		id->instance == 0 &&
+		id->function == 0 &&
+		id->driver_data == 0;
+}
+
+/**
+ * ssam_device_id_match() - Find the matching ID table entry for the given UID.
+ * @table: The table to search in.
+ * @uid:   The UID to matched against the individual table entries.
+ *
+ * Find the first match for the provided device UID in the provided ID table
+ * and return it. Returns %NULL if no match could be found.
+ */
+const struct ssam_device_id *ssam_device_id_match(const struct ssam_device_id *table,
+						  const struct ssam_device_uid uid)
+{
+	const struct ssam_device_id *id;
+
+	for (id = table; !ssam_device_id_is_null(id); ++id)
+		if (ssam_device_id_compatible(id, uid))
+			return id;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ssam_device_id_match);
+
+/**
+ * ssam_device_get_match() - Find and return the ID matching the device in the
+ * ID table of the bound driver.
+ * @dev: The device for which to get the matching ID table entry.
+ *
+ * Find the fist match for the UID of the device in the ID table of the
+ * currently bound driver and return it. Returns %NULL if the device does not
+ * have a driver bound to it, the driver does not have match_table (i.e. it is
+ * %NULL), or there is no match in the driver's match_table.
+ *
+ * This function essentially calls ssam_device_id_match() with the ID table of
+ * the bound device driver and the UID of the device.
+ *
+ * Return: Returns the first match for the UID of the device in the device
+ * driver's match table, or %NULL if no such match could be found.
+ */
+const struct ssam_device_id *ssam_device_get_match(const struct ssam_device *dev)
+{
+	const struct ssam_device_driver *sdrv;
+
+	sdrv = to_ssam_device_driver(dev->dev.driver);
+	if (!sdrv)
+		return NULL;
+
+	if (!sdrv->match_table)
+		return NULL;
+
+	return ssam_device_id_match(sdrv->match_table, dev->uid);
+}
+EXPORT_SYMBOL_GPL(ssam_device_get_match);
+
+/**
+ * ssam_device_get_match_data() - Find the ID matching the device in the
+ * ID table of the bound driver and return its ``driver_data`` member.
+ * @dev: The device for which to get the match data.
+ *
+ * Find the fist match for the UID of the device in the ID table of the
+ * corresponding driver and return its driver_data. Returns %NULL if the
+ * device does not have a driver bound to it, the driver does not have
+ * match_table (i.e. it is %NULL), there is no match in the driver's
+ * match_table, or the match does not have any driver_data.
+ *
+ * This function essentially calls ssam_device_get_match() and, if any match
+ * could be found, returns its ``struct ssam_device_id.driver_data`` member.
+ *
+ * Return: Returns the driver data associated with the first match for the UID
+ * of the device in the device driver's match table, or %NULL if no such match
+ * could be found.
+ */
+const void *ssam_device_get_match_data(const struct ssam_device *dev)
+{
+	const struct ssam_device_id *id;
+
+	id = ssam_device_get_match(dev);
+	if (!id)
+		return NULL;
+
+	return (const void *)id->driver_data;
+}
+EXPORT_SYMBOL_GPL(ssam_device_get_match_data);
+
+static int ssam_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct ssam_device_driver *sdrv = to_ssam_device_driver(drv);
+	struct ssam_device *sdev = to_ssam_device(dev);
+
+	if (!is_ssam_device(dev))
+		return 0;
+
+	return !!ssam_device_id_match(sdrv->match_table, sdev->uid);
+}
+
+static int ssam_bus_probe(struct device *dev)
+{
+	return to_ssam_device_driver(dev->driver)
+		->probe(to_ssam_device(dev));
+}
+
+static int ssam_bus_remove(struct device *dev)
+{
+	struct ssam_device_driver *sdrv = to_ssam_device_driver(dev->driver);
+
+	if (sdrv->remove)
+		sdrv->remove(to_ssam_device(dev));
+
+	return 0;
+}
+
+struct bus_type ssam_bus_type = {
+	.name   = "surface_aggregator",
+	.match  = ssam_bus_match,
+	.probe  = ssam_bus_probe,
+	.remove = ssam_bus_remove,
+};
+EXPORT_SYMBOL_GPL(ssam_bus_type);
+
+/**
+ * __ssam_device_driver_register() - Register a SSAM client device driver.
+ * @sdrv:  The driver to register.
+ * @owner: The module owning the provided driver.
+ *
+ * Please refer to the ssam_device_driver_register() macro for the normal way
+ * to register a driver from inside its owning module.
+ */
+int __ssam_device_driver_register(struct ssam_device_driver *sdrv,
+				  struct module *owner)
+{
+	sdrv->driver.owner = owner;
+	sdrv->driver.bus = &ssam_bus_type;
+
+	/* force drivers to async probe so I/O is possible in probe */
+	sdrv->driver.probe_type = PROBE_PREFER_ASYNCHRONOUS;
+
+	return driver_register(&sdrv->driver);
+}
+EXPORT_SYMBOL_GPL(__ssam_device_driver_register);
+
+/**
+ * ssam_device_driver_unregister - Unregister a SSAM device driver.
+ * @sdrv: The driver to unregister.
+ */
+void ssam_device_driver_unregister(struct ssam_device_driver *sdrv)
+{
+	driver_unregister(&sdrv->driver);
+}
+EXPORT_SYMBOL_GPL(ssam_device_driver_unregister);
+
+static int ssam_remove_device(struct device *dev, void *_data)
+{
+	struct ssam_device *sdev = to_ssam_device(dev);
+
+	if (is_ssam_device(dev))
+		ssam_device_remove(sdev);
+
+	return 0;
+}
+
+/**
+ * ssam_controller_remove_clients() - Remove SSAM client devices registered as
+ * direct children under the given controller.
+ * @ctrl: The controller to remove all direct clients for.
+ *
+ * Remove all SSAM client devices registered as direct children under the
+ * given controller. Note that this only accounts for direct children of the
+ * controller device. This does not take care of any client devices where the
+ * parent device has been manually set before calling ssam_device_add. Refer
+ * to ssam_device_add()/ssam_device_remove() for more details on those cases.
+ *
+ * To avoid new devices being added in parallel to this call, the main
+ * controller lock (not statelock) must be held during this (and if
+ * necessary, any subsequent deinitialization) call.
+ */
+void ssam_controller_remove_clients(struct ssam_controller *ctrl)
+{
+	struct device *dev;
+
+	dev = ssam_controller_device(ctrl);
+	device_for_each_child_reverse(dev, NULL, ssam_remove_device);
+}
+
+/**
+ * ssam_bus_register() - Register and set-up the SSAM client device bus.
+ */
+int ssam_bus_register(void)
+{
+	return bus_register(&ssam_bus_type);
+}
+
+/**
+ * ssam_bus_unregister() - Unregister the SSAM client device bus.
+ */
+void ssam_bus_unregister(void)
+{
+	return bus_unregister(&ssam_bus_type);
+}
diff --git a/drivers/platform/surface/aggregator/bus.h b/drivers/platform/surface/aggregator/bus.h
new file mode 100644
index 000000000000..7712baaed6a5
--- /dev/null
+++ b/drivers/platform/surface/aggregator/bus.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Surface System Aggregator Module bus and device integration.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _SURFACE_AGGREGATOR_BUS_H
+#define _SURFACE_AGGREGATOR_BUS_H
+
+#include <linux/surface_aggregator/controller.h>
+
+#ifdef CONFIG_SURFACE_AGGREGATOR_BUS
+
+void ssam_controller_remove_clients(struct ssam_controller *ctrl);
+
+int ssam_bus_register(void);
+void ssam_bus_unregister(void);
+
+#else /* CONFIG_SURFACE_AGGREGATOR_BUS */
+
+static inline void ssam_controller_remove_clients(struct ssam_controller *ctrl) {}
+static inline int ssam_bus_register(void) { return 0; }
+static inline void ssam_bus_unregister(void) {}
+
+#endif /* CONFIG_SURFACE_AGGREGATOR_BUS */
+#endif /* _SURFACE_AGGREGATOR_BUS_H */
diff --git a/drivers/platform/surface/aggregator/core.c b/drivers/platform/surface/aggregator/core.c
index b6a9dea53592..8dc2c267bcd6 100644
--- a/drivers/platform/surface/aggregator/core.c
+++ b/drivers/platform/surface/aggregator/core.c
@@ -22,6 +22,8 @@
 #include <linux/sysfs.h>
 
 #include <linux/surface_aggregator/controller.h>
+
+#include "bus.h"
 #include "controller.h"
 
 #define CREATE_TRACE_POINTS
@@ -739,6 +741,9 @@ static void ssam_serial_hub_remove(struct serdev_device *serdev)
 	sysfs_remove_group(&serdev->dev.kobj, &ssam_sam_group);
 	ssam_controller_lock(ctrl);
 
+	/* Remove all client devices. */
+	ssam_controller_remove_clients(ctrl);
+
 	/* Act as if suspending to silence events. */
 	status = ssam_ctrl_notif_display_off(ctrl);
 	if (status) {
@@ -791,6 +796,10 @@ static int __init ssam_core_init(void)
 {
 	int status;
 
+	status = ssam_bus_register();
+	if (status)
+		goto err_bus;
+
 	status = ssh_ctrl_packet_cache_init();
 	if (status)
 		goto err_cpkg;
@@ -810,6 +819,8 @@ err_register:
 err_evitem:
 	ssh_ctrl_packet_cache_destroy();
 err_cpkg:
+	ssam_bus_unregister();
+err_bus:
 	return status;
 }
 module_init(ssam_core_init);
@@ -819,6 +830,7 @@ static void __exit ssam_core_exit(void)
 	serdev_device_driver_unregister(&ssam_serial_hub);
 	ssam_event_item_cache_destroy();
 	ssh_ctrl_packet_cache_destroy();
+	ssam_bus_unregister();
 }
 module_exit(ssam_core_exit);
 
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index c425290b21e2..935060955152 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -846,4 +846,22 @@ struct auxiliary_device_id {
 	kernel_ulong_t driver_data;
 };
 
+/* Surface System Aggregator Module */
+
+#define SSAM_MATCH_TARGET	0x1
+#define SSAM_MATCH_INSTANCE	0x2
+#define SSAM_MATCH_FUNCTION	0x4
+
+struct ssam_device_id {
+	__u8 match_flags;
+
+	__u8 domain;
+	__u8 category;
+	__u8 target;
+	__u8 instance;
+	__u8 function;
+
+	kernel_ulong_t driver_data;
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
new file mode 100644
index 000000000000..02f3e06c0a60
--- /dev/null
+++ b/include/linux/surface_aggregator/device.h
@@ -0,0 +1,423 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Surface System Aggregator Module (SSAM) bus and client-device subsystem.
+ *
+ * Main interface for the surface-aggregator bus, surface-aggregator client
+ * devices, and respective drivers building on top of the SSAM controller.
+ * Provides support for non-platform/non-ACPI SSAM clients via dedicated
+ * subsystem.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _LINUX_SURFACE_AGGREGATOR_DEVICE_H
+#define _LINUX_SURFACE_AGGREGATOR_DEVICE_H
+
+#include <linux/device.h>
+#include <linux/mod_devicetable.h>
+#include <linux/types.h>
+
+#include <linux/surface_aggregator/controller.h>
+
+
+/* -- Surface System Aggregator Module bus. --------------------------------- */
+
+/**
+ * enum ssam_device_domain - SAM device domain.
+ * @SSAM_DOMAIN_VIRTUAL:   Virtual device.
+ * @SSAM_DOMAIN_SERIALHUB: Physical device connected via Surface Serial Hub.
+ */
+enum ssam_device_domain {
+	SSAM_DOMAIN_VIRTUAL   = 0x00,
+	SSAM_DOMAIN_SERIALHUB = 0x01,
+};
+
+/**
+ * enum ssam_virtual_tc - Target categories for the virtual SAM domain.
+ * @SSAM_VIRTUAL_TC_HUB: Device hub category.
+ */
+enum ssam_virtual_tc {
+	SSAM_VIRTUAL_TC_HUB = 0x00,
+};
+
+/**
+ * struct ssam_device_uid - Unique identifier for SSAM device.
+ * @domain:   Domain of the device.
+ * @category: Target category of the device.
+ * @target:   Target ID of the device.
+ * @instance: Instance ID of the device.
+ * @function: Sub-function of the device. This field can be used to split a
+ *            single SAM device into multiple virtual subdevices to separate
+ *            different functionality of that device and allow one driver per
+ *            such functionality.
+ */
+struct ssam_device_uid {
+	u8 domain;
+	u8 category;
+	u8 target;
+	u8 instance;
+	u8 function;
+};
+
+/*
+ * Special values for device matching.
+ *
+ * These values are intended to be used with SSAM_DEVICE(), SSAM_VDEV(), and
+ * SSAM_SDEV() exclusively. Specifically, they are used to initialize the
+ * match_flags member of the device ID structure. Do not use them directly
+ * with struct ssam_device_id or struct ssam_device_uid.
+ */
+#define SSAM_ANY_TID		0xffff
+#define SSAM_ANY_IID		0xffff
+#define SSAM_ANY_FUN		0xffff
+
+/**
+ * SSAM_DEVICE() - Initialize a &struct ssam_device_id with the given
+ * parameters.
+ * @d:   Domain of the device.
+ * @cat: Target category of the device.
+ * @tid: Target ID of the device.
+ * @iid: Instance ID of the device.
+ * @fun: Sub-function of the device.
+ *
+ * Initializes a &struct ssam_device_id with the given parameters. See &struct
+ * ssam_device_uid for details regarding the parameters. The special values
+ * %SSAM_ANY_TID, %SSAM_ANY_IID, and %SSAM_ANY_FUN can be used to specify that
+ * matching should ignore target ID, instance ID, and/or sub-function,
+ * respectively. This macro initializes the ``match_flags`` field based on the
+ * given parameters.
+ *
+ * Note: The parameters @d and @cat must be valid &u8 values, the parameters
+ * @tid, @iid, and @fun must be either valid &u8 values or %SSAM_ANY_TID,
+ * %SSAM_ANY_IID, or %SSAM_ANY_FUN, respectively. Other non-&u8 values are not
+ * allowed.
+ */
+#define SSAM_DEVICE(d, cat, tid, iid, fun)					\
+	.match_flags = (((tid) != SSAM_ANY_TID) ? SSAM_MATCH_TARGET : 0)	\
+		     | (((iid) != SSAM_ANY_IID) ? SSAM_MATCH_INSTANCE : 0)	\
+		     | (((fun) != SSAM_ANY_FUN) ? SSAM_MATCH_FUNCTION : 0),	\
+	.domain   = d,								\
+	.category = cat,							\
+	.target   = ((tid) != SSAM_ANY_TID) ? (tid) : 0,			\
+	.instance = ((iid) != SSAM_ANY_IID) ? (iid) : 0,			\
+	.function = ((fun) != SSAM_ANY_FUN) ? (fun) : 0				\
+
+/**
+ * SSAM_VDEV() - Initialize a &struct ssam_device_id as virtual device with
+ * the given parameters.
+ * @cat: Target category of the device.
+ * @tid: Target ID of the device.
+ * @iid: Instance ID of the device.
+ * @fun: Sub-function of the device.
+ *
+ * Initializes a &struct ssam_device_id with the given parameters in the
+ * virtual domain. See &struct ssam_device_uid for details regarding the
+ * parameters. The special values %SSAM_ANY_TID, %SSAM_ANY_IID, and
+ * %SSAM_ANY_FUN can be used to specify that matching should ignore target ID,
+ * instance ID, and/or sub-function, respectively. This macro initializes the
+ * ``match_flags`` field based on the given parameters.
+ *
+ * Note: The parameter @cat must be a valid &u8 value, the parameters @tid,
+ * @iid, and @fun must be either valid &u8 values or %SSAM_ANY_TID,
+ * %SSAM_ANY_IID, or %SSAM_ANY_FUN, respectively. Other non-&u8 values are not
+ * allowed.
+ */
+#define SSAM_VDEV(cat, tid, iid, fun) \
+	SSAM_DEVICE(SSAM_DOMAIN_VIRTUAL, SSAM_VIRTUAL_TC_##cat, tid, iid, fun)
+
+/**
+ * SSAM_SDEV() - Initialize a &struct ssam_device_id as physical SSH device
+ * with the given parameters.
+ * @cat: Target category of the device.
+ * @tid: Target ID of the device.
+ * @iid: Instance ID of the device.
+ * @fun: Sub-function of the device.
+ *
+ * Initializes a &struct ssam_device_id with the given parameters in the SSH
+ * domain. See &struct ssam_device_uid for details regarding the parameters.
+ * The special values %SSAM_ANY_TID, %SSAM_ANY_IID, and %SSAM_ANY_FUN can be
+ * used to specify that matching should ignore target ID, instance ID, and/or
+ * sub-function, respectively. This macro initializes the ``match_flags``
+ * field based on the given parameters.
+ *
+ * Note: The parameter @cat must be a valid &u8 value, the parameters @tid,
+ * @iid, and @fun must be either valid &u8 values or %SSAM_ANY_TID,
+ * %SSAM_ANY_IID, or %SSAM_ANY_FUN, respectively. Other non-&u8 values are not
+ * allowed.
+ */
+#define SSAM_SDEV(cat, tid, iid, fun) \
+	SSAM_DEVICE(SSAM_DOMAIN_SERIALHUB, SSAM_SSH_TC_##cat, tid, iid, fun)
+
+/**
+ * struct ssam_device - SSAM client device.
+ * @dev:  Driver model representation of the device.
+ * @ctrl: SSAM controller managing this device.
+ * @uid:  UID identifying the device.
+ */
+struct ssam_device {
+	struct device dev;
+	struct ssam_controller *ctrl;
+
+	struct ssam_device_uid uid;
+};
+
+/**
+ * struct ssam_device_driver - SSAM client device driver.
+ * @driver:      Base driver model structure.
+ * @match_table: Match table specifying which devices the driver should bind to.
+ * @probe:       Called when the driver is being bound to a device.
+ * @remove:      Called when the driver is being unbound from the device.
+ */
+struct ssam_device_driver {
+	struct device_driver driver;
+
+	const struct ssam_device_id *match_table;
+
+	int  (*probe)(struct ssam_device *sdev);
+	void (*remove)(struct ssam_device *sdev);
+};
+
+extern struct bus_type ssam_bus_type;
+extern const struct device_type ssam_device_type;
+
+/**
+ * is_ssam_device() - Check if the given device is a SSAM client device.
+ * @d: The device to test the type of.
+ *
+ * Return: Returns %true if the specified device is of type &struct
+ * ssam_device, i.e. the device type points to %ssam_device_type, and %false
+ * otherwise.
+ */
+static inline bool is_ssam_device(struct device *d)
+{
+	return d->type == &ssam_device_type;
+}
+
+/**
+ * to_ssam_device() - Casts the given device to a SSAM client device.
+ * @d: The device to cast.
+ *
+ * Casts the given &struct device to a &struct ssam_device. The caller has to
+ * ensure that the given device is actually enclosed in a &struct ssam_device,
+ * e.g. by calling is_ssam_device().
+ *
+ * Return: Returns a pointer to the &struct ssam_device wrapping the given
+ * device @d.
+ */
+static inline struct ssam_device *to_ssam_device(struct device *d)
+{
+	return container_of(d, struct ssam_device, dev);
+}
+
+/**
+ * to_ssam_device_driver() - Casts the given device driver to a SSAM client
+ * device driver.
+ * @d: The driver to cast.
+ *
+ * Casts the given &struct device_driver to a &struct ssam_device_driver. The
+ * caller has to ensure that the given driver is actually enclosed in a
+ * &struct ssam_device_driver.
+ *
+ * Return: Returns the pointer to the &struct ssam_device_driver wrapping the
+ * given device driver @d.
+ */
+static inline
+struct ssam_device_driver *to_ssam_device_driver(struct device_driver *d)
+{
+	return container_of(d, struct ssam_device_driver, driver);
+}
+
+const struct ssam_device_id *ssam_device_id_match(const struct ssam_device_id *table,
+						  const struct ssam_device_uid uid);
+
+const struct ssam_device_id *ssam_device_get_match(const struct ssam_device *dev);
+
+const void *ssam_device_get_match_data(const struct ssam_device *dev);
+
+struct ssam_device *ssam_device_alloc(struct ssam_controller *ctrl,
+				      struct ssam_device_uid uid);
+
+int ssam_device_add(struct ssam_device *sdev);
+void ssam_device_remove(struct ssam_device *sdev);
+
+/**
+ * ssam_device_get() - Increment reference count of SSAM client device.
+ * @sdev: The device to increment the reference count of.
+ *
+ * Increments the reference count of the given SSAM client device by
+ * incrementing the reference count of the enclosed &struct device via
+ * get_device().
+ *
+ * See ssam_device_put() for the counter-part of this function.
+ *
+ * Return: Returns the device provided as input.
+ */
+static inline struct ssam_device *ssam_device_get(struct ssam_device *sdev)
+{
+	return sdev ? to_ssam_device(get_device(&sdev->dev)) : NULL;
+}
+
+/**
+ * ssam_device_put() - Decrement reference count of SSAM client device.
+ * @sdev: The device to decrement the reference count of.
+ *
+ * Decrements the reference count of the given SSAM client device by
+ * decrementing the reference count of the enclosed &struct device via
+ * put_device().
+ *
+ * See ssam_device_get() for the counter-part of this function.
+ */
+static inline void ssam_device_put(struct ssam_device *sdev)
+{
+	if (sdev)
+		put_device(&sdev->dev);
+}
+
+/**
+ * ssam_device_get_drvdata() - Get driver-data of SSAM client device.
+ * @sdev: The device to get the driver-data from.
+ *
+ * Return: Returns the driver-data of the given device, previously set via
+ * ssam_device_set_drvdata().
+ */
+static inline void *ssam_device_get_drvdata(struct ssam_device *sdev)
+{
+	return dev_get_drvdata(&sdev->dev);
+}
+
+/**
+ * ssam_device_set_drvdata() - Set driver-data of SSAM client device.
+ * @sdev: The device to set the driver-data of.
+ * @data: The data to set the device's driver-data pointer to.
+ */
+static inline void ssam_device_set_drvdata(struct ssam_device *sdev, void *data)
+{
+	dev_set_drvdata(&sdev->dev, data);
+}
+
+int __ssam_device_driver_register(struct ssam_device_driver *d, struct module *o);
+void ssam_device_driver_unregister(struct ssam_device_driver *d);
+
+/**
+ * ssam_device_driver_register() - Register a SSAM client device driver.
+ * @drv: The driver to register.
+ */
+#define ssam_device_driver_register(drv) \
+	__ssam_device_driver_register(drv, THIS_MODULE)
+
+/**
+ * module_ssam_device_driver() - Helper macro for SSAM device driver
+ * registration.
+ * @drv: The driver managed by this module.
+ *
+ * Helper macro to register a SSAM device driver via module_init() and
+ * module_exit(). This macro may only be used once per module and replaces the
+ * aforementioned definitions.
+ */
+#define module_ssam_device_driver(drv)			\
+	module_driver(drv, ssam_device_driver_register,	\
+		      ssam_device_driver_unregister)
+
+
+/* -- Helpers for client-device requests. ----------------------------------- */
+
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_CL_N() - Define synchronous client-device SAM
+ * request function with neither argument nor return value.
+ * @name: Name of the generated function.
+ * @spec: Specification (&struct ssam_request_spec_md) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by
+ * @spec, with the request having neither argument nor return value. Device
+ * specifying parameters are not hard-coded, but instead are provided via the
+ * client device, specifically its UID, supplied when calling this function.
+ * The generated function takes care of setting up the request struct, buffer
+ * allocation, as well as execution of the request itself, returning once the
+ * request has been fully completed. The required transport buffer will be
+ * allocated on the stack.
+ *
+ * The generated function is defined as ``int name(struct ssam_device *sdev)``,
+ * returning the status of the request, which is zero on success and negative
+ * on failure. The ``sdev`` parameter specifies both the target device of the
+ * request and by association the controller via which the request is sent.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_CL_N(name, spec...)			\
+	SSAM_DEFINE_SYNC_REQUEST_MD_N(__raw_##name, spec)		\
+	int name(struct ssam_device *sdev)				\
+	{								\
+		return __raw_##name(sdev->ctrl, sdev->uid.target,	\
+				    sdev->uid.instance);		\
+	}
+
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_CL_W() - Define synchronous client-device SAM
+ * request function with argument.
+ * @name:  Name of the generated function.
+ * @atype: Type of the request's argument.
+ * @spec:  Specification (&struct ssam_request_spec_md) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by
+ * @spec, with the request taking an argument of type @atype and having no
+ * return value. Device specifying parameters are not hard-coded, but instead
+ * are provided via the client device, specifically its UID, supplied when
+ * calling this function. The generated function takes care of setting up the
+ * request struct, buffer allocation, as well as execution of the request
+ * itself, returning once the request has been fully completed. The required
+ * transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``int name(struct ssam_device *sdev,
+ * const atype *arg)``, returning the status of the request, which is zero on
+ * success and negative on failure. The ``sdev`` parameter specifies both the
+ * target device of the request and by association the controller via which
+ * the request is sent. The request's argument is specified via the ``arg``
+ * pointer.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_CL_W(name, atype, spec...)		\
+	SSAM_DEFINE_SYNC_REQUEST_MD_W(__raw_##name, atype, spec)	\
+	int name(struct ssam_device *sdev, const atype *arg)		\
+	{								\
+		return __raw_##name(sdev->ctrl, sdev->uid.target,	\
+				    sdev->uid.instance, arg);		\
+	}
+
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_CL_R() - Define synchronous client-device SAM
+ * request function with return value.
+ * @name:  Name of the generated function.
+ * @rtype: Type of the request's return value.
+ * @spec:  Specification (&struct ssam_request_spec_md) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by
+ * @spec, with the request taking no argument but having a return value of
+ * type @rtype. Device specifying parameters are not hard-coded, but instead
+ * are provided via the client device, specifically its UID, supplied when
+ * calling this function. The generated function takes care of setting up the
+ * request struct, buffer allocation, as well as execution of the request
+ * itself, returning once the request has been fully completed. The required
+ * transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``int name(struct ssam_device *sdev,
+ * rtype *ret)``, returning the status of the request, which is zero on
+ * success and negative on failure. The ``sdev`` parameter specifies both the
+ * target device of the request and by association the controller via which
+ * the request is sent. The request's return value is written to the memory
+ * pointed to by the ``ret`` parameter.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_CL_R(name, rtype, spec...)		\
+	SSAM_DEFINE_SYNC_REQUEST_MD_R(__raw_##name, rtype, spec)	\
+	int name(struct ssam_device *sdev, rtype *ret)			\
+	{								\
+		return __raw_##name(sdev->ctrl, sdev->uid.target,	\
+				    sdev->uid.instance, ret);		\
+	}
+
+#endif /* _LINUX_SURFACE_AGGREGATOR_DEVICE_H */
diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c
index e377f52dbfa3..f078eeb0a961 100644
--- a/scripts/mod/devicetable-offsets.c
+++ b/scripts/mod/devicetable-offsets.c
@@ -246,5 +246,13 @@ int main(void)
 	DEVID(auxiliary_device_id);
 	DEVID_FIELD(auxiliary_device_id, name);
 
+	DEVID(ssam_device_id);
+	DEVID_FIELD(ssam_device_id, match_flags);
+	DEVID_FIELD(ssam_device_id, domain);
+	DEVID_FIELD(ssam_device_id, category);
+	DEVID_FIELD(ssam_device_id, target);
+	DEVID_FIELD(ssam_device_id, instance);
+	DEVID_FIELD(ssam_device_id, function);
+
 	return 0;
 }
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index fb4827027536..d21d2871387b 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -1375,6 +1375,28 @@ static int do_auxiliary_entry(const char *filename, void *symval, char *alias)
 	return 1;
 }
 
+/*
+ * Looks like: ssam:dNcNtNiNfN
+ *
+ * N is exactly 2 digits, where each is an upper-case hex digit.
+ */
+static int do_ssam_entry(const char *filename, void *symval, char *alias)
+{
+	DEF_FIELD(symval, ssam_device_id, match_flags);
+	DEF_FIELD(symval, ssam_device_id, domain);
+	DEF_FIELD(symval, ssam_device_id, category);
+	DEF_FIELD(symval, ssam_device_id, target);
+	DEF_FIELD(symval, ssam_device_id, instance);
+	DEF_FIELD(symval, ssam_device_id, function);
+
+	sprintf(alias, "ssam:d%02Xc%02X", domain, category);
+	ADD(alias, "t", match_flags & SSAM_MATCH_TARGET, target);
+	ADD(alias, "i", match_flags & SSAM_MATCH_INSTANCE, instance);
+	ADD(alias, "f", match_flags & SSAM_MATCH_FUNCTION, function);
+
+	return 1;
+}
+
 /* Does namelen bytes of name exactly match the symbol? */
 static bool sym_is(const char *name, unsigned namelen, const char *symbol)
 {
@@ -1450,6 +1472,7 @@ static const struct devtable devtable[] = {
 	{"wmi", SIZE_wmi_device_id, do_wmi_entry},
 	{"mhi", SIZE_mhi_device_id, do_mhi_entry},
 	{"auxiliary", SIZE_auxiliary_device_id, do_auxiliary_entry},
+	{"ssam", SIZE_ssam_device_id, do_ssam_entry},
 };
 
 /* Create MODULE_ALIAS() statements.
-- 
cgit v1.2.3


From fc00bc8ac1dada4085f9308f85f2d6359da0faa8 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Mon, 21 Dec 2020 19:39:59 +0100
Subject: platform/surface: Add Surface ACPI Notify driver

The Surface ACPI Notify (SAN) device provides an ACPI interface to the
Surface Aggregator EC, specifically the Surface Serial Hub interface.
This interface allows EC requests to be made from ACPI code and can
convert a subset of EC events back to ACPI notifications.

Specifically, this interface provides a GenericSerialBus operation
region ACPI code can execute a request by writing the request command
data and payload to this operation region and reading back the
corresponding response via a write-then-read operation. Furthermore,
this interface provides a _DSM method to be called when certain events
from the EC have been received, essentially turning them into ACPI
notifications.

The driver provided in this commit essentially takes care of translating
the request data written to the operation region, executing the request,
waiting for it to finish, and finally writing and translating back the
response (if the request has one). Furthermore, this driver takes care
of enabling the events handled via ACPI _DSM calls. Lastly, this driver
also exposes an interface providing discrete GPU (dGPU) power-on
notifications on the Surface Book 2, which are also received via the
operation region interface (but not handled by the SAN driver directly),
making them accessible to other drivers (such as a dGPU hot-plug driver
that may be added later on).

On 5th and 6th generation Surface devices (Surface Pro 5/2017, Pro 6,
Book 2, Laptop 1 and 2), the SAN interface provides full battery and
thermal subsystem access, as well as other EC based functionality. On
those models, battery and thermal sensor devices are implemented as
standard ACPI devices of that type, however, forward ACPI calls to the
corresponding Surface Aggregator EC request via the SAN interface and
receive corresponding notifications (e.g. battery information change)
from it. This interface is therefore required to provide said
functionality on those devices.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20201221183959.1186143-10-luzmaximilian@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 .../surface_aggregator/clients/index.rst           |   1 +
 .../driver-api/surface_aggregator/clients/san.rst  |  44 +
 MAINTAINERS                                        |   2 +
 drivers/platform/surface/Kconfig                   |  19 +
 drivers/platform/surface/Makefile                  |   1 +
 drivers/platform/surface/surface_acpi_notify.c     | 886 +++++++++++++++++++++
 include/linux/surface_acpi_notify.h                |  39 +
 7 files changed, 992 insertions(+)
 create mode 100644 Documentation/driver-api/surface_aggregator/clients/san.rst
 create mode 100644 drivers/platform/surface/surface_acpi_notify.c
 create mode 100644 include/linux/surface_acpi_notify.h

(limited to 'include/linux')

diff --git a/Documentation/driver-api/surface_aggregator/clients/index.rst b/Documentation/driver-api/surface_aggregator/clients/index.rst
index ab260ec82cfb..3ccabce23271 100644
--- a/Documentation/driver-api/surface_aggregator/clients/index.rst
+++ b/Documentation/driver-api/surface_aggregator/clients/index.rst
@@ -11,6 +11,7 @@ This is the documentation for client drivers themselves. Refer to
    :maxdepth: 1
 
    cdev
+   san
 
 .. only::  subproject and html
 
diff --git a/Documentation/driver-api/surface_aggregator/clients/san.rst b/Documentation/driver-api/surface_aggregator/clients/san.rst
new file mode 100644
index 000000000000..38c2580e7758
--- /dev/null
+++ b/Documentation/driver-api/surface_aggregator/clients/san.rst
@@ -0,0 +1,44 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+.. |san_client_link| replace:: :c:func:`san_client_link`
+.. |san_dgpu_notifier_register| replace:: :c:func:`san_dgpu_notifier_register`
+.. |san_dgpu_notifier_unregister| replace:: :c:func:`san_dgpu_notifier_unregister`
+
+===================
+Surface ACPI Notify
+===================
+
+The Surface ACPI Notify (SAN) device provides the bridge between ACPI and
+SAM controller. Specifically, ACPI code can execute requests and handle
+battery and thermal events via this interface. In addition to this, events
+relating to the discrete GPU (dGPU) of the Surface Book 2 can be sent from
+ACPI code (note: the Surface Book 3 uses a different method for this). The
+only currently known event sent via this interface is a dGPU power-on
+notification. While this driver handles the former part internally, it only
+relays the dGPU events to any other driver interested via its public API and
+does not handle them.
+
+The public interface of this driver is split into two parts: Client
+registration and notifier-block registration.
+
+A client to the SAN interface can be linked as consumer to the SAN device
+via |san_client_link|. This can be used to ensure that the a client
+receiving dGPU events does not miss any events due to the SAN interface not
+being set up as this forces the client driver to unbind once the SAN driver
+is unbound.
+
+Notifier-blocks can be registered by any device for as long as the module is
+loaded, regardless of being linked as client or not. Registration is done
+with |san_dgpu_notifier_register|. If the notifier is not needed any more, it
+should be unregistered via |san_dgpu_notifier_unregister|.
+
+Consult the API documentation below for more details.
+
+
+API Documentation
+=================
+
+.. kernel-doc:: include/linux/surface_acpi_notify.h
+
+.. kernel-doc:: drivers/platform/surface/surface_acpi_notify.c
+    :export:
diff --git a/MAINTAINERS b/MAINTAINERS
index 9bd55c842898..cf250c1365b0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11820,7 +11820,9 @@ W:	https://github.com/linux-surface/surface-aggregator-module
 C:	irc://chat.freenode.net/##linux-surface
 F:	Documentation/driver-api/surface_aggregator/
 F:	drivers/platform/surface/aggregator/
+F:	drivers/platform/surface/surface_acpi_notify.c
 F:	drivers/platform/surface/surface_aggregator_cdev.c
+F:	include/linux/surface_acpi_notify.h
 F:	include/linux/surface_aggregator/
 F:	include/uapi/linux/surface_aggregator/
 
diff --git a/drivers/platform/surface/Kconfig b/drivers/platform/surface/Kconfig
index 988b2de6b500..83b0a4c7b352 100644
--- a/drivers/platform/surface/Kconfig
+++ b/drivers/platform/surface/Kconfig
@@ -41,6 +41,25 @@ config SURFACE_3_POWER_OPREGION
 	  This driver provides support for ACPI operation
 	  region of the Surface 3 battery platform driver.
 
+config SURFACE_ACPI_NOTIFY
+	tristate "Surface ACPI Notify Driver"
+	depends on SURFACE_AGGREGATOR
+	help
+	  Surface ACPI Notify (SAN) driver for Microsoft Surface devices.
+
+	  This driver provides support for the ACPI interface (called SAN) of
+	  the Surface System Aggregator Module (SSAM) EC. This interface is used
+	  on 5th- and 6th-generation Microsoft Surface devices (including
+	  Surface Pro 5 and 6, Surface Book 2, Surface Laptops 1 and 2, and in
+	  reduced functionality on the Surface Laptop 3) to execute SSAM
+	  requests directly from ACPI code, as well as receive SSAM events and
+	  turn them into ACPI notifications. It essentially acts as a
+	  translation layer between the SSAM controller and ACPI.
+
+	  Specifically, this driver may be needed for battery status reporting,
+	  thermal sensor access, and real-time clock information, depending on
+	  the Surface device in question.
+
 config SURFACE_AGGREGATOR_CDEV
 	tristate "Surface System Aggregator Module User-Space Interface"
 	depends on SURFACE_AGGREGATOR
diff --git a/drivers/platform/surface/Makefile b/drivers/platform/surface/Makefile
index 161f0ad05795..3eb971006877 100644
--- a/drivers/platform/surface/Makefile
+++ b/drivers/platform/surface/Makefile
@@ -7,6 +7,7 @@
 obj-$(CONFIG_SURFACE3_WMI)		+= surface3-wmi.o
 obj-$(CONFIG_SURFACE_3_BUTTON)		+= surface3_button.o
 obj-$(CONFIG_SURFACE_3_POWER_OPREGION)	+= surface3_power.o
+obj-$(CONFIG_SURFACE_ACPI_NOTIFY)	+= surface_acpi_notify.o
 obj-$(CONFIG_SURFACE_AGGREGATOR)	+= aggregator/
 obj-$(CONFIG_SURFACE_AGGREGATOR_CDEV)	+= surface_aggregator_cdev.o
 obj-$(CONFIG_SURFACE_GPE)		+= surface_gpe.o
diff --git a/drivers/platform/surface/surface_acpi_notify.c b/drivers/platform/surface/surface_acpi_notify.c
new file mode 100644
index 000000000000..8cd67a669c86
--- /dev/null
+++ b/drivers/platform/surface/surface_acpi_notify.c
@@ -0,0 +1,886 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Driver for the Surface ACPI Notify (SAN) interface/shim.
+ *
+ * Translates communication from ACPI to Surface System Aggregator Module
+ * (SSAM/SAM) requests and back, specifically SAM-over-SSH. Translates SSAM
+ * events back to ACPI notifications. Allows handling of discrete GPU
+ * notifications sent from ACPI via the SAN interface by providing them to any
+ * registered external driver.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#include <asm/unaligned.h>
+#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/platform_device.h>
+#include <linux/rwsem.h>
+
+#include <linux/surface_aggregator/controller.h>
+#include <linux/surface_acpi_notify.h>
+
+struct san_data {
+	struct device *dev;
+	struct ssam_controller *ctrl;
+
+	struct acpi_connection_info info;
+
+	struct ssam_event_notifier nf_bat;
+	struct ssam_event_notifier nf_tmp;
+};
+
+#define to_san_data(ptr, member) \
+	container_of(ptr, struct san_data, member)
+
+
+/* -- dGPU notifier interface. ---------------------------------------------- */
+
+struct san_rqsg_if {
+	struct rw_semaphore lock;
+	struct device *dev;
+	struct blocking_notifier_head nh;
+};
+
+static struct san_rqsg_if san_rqsg_if = {
+	.lock = __RWSEM_INITIALIZER(san_rqsg_if.lock),
+	.dev = NULL,
+	.nh = BLOCKING_NOTIFIER_INIT(san_rqsg_if.nh),
+};
+
+static int san_set_rqsg_interface_device(struct device *dev)
+{
+	int status = 0;
+
+	down_write(&san_rqsg_if.lock);
+	if (!san_rqsg_if.dev && dev)
+		san_rqsg_if.dev = dev;
+	else
+		status = -EBUSY;
+	up_write(&san_rqsg_if.lock);
+
+	return status;
+}
+
+/**
+ * san_client_link() - Link client as consumer to SAN device.
+ * @client: The client to link.
+ *
+ * Sets up a device link between the provided client device as consumer and
+ * the SAN device as provider. This function can be used to ensure that the
+ * SAN interface has been set up and will be set up for as long as the driver
+ * of the client device is bound. This guarantees that, during that time, all
+ * dGPU events will be received by any registered notifier.
+ *
+ * The link will be automatically removed once the client device's driver is
+ * unbound.
+ *
+ * Return: Returns zero on success, %-ENXIO if the SAN interface has not been
+ * set up yet, and %-ENOMEM if device link creation failed.
+ */
+int san_client_link(struct device *client)
+{
+	const u32 flags = DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_CONSUMER;
+	struct device_link *link;
+
+	down_read(&san_rqsg_if.lock);
+
+	if (!san_rqsg_if.dev) {
+		up_read(&san_rqsg_if.lock);
+		return -ENXIO;
+	}
+
+	link = device_link_add(client, san_rqsg_if.dev, flags);
+	if (!link) {
+		up_read(&san_rqsg_if.lock);
+		return -ENOMEM;
+	}
+
+	if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND) {
+		up_read(&san_rqsg_if.lock);
+		return -ENXIO;
+	}
+
+	up_read(&san_rqsg_if.lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(san_client_link);
+
+/**
+ * san_dgpu_notifier_register() - Register a SAN dGPU notifier.
+ * @nb: The notifier-block to register.
+ *
+ * Registers a SAN dGPU notifier, receiving any new SAN dGPU events sent from
+ * ACPI. The registered notifier will be called with &struct san_dgpu_event
+ * as notifier data and the command ID of that event as notifier action.
+ */
+int san_dgpu_notifier_register(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&san_rqsg_if.nh, nb);
+}
+EXPORT_SYMBOL_GPL(san_dgpu_notifier_register);
+
+/**
+ * san_dgpu_notifier_unregister() - Unregister a SAN dGPU notifier.
+ * @nb: The notifier-block to unregister.
+ */
+int san_dgpu_notifier_unregister(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&san_rqsg_if.nh, nb);
+}
+EXPORT_SYMBOL_GPL(san_dgpu_notifier_unregister);
+
+static int san_dgpu_notifier_call(struct san_dgpu_event *evt)
+{
+	int ret;
+
+	ret = blocking_notifier_call_chain(&san_rqsg_if.nh, evt->command, evt);
+	return notifier_to_errno(ret);
+}
+
+
+/* -- ACPI _DSM event relay. ------------------------------------------------ */
+
+#define SAN_DSM_REVISION	0
+
+/* 93b666c5-70c6-469f-a215-3d487c91ab3c */
+static const guid_t SAN_DSM_UUID =
+	GUID_INIT(0x93b666c5, 0x70c6, 0x469f, 0xa2, 0x15, 0x3d,
+		  0x48, 0x7c, 0x91, 0xab, 0x3c);
+
+enum san_dsm_event_fn {
+	SAN_DSM_EVENT_FN_BAT1_STAT = 0x03,
+	SAN_DSM_EVENT_FN_BAT1_INFO = 0x04,
+	SAN_DSM_EVENT_FN_ADP1_STAT = 0x05,
+	SAN_DSM_EVENT_FN_ADP1_INFO = 0x06,
+	SAN_DSM_EVENT_FN_BAT2_STAT = 0x07,
+	SAN_DSM_EVENT_FN_BAT2_INFO = 0x08,
+	SAN_DSM_EVENT_FN_THERMAL   = 0x09,
+	SAN_DSM_EVENT_FN_DPTF      = 0x0a,
+};
+
+enum sam_event_cid_bat {
+	SAM_EVENT_CID_BAT_BIX  = 0x15,
+	SAM_EVENT_CID_BAT_BST  = 0x16,
+	SAM_EVENT_CID_BAT_ADP  = 0x17,
+	SAM_EVENT_CID_BAT_PROT = 0x18,
+	SAM_EVENT_CID_BAT_DPTF = 0x4f,
+};
+
+enum sam_event_cid_tmp {
+	SAM_EVENT_CID_TMP_TRIP = 0x0b,
+};
+
+struct san_event_work {
+	struct delayed_work work;
+	struct device *dev;
+	struct ssam_event event;	/* must be last */
+};
+
+static int san_acpi_notify_event(struct device *dev, u64 func,
+				 union acpi_object *param)
+{
+	acpi_handle san = ACPI_HANDLE(dev);
+	union acpi_object *obj;
+	int status = 0;
+
+	if (!acpi_check_dsm(san, &SAN_DSM_UUID, SAN_DSM_REVISION, 1 << func))
+		return 0;
+
+	dev_dbg(dev, "notify event %#04llx\n", func);
+
+	obj = acpi_evaluate_dsm_typed(san, &SAN_DSM_UUID, SAN_DSM_REVISION,
+				      func, param, ACPI_TYPE_BUFFER);
+	if (!obj)
+		return -EFAULT;
+
+	if (obj->buffer.length != 1 || obj->buffer.pointer[0] != 0) {
+		dev_err(dev, "got unexpected result from _DSM\n");
+		status = -EPROTO;
+	}
+
+	ACPI_FREE(obj);
+	return status;
+}
+
+static int san_evt_bat_adp(struct device *dev, const struct ssam_event *event)
+{
+	int status;
+
+	status = san_acpi_notify_event(dev, SAN_DSM_EVENT_FN_ADP1_STAT, NULL);
+	if (status)
+		return status;
+
+	/*
+	 * Ensure that the battery states get updated correctly. When the
+	 * battery is fully charged and an adapter is plugged in, it sometimes
+	 * is not updated correctly, instead showing it as charging.
+	 * Explicitly trigger battery updates to fix this.
+	 */
+
+	status = san_acpi_notify_event(dev, SAN_DSM_EVENT_FN_BAT1_STAT, NULL);
+	if (status)
+		return status;
+
+	return san_acpi_notify_event(dev, SAN_DSM_EVENT_FN_BAT2_STAT, NULL);
+}
+
+static int san_evt_bat_bix(struct device *dev, const struct ssam_event *event)
+{
+	enum san_dsm_event_fn fn;
+
+	if (event->instance_id == 0x02)
+		fn = SAN_DSM_EVENT_FN_BAT2_INFO;
+	else
+		fn = SAN_DSM_EVENT_FN_BAT1_INFO;
+
+	return san_acpi_notify_event(dev, fn, NULL);
+}
+
+static int san_evt_bat_bst(struct device *dev, const struct ssam_event *event)
+{
+	enum san_dsm_event_fn fn;
+
+	if (event->instance_id == 0x02)
+		fn = SAN_DSM_EVENT_FN_BAT2_STAT;
+	else
+		fn = SAN_DSM_EVENT_FN_BAT1_STAT;
+
+	return san_acpi_notify_event(dev, fn, NULL);
+}
+
+static int san_evt_bat_dptf(struct device *dev, const struct ssam_event *event)
+{
+	union acpi_object payload;
+
+	/*
+	 * The Surface ACPI expects a buffer and not a package. It specifically
+	 * checks for ObjectType (Arg3) == 0x03. This will cause a warning in
+	 * acpica/nsarguments.c, but that warning can be safely ignored.
+	 */
+	payload.type = ACPI_TYPE_BUFFER;
+	payload.buffer.length = event->length;
+	payload.buffer.pointer = (u8 *)&event->data[0];
+
+	return san_acpi_notify_event(dev, SAN_DSM_EVENT_FN_DPTF, &payload);
+}
+
+static unsigned long san_evt_bat_delay(u8 cid)
+{
+	switch (cid) {
+	case SAM_EVENT_CID_BAT_ADP:
+		/*
+		 * Wait for battery state to update before signaling adapter
+		 * change.
+		 */
+		return msecs_to_jiffies(5000);
+
+	case SAM_EVENT_CID_BAT_BST:
+		/* Ensure we do not miss anything important due to caching. */
+		return msecs_to_jiffies(2000);
+
+	default:
+		return 0;
+	}
+}
+
+static bool san_evt_bat(const struct ssam_event *event, struct device *dev)
+{
+	int status;
+
+	switch (event->command_id) {
+	case SAM_EVENT_CID_BAT_BIX:
+		status = san_evt_bat_bix(dev, event);
+		break;
+
+	case SAM_EVENT_CID_BAT_BST:
+		status = san_evt_bat_bst(dev, event);
+		break;
+
+	case SAM_EVENT_CID_BAT_ADP:
+		status = san_evt_bat_adp(dev, event);
+		break;
+
+	case SAM_EVENT_CID_BAT_PROT:
+		/*
+		 * TODO: Implement support for battery protection status change
+		 *       event.
+		 */
+		return true;
+
+	case SAM_EVENT_CID_BAT_DPTF:
+		status = san_evt_bat_dptf(dev, event);
+		break;
+
+	default:
+		return false;
+	}
+
+	if (status) {
+		dev_err(dev, "error handling power event (cid = %#04x)\n",
+			event->command_id);
+	}
+
+	return true;
+}
+
+static void san_evt_bat_workfn(struct work_struct *work)
+{
+	struct san_event_work *ev;
+
+	ev = container_of(work, struct san_event_work, work.work);
+	san_evt_bat(&ev->event, ev->dev);
+	kfree(ev);
+}
+
+static u32 san_evt_bat_nf(struct ssam_event_notifier *nf,
+			  const struct ssam_event *event)
+{
+	struct san_data *d = to_san_data(nf, nf_bat);
+	struct san_event_work *work;
+	unsigned long delay = san_evt_bat_delay(event->command_id);
+
+	if (delay == 0)
+		return san_evt_bat(event, d->dev) ? SSAM_NOTIF_HANDLED : 0;
+
+	work = kzalloc(sizeof(*work) + event->length, GFP_KERNEL);
+	if (!work)
+		return ssam_notifier_from_errno(-ENOMEM);
+
+	INIT_DELAYED_WORK(&work->work, san_evt_bat_workfn);
+	work->dev = d->dev;
+
+	memcpy(&work->event, event, sizeof(struct ssam_event) + event->length);
+
+	schedule_delayed_work(&work->work, delay);
+	return SSAM_NOTIF_HANDLED;
+}
+
+static int san_evt_tmp_trip(struct device *dev, const struct ssam_event *event)
+{
+	union acpi_object param;
+
+	/*
+	 * The Surface ACPI expects an integer and not a package. This will
+	 * cause a warning in acpica/nsarguments.c, but that warning can be
+	 * safely ignored.
+	 */
+	param.type = ACPI_TYPE_INTEGER;
+	param.integer.value = event->instance_id;
+
+	return san_acpi_notify_event(dev, SAN_DSM_EVENT_FN_THERMAL, &param);
+}
+
+static bool san_evt_tmp(const struct ssam_event *event, struct device *dev)
+{
+	int status;
+
+	switch (event->command_id) {
+	case SAM_EVENT_CID_TMP_TRIP:
+		status = san_evt_tmp_trip(dev, event);
+		break;
+
+	default:
+		return false;
+	}
+
+	if (status) {
+		dev_err(dev, "error handling thermal event (cid = %#04x)\n",
+			event->command_id);
+	}
+
+	return true;
+}
+
+static u32 san_evt_tmp_nf(struct ssam_event_notifier *nf,
+			  const struct ssam_event *event)
+{
+	struct san_data *d = to_san_data(nf, nf_tmp);
+
+	return san_evt_tmp(event, d->dev) ? SSAM_NOTIF_HANDLED : 0;
+}
+
+
+/* -- ACPI GSB OperationRegion handler -------------------------------------- */
+
+struct gsb_data_in {
+	u8 cv;
+} __packed;
+
+struct gsb_data_rqsx {
+	u8 cv;				/* Command value (san_gsb_request_cv). */
+	u8 tc;				/* Target category. */
+	u8 tid;				/* Target ID. */
+	u8 iid;				/* Instance ID. */
+	u8 snc;				/* Expect-response-flag. */
+	u8 cid;				/* Command ID. */
+	u16 cdl;			/* Payload length. */
+	u8 pld[];			/* Payload. */
+} __packed;
+
+struct gsb_data_etwl {
+	u8 cv;				/* Command value (should be 0x02). */
+	u8 etw3;			/* Unknown. */
+	u8 etw4;			/* Unknown. */
+	u8 msg[];			/* Error message (ASCIIZ). */
+} __packed;
+
+struct gsb_data_out {
+	u8 status;			/* _SSH communication status. */
+	u8 len;				/* _SSH payload length. */
+	u8 pld[];			/* _SSH payload. */
+} __packed;
+
+union gsb_buffer_data {
+	struct gsb_data_in   in;	/* Common input. */
+	struct gsb_data_rqsx rqsx;	/* RQSX input. */
+	struct gsb_data_etwl etwl;	/* ETWL input. */
+	struct gsb_data_out  out;	/* Output. */
+};
+
+struct gsb_buffer {
+	u8 status;			/* GSB AttribRawProcess status. */
+	u8 len;				/* GSB AttribRawProcess length. */
+	union gsb_buffer_data data;
+} __packed;
+
+#define SAN_GSB_MAX_RQSX_PAYLOAD  (U8_MAX - 2 - sizeof(struct gsb_data_rqsx))
+#define SAN_GSB_MAX_RESPONSE	  (U8_MAX - 2 - sizeof(struct gsb_data_out))
+
+#define SAN_GSB_COMMAND		0
+
+enum san_gsb_request_cv {
+	SAN_GSB_REQUEST_CV_RQST = 0x01,
+	SAN_GSB_REQUEST_CV_ETWL = 0x02,
+	SAN_GSB_REQUEST_CV_RQSG = 0x03,
+};
+
+#define SAN_REQUEST_NUM_TRIES	5
+
+static acpi_status san_etwl(struct san_data *d, struct gsb_buffer *b)
+{
+	struct gsb_data_etwl *etwl = &b->data.etwl;
+
+	if (b->len < sizeof(struct gsb_data_etwl)) {
+		dev_err(d->dev, "invalid ETWL package (len = %d)\n", b->len);
+		return AE_OK;
+	}
+
+	dev_err(d->dev, "ETWL(%#04x, %#04x): %.*s\n", etwl->etw3, etwl->etw4,
+		(unsigned int)(b->len - sizeof(struct gsb_data_etwl)),
+		(char *)etwl->msg);
+
+	/* Indicate success. */
+	b->status = 0x00;
+	b->len = 0x00;
+
+	return AE_OK;
+}
+
+static
+struct gsb_data_rqsx *san_validate_rqsx(struct device *dev, const char *type,
+					struct gsb_buffer *b)
+{
+	struct gsb_data_rqsx *rqsx = &b->data.rqsx;
+
+	if (b->len < sizeof(struct gsb_data_rqsx)) {
+		dev_err(dev, "invalid %s package (len = %d)\n", type, b->len);
+		return NULL;
+	}
+
+	if (get_unaligned(&rqsx->cdl) != b->len - sizeof(struct gsb_data_rqsx)) {
+		dev_err(dev, "bogus %s package (len = %d, cdl = %d)\n",
+			type, b->len, get_unaligned(&rqsx->cdl));
+		return NULL;
+	}
+
+	if (get_unaligned(&rqsx->cdl) > SAN_GSB_MAX_RQSX_PAYLOAD) {
+		dev_err(dev, "payload for %s package too large (cdl = %d)\n",
+			type, get_unaligned(&rqsx->cdl));
+		return NULL;
+	}
+
+	return rqsx;
+}
+
+static void gsb_rqsx_response_error(struct gsb_buffer *gsb, int status)
+{
+	gsb->status = 0x00;
+	gsb->len = 0x02;
+	gsb->data.out.status = (u8)(-status);
+	gsb->data.out.len = 0x00;
+}
+
+static void gsb_rqsx_response_success(struct gsb_buffer *gsb, u8 *ptr, size_t len)
+{
+	gsb->status = 0x00;
+	gsb->len = len + 2;
+	gsb->data.out.status = 0x00;
+	gsb->data.out.len = len;
+
+	if (len)
+		memcpy(&gsb->data.out.pld[0], ptr, len);
+}
+
+static acpi_status san_rqst_fixup_suspended(struct san_data *d,
+					    struct ssam_request *rqst,
+					    struct gsb_buffer *gsb)
+{
+	if (rqst->target_category == SSAM_SSH_TC_BAS && rqst->command_id == 0x0D) {
+		u8 base_state = 1;
+
+		/* Base state quirk:
+		 * The base state may be queried from ACPI when the EC is still
+		 * suspended. In this case it will return '-EPERM'. This query
+		 * will only be triggered from the ACPI lid GPE interrupt, thus
+		 * we are either in laptop or studio mode (base status 0x01 or
+		 * 0x02). Furthermore, we will only get here if the device (and
+		 * EC) have been suspended.
+		 *
+		 * We now assume that the device is in laptop mode (0x01). This
+		 * has the drawback that it will wake the device when unfolding
+		 * it in studio mode, but it also allows us to avoid actively
+		 * waiting for the EC to wake up, which may incur a notable
+		 * delay.
+		 */
+
+		dev_dbg(d->dev, "rqst: fixup: base-state quirk\n");
+
+		gsb_rqsx_response_success(gsb, &base_state, sizeof(base_state));
+		return AE_OK;
+	}
+
+	gsb_rqsx_response_error(gsb, -ENXIO);
+	return AE_OK;
+}
+
+static acpi_status san_rqst(struct san_data *d, struct gsb_buffer *buffer)
+{
+	u8 rspbuf[SAN_GSB_MAX_RESPONSE];
+	struct gsb_data_rqsx *gsb_rqst;
+	struct ssam_request rqst;
+	struct ssam_response rsp;
+	int status = 0;
+
+	gsb_rqst = san_validate_rqsx(d->dev, "RQST", buffer);
+	if (!gsb_rqst)
+		return AE_OK;
+
+	rqst.target_category = gsb_rqst->tc;
+	rqst.target_id = gsb_rqst->tid;
+	rqst.command_id = gsb_rqst->cid;
+	rqst.instance_id = gsb_rqst->iid;
+	rqst.flags = gsb_rqst->snc ? SSAM_REQUEST_HAS_RESPONSE : 0;
+	rqst.length = get_unaligned(&gsb_rqst->cdl);
+	rqst.payload = &gsb_rqst->pld[0];
+
+	rsp.capacity = ARRAY_SIZE(rspbuf);
+	rsp.length = 0;
+	rsp.pointer = &rspbuf[0];
+
+	/* Handle suspended device. */
+	if (d->dev->power.is_suspended) {
+		dev_warn(d->dev, "rqst: device is suspended, not executing\n");
+		return san_rqst_fixup_suspended(d, &rqst, buffer);
+	}
+
+	status = __ssam_retry(ssam_request_sync_onstack, SAN_REQUEST_NUM_TRIES,
+			      d->ctrl, &rqst, &rsp, SAN_GSB_MAX_RQSX_PAYLOAD);
+
+	if (!status) {
+		gsb_rqsx_response_success(buffer, rsp.pointer, rsp.length);
+	} else {
+		dev_err(d->dev, "rqst: failed with error %d\n", status);
+		gsb_rqsx_response_error(buffer, status);
+	}
+
+	return AE_OK;
+}
+
+static acpi_status san_rqsg(struct san_data *d, struct gsb_buffer *buffer)
+{
+	struct gsb_data_rqsx *gsb_rqsg;
+	struct san_dgpu_event evt;
+	int status;
+
+	gsb_rqsg = san_validate_rqsx(d->dev, "RQSG", buffer);
+	if (!gsb_rqsg)
+		return AE_OK;
+
+	evt.category = gsb_rqsg->tc;
+	evt.target = gsb_rqsg->tid;
+	evt.command = gsb_rqsg->cid;
+	evt.instance = gsb_rqsg->iid;
+	evt.length = get_unaligned(&gsb_rqsg->cdl);
+	evt.payload = &gsb_rqsg->pld[0];
+
+	status = san_dgpu_notifier_call(&evt);
+	if (!status) {
+		gsb_rqsx_response_success(buffer, NULL, 0);
+	} else {
+		dev_err(d->dev, "rqsg: failed with error %d\n", status);
+		gsb_rqsx_response_error(buffer, status);
+	}
+
+	return AE_OK;
+}
+
+static acpi_status san_opreg_handler(u32 function, acpi_physical_address command,
+				     u32 bits, u64 *value64, void *opreg_context,
+				     void *region_context)
+{
+	struct san_data *d = to_san_data(opreg_context, info);
+	struct gsb_buffer *buffer = (struct gsb_buffer *)value64;
+	int accessor_type = (function & 0xFFFF0000) >> 16;
+
+	if (command != SAN_GSB_COMMAND) {
+		dev_warn(d->dev, "unsupported command: %#04llx\n", command);
+		return AE_OK;
+	}
+
+	if (accessor_type != ACPI_GSB_ACCESS_ATTRIB_RAW_PROCESS) {
+		dev_err(d->dev, "invalid access type: %#04x\n", accessor_type);
+		return AE_OK;
+	}
+
+	/* Buffer must have at least contain the command-value. */
+	if (buffer->len == 0) {
+		dev_err(d->dev, "request-package too small\n");
+		return AE_OK;
+	}
+
+	switch (buffer->data.in.cv) {
+	case SAN_GSB_REQUEST_CV_RQST:
+		return san_rqst(d, buffer);
+
+	case SAN_GSB_REQUEST_CV_ETWL:
+		return san_etwl(d, buffer);
+
+	case SAN_GSB_REQUEST_CV_RQSG:
+		return san_rqsg(d, buffer);
+
+	default:
+		dev_warn(d->dev, "unsupported SAN0 request (cv: %#04x)\n",
+			 buffer->data.in.cv);
+		return AE_OK;
+	}
+}
+
+
+/* -- Driver setup. --------------------------------------------------------- */
+
+static int san_events_register(struct platform_device *pdev)
+{
+	struct san_data *d = platform_get_drvdata(pdev);
+	int status;
+
+	d->nf_bat.base.priority = 1;
+	d->nf_bat.base.fn = san_evt_bat_nf;
+	d->nf_bat.event.reg = SSAM_EVENT_REGISTRY_SAM;
+	d->nf_bat.event.id.target_category = SSAM_SSH_TC_BAT;
+	d->nf_bat.event.id.instance = 0;
+	d->nf_bat.event.mask = SSAM_EVENT_MASK_TARGET;
+	d->nf_bat.event.flags = SSAM_EVENT_SEQUENCED;
+
+	d->nf_tmp.base.priority = 1;
+	d->nf_tmp.base.fn = san_evt_tmp_nf;
+	d->nf_tmp.event.reg = SSAM_EVENT_REGISTRY_SAM;
+	d->nf_tmp.event.id.target_category = SSAM_SSH_TC_TMP;
+	d->nf_tmp.event.id.instance = 0;
+	d->nf_tmp.event.mask = SSAM_EVENT_MASK_TARGET;
+	d->nf_tmp.event.flags = SSAM_EVENT_SEQUENCED;
+
+	status = ssam_notifier_register(d->ctrl, &d->nf_bat);
+	if (status)
+		return status;
+
+	status = ssam_notifier_register(d->ctrl, &d->nf_tmp);
+	if (status)
+		ssam_notifier_unregister(d->ctrl, &d->nf_bat);
+
+	return status;
+}
+
+static void san_events_unregister(struct platform_device *pdev)
+{
+	struct san_data *d = platform_get_drvdata(pdev);
+
+	ssam_notifier_unregister(d->ctrl, &d->nf_bat);
+	ssam_notifier_unregister(d->ctrl, &d->nf_tmp);
+}
+
+#define san_consumer_printk(level, dev, handle, fmt, ...)			\
+do {										\
+	char *path = "<error getting consumer path>";				\
+	struct acpi_buffer buffer = {						\
+		.length = ACPI_ALLOCATE_BUFFER,					\
+		.pointer = NULL,						\
+	};									\
+										\
+	if (ACPI_SUCCESS(acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer)))	\
+		path = buffer.pointer;						\
+										\
+	dev_##level(dev, "[%s]: " fmt, path, ##__VA_ARGS__);			\
+	kfree(buffer.pointer);							\
+} while (0)
+
+#define san_consumer_dbg(dev, handle, fmt, ...) \
+	san_consumer_printk(dbg, dev, handle, fmt, ##__VA_ARGS__)
+
+#define san_consumer_warn(dev, handle, fmt, ...) \
+	san_consumer_printk(warn, dev, handle, fmt, ##__VA_ARGS__)
+
+static bool is_san_consumer(struct platform_device *pdev, acpi_handle handle)
+{
+	struct acpi_handle_list dep_devices;
+	acpi_handle supplier = ACPI_HANDLE(&pdev->dev);
+	acpi_status status;
+	int i;
+
+	if (!acpi_has_method(handle, "_DEP"))
+		return false;
+
+	status = acpi_evaluate_reference(handle, "_DEP", NULL, &dep_devices);
+	if (ACPI_FAILURE(status)) {
+		san_consumer_dbg(&pdev->dev, handle, "failed to evaluate _DEP\n");
+		return false;
+	}
+
+	for (i = 0; i < dep_devices.count; i++) {
+		if (dep_devices.handles[i] == supplier)
+			return true;
+	}
+
+	return false;
+}
+
+static acpi_status san_consumer_setup(acpi_handle handle, u32 lvl,
+				      void *context, void **rv)
+{
+	const u32 flags = DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_SUPPLIER;
+	struct platform_device *pdev = context;
+	struct acpi_device *adev;
+	struct device_link *link;
+
+	if (!is_san_consumer(pdev, handle))
+		return AE_OK;
+
+	/* Ignore ACPI devices that are not present. */
+	if (acpi_bus_get_device(handle, &adev) != 0)
+		return AE_OK;
+
+	san_consumer_dbg(&pdev->dev, handle, "creating device link\n");
+
+	/* Try to set up device links, ignore but log errors. */
+	link = device_link_add(&adev->dev, &pdev->dev, flags);
+	if (!link) {
+		san_consumer_warn(&pdev->dev, handle, "failed to create device link\n");
+		return AE_OK;
+	}
+
+	return AE_OK;
+}
+
+static int san_consumer_links_setup(struct platform_device *pdev)
+{
+	acpi_status status;
+
+	status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT,
+				     ACPI_UINT32_MAX, san_consumer_setup, NULL,
+				     pdev, NULL);
+
+	return status ? -EFAULT : 0;
+}
+
+static int san_probe(struct platform_device *pdev)
+{
+	acpi_handle san = ACPI_HANDLE(&pdev->dev);
+	struct ssam_controller *ctrl;
+	struct san_data *data;
+	acpi_status astatus;
+	int status;
+
+	ctrl = ssam_client_bind(&pdev->dev);
+	if (IS_ERR(ctrl))
+		return PTR_ERR(ctrl) == -ENODEV ? -EPROBE_DEFER : PTR_ERR(ctrl);
+
+	status = san_consumer_links_setup(pdev);
+	if (status)
+		return status;
+
+	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->dev = &pdev->dev;
+	data->ctrl = ctrl;
+
+	platform_set_drvdata(pdev, data);
+
+	astatus = acpi_install_address_space_handler(san, ACPI_ADR_SPACE_GSBUS,
+						     &san_opreg_handler, NULL,
+						     &data->info);
+	if (ACPI_FAILURE(astatus))
+		return -ENXIO;
+
+	status = san_events_register(pdev);
+	if (status)
+		goto err_enable_events;
+
+	status = san_set_rqsg_interface_device(&pdev->dev);
+	if (status)
+		goto err_install_dev;
+
+	acpi_walk_dep_device_list(san);
+	return 0;
+
+err_install_dev:
+	san_events_unregister(pdev);
+err_enable_events:
+	acpi_remove_address_space_handler(san, ACPI_ADR_SPACE_GSBUS,
+					  &san_opreg_handler);
+	return status;
+}
+
+static int san_remove(struct platform_device *pdev)
+{
+	acpi_handle san = ACPI_HANDLE(&pdev->dev);
+
+	san_set_rqsg_interface_device(NULL);
+	acpi_remove_address_space_handler(san, ACPI_ADR_SPACE_GSBUS,
+					  &san_opreg_handler);
+	san_events_unregister(pdev);
+
+	/*
+	 * We have unregistered our event sources. Now we need to ensure that
+	 * all delayed works they may have spawned are run to completion.
+	 */
+	flush_scheduled_work();
+
+	return 0;
+}
+
+static const struct acpi_device_id san_match[] = {
+	{ "MSHW0091" },
+	{ },
+};
+MODULE_DEVICE_TABLE(acpi, san_match);
+
+static struct platform_driver surface_acpi_notify = {
+	.probe = san_probe,
+	.remove = san_remove,
+	.driver = {
+		.name = "surface_acpi_notify",
+		.acpi_match_table = san_match,
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+	},
+};
+module_platform_driver(surface_acpi_notify);
+
+MODULE_AUTHOR("Maximilian Luz <luzmaximilian@gmail.com>");
+MODULE_DESCRIPTION("Surface ACPI Notify driver for Surface System Aggregator Module");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/surface_acpi_notify.h b/include/linux/surface_acpi_notify.h
new file mode 100644
index 000000000000..8e3e86c7d78c
--- /dev/null
+++ b/include/linux/surface_acpi_notify.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Interface for Surface ACPI Notify (SAN) driver.
+ *
+ * Provides access to discrete GPU notifications sent from ACPI via the SAN
+ * driver, which are not handled by this driver directly.
+ *
+ * Copyright (C) 2019-2020 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _LINUX_SURFACE_ACPI_NOTIFY_H
+#define _LINUX_SURFACE_ACPI_NOTIFY_H
+
+#include <linux/notifier.h>
+#include <linux/types.h>
+
+/**
+ * struct san_dgpu_event - Discrete GPU ACPI event.
+ * @category: Category of the event.
+ * @target:   Target ID of the event source.
+ * @command:  Command ID of the event.
+ * @instance: Instance ID of the event source.
+ * @length:   Length of the event's payload data (in bytes).
+ * @payload:  Pointer to the event's payload data.
+ */
+struct san_dgpu_event {
+	u8 category;
+	u8 target;
+	u8 command;
+	u8 instance;
+	u16 length;
+	u8 *payload;
+};
+
+int san_client_link(struct device *client);
+int san_dgpu_notifier_register(struct notifier_block *nb);
+int san_dgpu_notifier_unregister(struct notifier_block *nb);
+
+#endif /* _LINUX_SURFACE_ACPI_NOTIFY_H */
-- 
cgit v1.2.3


From ae5c2341ed3987bd434ed495bd4f3d8b2bc3e623 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Wed, 23 Sep 2020 11:22:09 -0400
Subject: rcu/segcblist: Add counters to segcblist datastructure

Add counting of segment lengths of segmented callback list.

This will be useful for a number of things such as knowing how big the
ready-to-execute segment have gotten. The immediate benefit is ability
to trace how the callbacks in the segmented callback list change.

Also this patch remove hacks related to using donecbs's ->len field as a
temporary variable to save the segmented callback list's length. This cannot be
done anymore and is not needed.

Also fix SRCU:
The negative counting of the unsegmented list cannot be used to adjust
the segmented one. To fix this, sample the unsegmented length in
advance, and use it after CB execution to adjust the segmented list's
length.

Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcu_segcblist.h |   1 +
 kernel/rcu/rcu_segcblist.c    | 120 +++++++++++++++++++++++++++---------------
 kernel/rcu/rcu_segcblist.h    |   2 -
 kernel/rcu/srcutree.c         |   5 +-
 4 files changed, 82 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index b36afe7b22c9..6c01f09a6456 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -72,6 +72,7 @@ struct rcu_segcblist {
 #else
 	long len;
 #endif
+	long seglen[RCU_CBLIST_NSEGS];
 	u8 enabled;
 	u8 offloaded;
 };
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 3cff8003f707..777780447887 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -7,10 +7,10 @@
  * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
  */
 
-#include <linux/types.h>
-#include <linux/kernel.h>
+#include <linux/cpu.h>
 #include <linux/interrupt.h>
-#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
 
 #include "rcu_segcblist.h"
 
@@ -88,6 +88,46 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
 #endif
 }
 
+/* Get the length of a segment of the rcu_segcblist structure. */
+static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+	return READ_ONCE(rsclp->seglen[seg]);
+}
+
+/* Set the length of a segment of the rcu_segcblist structure. */
+static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+	WRITE_ONCE(rsclp->seglen[seg], v);
+}
+
+/* Increase the numeric length of a segment by a specified amount. */
+static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+	WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v);
+}
+
+/* Move from's segment length to to's segment. */
+static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to)
+{
+	long len;
+
+	if (from == to)
+		return;
+
+	len = rcu_segcblist_get_seglen(rsclp, from);
+	if (!len)
+		return;
+
+	rcu_segcblist_add_seglen(rsclp, to, len);
+	rcu_segcblist_set_seglen(rsclp, from, 0);
+}
+
+/* Increment segment's length. */
+static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+	rcu_segcblist_add_seglen(rsclp, seg, 1);
+}
+
 /*
  * Increase the numeric length of an rcu_segcblist structure by the
  * specified amount, which can be negative.  This can cause the ->len
@@ -179,26 +219,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
 	rcu_segcblist_add_len(rsclp, 1);
 }
 
-/*
- * Exchange the numeric length of the specified rcu_segcblist structure
- * with the specified value.  This can cause the ->len field to disagree
- * with the actual number of callbacks on the structure.  This exchange is
- * fully ordered with respect to the callers accesses both before and after.
- */
-static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
-{
-#ifdef CONFIG_RCU_NOCB_CPU
-	return atomic_long_xchg(&rsclp->len, v);
-#else
-	long ret = rsclp->len;
-
-	smp_mb(); /* Up to the caller! */
-	WRITE_ONCE(rsclp->len, v);
-	smp_mb(); /* Up to the caller! */
-	return ret;
-#endif
-}
-
 /*
  * Initialize an rcu_segcblist structure.
  */
@@ -209,8 +229,10 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
 	BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
 	BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
 	rsclp->head = NULL;
-	for (i = 0; i < RCU_CBLIST_NSEGS; i++)
+	for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
 		rsclp->tails[i] = &rsclp->head;
+		rcu_segcblist_set_seglen(rsclp, i, 0);
+	}
 	rcu_segcblist_set_len(rsclp, 0);
 	rsclp->enabled = 1;
 }
@@ -306,6 +328,7 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
 {
 	rcu_segcblist_inc_len(rsclp);
 	smp_mb(); /* Ensure counts are updated before callback is enqueued. */
+	rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL);
 	rhp->next = NULL;
 	WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
 	WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
@@ -334,27 +357,13 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
 	for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
 		if (rsclp->tails[i] != rsclp->tails[i - 1])
 			break;
+	rcu_segcblist_inc_seglen(rsclp, i);
 	WRITE_ONCE(*rsclp->tails[i], rhp);
 	for (; i <= RCU_NEXT_TAIL; i++)
 		WRITE_ONCE(rsclp->tails[i], &rhp->next);
 	return true;
 }
 
-/*
- * Extract only the counts from the specified rcu_segcblist structure,
- * and place them in the specified rcu_cblist structure.  This function
- * supports both callback orphaning and invocation, hence the separation
- * of counts and callbacks.  (Callbacks ready for invocation must be
- * orphaned and adopted separately from pending callbacks, but counts
- * apply to all callbacks.  Locking must be used to make sure that
- * both orphaned-callbacks lists are consistent.)
- */
-void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
-					       struct rcu_cblist *rclp)
-{
-	rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
-}
-
 /*
  * Extract only those callbacks ready to be invoked from the specified
  * rcu_segcblist structure and place them in the specified rcu_cblist
@@ -367,6 +376,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
 
 	if (!rcu_segcblist_ready_cbs(rsclp))
 		return; /* Nothing to do. */
+	rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL);
 	*rclp->tail = rsclp->head;
 	WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]);
 	WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
@@ -374,6 +384,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
 	for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
 		if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
 			WRITE_ONCE(rsclp->tails[i], &rsclp->head);
+	rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
 }
 
 /*
@@ -390,11 +401,15 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
 
 	if (!rcu_segcblist_pend_cbs(rsclp))
 		return; /* Nothing to do. */
+	rclp->len = 0;
 	*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
 	rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
 	WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
-	for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
+	for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) {
+		rclp->len += rcu_segcblist_get_seglen(rsclp, i);
 		WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
+		rcu_segcblist_set_seglen(rsclp, i, 0);
+	}
 }
 
 /*
@@ -405,7 +420,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
 				struct rcu_cblist *rclp)
 {
 	rcu_segcblist_add_len(rsclp, rclp->len);
-	rclp->len = 0;
 }
 
 /*
@@ -419,6 +433,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
 
 	if (!rclp->head)
 		return; /* No callbacks to move. */
+	rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len);
 	*rclp->tail = rsclp->head;
 	WRITE_ONCE(rsclp->head, rclp->head);
 	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
@@ -439,6 +454,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
 {
 	if (!rclp->head)
 		return; /* Nothing to do. */
+
+	rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len);
 	WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
 	WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
 }
@@ -463,6 +480,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 		if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
 			break;
 		WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
+		rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
 	}
 
 	/* If no callbacks moved, nothing more need be done. */
@@ -483,6 +501,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 		if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
 			break;  /* No more callbacks. */
 		WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
+		rcu_segcblist_move_seglen(rsclp, i, j);
 		rsclp->gp_seq[j] = rsclp->gp_seq[i];
 	}
 }
@@ -504,7 +523,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
  */
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	int i;
+	int i, j;
 
 	WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
 	if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
@@ -547,6 +566,10 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 	if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL)
 		return false;
 
+	/* Accounting: everything below i is about to get merged into i. */
+	for (j = i + 1; j <= RCU_NEXT_TAIL; j++)
+		rcu_segcblist_move_seglen(rsclp, j, i);
+
 	/*
 	 * Merge all later callbacks, including newly arrived callbacks,
 	 * into the segment located by the for-loop above.  Assign "seq"
@@ -574,13 +597,24 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 	struct rcu_cblist donecbs;
 	struct rcu_cblist pendcbs;
 
+	lockdep_assert_cpus_held();
+
 	rcu_cblist_init(&donecbs);
 	rcu_cblist_init(&pendcbs);
-	rcu_segcblist_extract_count(src_rsclp, &donecbs);
+
 	rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
 	rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+
+	/*
+	 * No need smp_mb() before setting length to 0, because CPU hotplug
+	 * lock excludes rcu_barrier.
+	 */
+	rcu_segcblist_set_len(src_rsclp, 0);
+
 	rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+	rcu_segcblist_insert_count(dst_rsclp, &pendcbs);
 	rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
 	rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+
 	rcu_segcblist_init(src_rsclp);
 }
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 1d2d61406463..cd35c9faaf51 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -89,8 +89,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
 			   struct rcu_head *rhp);
 bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
 			   struct rcu_head *rhp);
-void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
-				 struct rcu_cblist *rclp);
 void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
 				    struct rcu_cblist *rclp);
 void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0f23d20d485a..79b7081143a7 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1160,6 +1160,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
  */
 static void srcu_invoke_callbacks(struct work_struct *work)
 {
+	long len;
 	bool more;
 	struct rcu_cblist ready_cbs;
 	struct rcu_head *rhp;
@@ -1182,6 +1183,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	/* We are on the job!  Extract and invoke ready callbacks. */
 	sdp->srcu_cblist_invoking = true;
 	rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
+	len = ready_cbs.len;
 	spin_unlock_irq_rcu_node(sdp);
 	rhp = rcu_cblist_dequeue(&ready_cbs);
 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
@@ -1190,13 +1192,14 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 		rhp->func(rhp);
 		local_bh_enable();
 	}
+	WARN_ON_ONCE(ready_cbs.len);
 
 	/*
 	 * Update counts, accelerate new callbacks, and if needed,
 	 * schedule another round of callback invocation.
 	 */
 	spin_lock_irq_rcu_node(sdp);
-	rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
+	rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
 	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
 				       rcu_seq_snap(&ssp->srcu_gp_seq));
 	sdp->srcu_cblist_invoking = false;
-- 
cgit v1.2.3


From 65e560327fe68153a9ad7452d5fd3171a1927d33 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 13 Nov 2020 13:13:16 +0100
Subject: rcu/nocb: Turn enabled/offload states into a common flag

This commit gathers the rcu_segcblist ->enabled and ->offloaded property
field into a single ->flags bitmask to avoid further proliferation of
individual u8 fields in the structure.  This change prepares for the
state formerly known as ->offloaded state to be modified at runtime.

Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Inspired-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcu_segcblist.h |  6 ++++--
 kernel/rcu/rcu_segcblist.c    |  6 +++---
 kernel/rcu/rcu_segcblist.h    | 23 +++++++++++++++++++++--
 3 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 6c01f09a6456..4714b0263c76 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -63,6 +63,9 @@ struct rcu_cblist {
 #define RCU_NEXT_TAIL		3
 #define RCU_CBLIST_NSEGS	4
 
+#define SEGCBLIST_ENABLED	BIT(0)
+#define SEGCBLIST_OFFLOADED	BIT(1)
+
 struct rcu_segcblist {
 	struct rcu_head *head;
 	struct rcu_head **tails[RCU_CBLIST_NSEGS];
@@ -73,8 +76,7 @@ struct rcu_segcblist {
 	long len;
 #endif
 	long seglen[RCU_CBLIST_NSEGS];
-	u8 enabled;
-	u8 offloaded;
+	u8 flags;
 };
 
 #define RCU_SEGCBLIST_INITIALIZER(n) \
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 89e0dff890bf..934945d72ca5 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -246,7 +246,7 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
 		rcu_segcblist_set_seglen(rsclp, i, 0);
 	}
 	rcu_segcblist_set_len(rsclp, 0);
-	rsclp->enabled = 1;
+	rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED);
 }
 
 /*
@@ -257,7 +257,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
 {
 	WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
 	WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
-	rsclp->enabled = 0;
+	rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED);
 }
 
 /*
@@ -266,7 +266,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
  */
 void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
 {
-	rsclp->offloaded = 1;
+	rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED);
 }
 
 /*
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 18e101de8747..ff372db09f8b 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -53,19 +53,38 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
 #endif
 }
 
+static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
+					   int flags)
+{
+	rsclp->flags |= flags;
+}
+
+static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp,
+					     int flags)
+{
+	rsclp->flags &= ~flags;
+}
+
+static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp,
+					    int flags)
+{
+	return READ_ONCE(rsclp->flags) & flags;
+}
+
 /*
  * Is the specified rcu_segcblist enabled, for example, not corresponding
  * to an offline CPU?
  */
 static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
 {
-	return rsclp->enabled;
+	return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED);
 }
 
 /* Is the specified rcu_segcblist offloaded?  */
 static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
 {
-	return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded;
+	return IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+		rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED);
 }
 
 /*
-- 
cgit v1.2.3


From 8d346d438f93b5344e99d429727ec9c2f392d4ec Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 13 Nov 2020 13:13:17 +0100
Subject: rcu/nocb: Provide basic callback offloading state machine bits

Offloading and de-offloading RCU callback processes must be done
carefully.  There must never be a time at which callback processing is
disabled because the task driving the offloading or de-offloading might be
preempted or otherwise stalled at that point in time, which would result
in OOM due to calbacks piling up indefinitely.  This implies that there
will be times during which a given CPU's callbacks might be concurrently
invoked by both that CPU's RCU_SOFTIRQ handler (or, equivalently, that
CPU's rcuc kthread) and by that CPU's rcuo kthread.

This situation could fatally confuse both rcu_barrier() and the
CPU-hotplug offlining process, so these must be excluded during any
concurrent-callback-invocation period.  In addition, during times of
concurrent callback invocation, changes to ->cblist must be protected
both as needed for RCU_SOFTIRQ and as needed for the rcuo kthread.

This commit therefore defines and documents the states for a state
machine that coordinates offloading and deoffloading.

Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Inspired-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcu_segcblist.h | 115 +++++++++++++++++++++++++++++++++++++++++-
 kernel/rcu/rcu_segcblist.c    |   1 +
 kernel/rcu/rcu_segcblist.h    |  12 ++++-
 kernel/rcu/tree.c             |   3 ++
 4 files changed, 128 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 4714b0263c76..8afe886e85f1 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -63,8 +63,121 @@ struct rcu_cblist {
 #define RCU_NEXT_TAIL		3
 #define RCU_CBLIST_NSEGS	4
 
+
+/*
+ *                     ==NOCB Offloading state machine==
+ *
+ *
+ *  ----------------------------------------------------------------------------
+ *  |                         SEGCBLIST_SOFTIRQ_ONLY                           |
+ *  |                                                                          |
+ *  |  Callbacks processed by rcu_core() from softirqs or local                |
+ *  |  rcuc kthread, without holding nocb_lock.                                |
+ *  ----------------------------------------------------------------------------
+ *                                         |
+ *                                         v
+ *  ----------------------------------------------------------------------------
+ *  |                        SEGCBLIST_OFFLOADED                               |
+ *  |                                                                          |
+ *  | Callbacks processed by rcu_core() from softirqs or local                 |
+ *  | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads,     |
+ *  | allowing nocb_timer to be armed.                                         |
+ *  ----------------------------------------------------------------------------
+ *                                         |
+ *                                         v
+ *                        -----------------------------------
+ *                        |                                 |
+ *                        v                                 v
+ *  ---------------------------------------  ----------------------------------|
+ *  |        SEGCBLIST_OFFLOADED |        |  |     SEGCBLIST_OFFLOADED |       |
+ *  |        SEGCBLIST_KTHREAD_CB         |  |     SEGCBLIST_KTHREAD_GP        |
+ *  |                                     |  |                                 |
+ *  |                                     |  |                                 |
+ *  | CB kthread woke up and              |  | GP kthread woke up and          |
+ *  | acknowledged SEGCBLIST_OFFLOADED.   |  | acknowledged SEGCBLIST_OFFLOADED|
+ *  | Processes callbacks concurrently    |  |                                 |
+ *  | with rcu_core(), holding            |  |                                 |
+ *  | nocb_lock.                          |  |                                 |
+ *  ---------------------------------------  -----------------------------------
+ *                        |                                 |
+ *                        -----------------------------------
+ *                                         |
+ *                                         v
+ *  |--------------------------------------------------------------------------|
+ *  |                           SEGCBLIST_OFFLOADED |                          |
+ *  |                           SEGCBLIST_KTHREAD_CB |                         |
+ *  |                           SEGCBLIST_KTHREAD_GP                           |
+ *  |                                                                          |
+ *  |   Kthreads handle callbacks holding nocb_lock, local rcu_core() stops    |
+ *  |   handling callbacks.                                                    |
+ *  ----------------------------------------------------------------------------
+ */
+
+
+
+/*
+ *                       ==NOCB De-Offloading state machine==
+ *
+ *
+ *  |--------------------------------------------------------------------------|
+ *  |                           SEGCBLIST_OFFLOADED |                          |
+ *  |                           SEGCBLIST_KTHREAD_CB |                         |
+ *  |                           SEGCBLIST_KTHREAD_GP                           |
+ *  |                                                                          |
+ *  |   CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core()    |
+ *  |   ignores callbacks.                                                     |
+ *  ----------------------------------------------------------------------------
+ *                                      |
+ *                                      v
+ *  |--------------------------------------------------------------------------|
+ *  |                           SEGCBLIST_KTHREAD_CB |                         |
+ *  |                           SEGCBLIST_KTHREAD_GP                           |
+ *  |                                                                          |
+ *  |   CB/GP kthreads and local rcu_core() handle callbacks concurrently      |
+ *  |   holding nocb_lock. Wake up CB and GP kthreads if necessary.            |
+ *  ----------------------------------------------------------------------------
+ *                                      |
+ *                                      v
+ *                     -----------------------------------
+ *                     |                                 |
+ *                     v                                 v
+ *  ---------------------------------------------------------------------------|
+ *  |                                                                          |
+ *  |        SEGCBLIST_KTHREAD_CB         |       SEGCBLIST_KTHREAD_GP         |
+ *  |                                     |                                    |
+ *  | GP kthread woke up and              |   CB kthread woke up and           |
+ *  | acknowledged the fact that          |   acknowledged the fact that       |
+ *  | SEGCBLIST_OFFLOADED got cleared.    |   SEGCBLIST_OFFLOADED got cleared. |
+ *  |                                     |   The CB kthread goes to sleep     |
+ *  | The callbacks from the target CPU   |   until it ever gets re-offloaded. |
+ *  | will be ignored from the GP kthread |                                    |
+ *  | loop.                               |                                    |
+ *  ----------------------------------------------------------------------------
+ *                      |                                 |
+ *                      -----------------------------------
+ *                                      |
+ *                                      v
+ *  ----------------------------------------------------------------------------
+ *  |                                   0                                      |
+ *  |                                                                          |
+ *  | Callbacks processed by rcu_core() from softirqs or local                 |
+ *  | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed.    |
+ *  | Flush pending nocb_timer. Flush nocb bypass callbacks.                   |
+ *  ----------------------------------------------------------------------------
+ *                                      |
+ *                                      v
+ *  ----------------------------------------------------------------------------
+ *  |                         SEGCBLIST_SOFTIRQ_ONLY                           |
+ *  |                                                                          |
+ *  |  Callbacks processed by rcu_core() from softirqs or local                |
+ *  |  rcuc kthread, without holding nocb_lock.                                |
+ *  ----------------------------------------------------------------------------
+ */
 #define SEGCBLIST_ENABLED	BIT(0)
-#define SEGCBLIST_OFFLOADED	BIT(1)
+#define SEGCBLIST_SOFTIRQ_ONLY	BIT(1)
+#define SEGCBLIST_KTHREAD_CB	BIT(2)
+#define SEGCBLIST_KTHREAD_GP	BIT(3)
+#define SEGCBLIST_OFFLOADED	BIT(4)
 
 struct rcu_segcblist {
 	struct rcu_head *head;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 934945d72ca5..7fc6362625b2 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -266,6 +266,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
  */
 void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
 {
+	rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY);
 	rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED);
 }
 
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index ff372db09f8b..e05952ab9b87 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -83,8 +83,16 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
 /* Is the specified rcu_segcblist offloaded?  */
 static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
 {
-	return IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
-		rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED);
+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU)) {
+		/*
+		 * Complete de-offloading happens only when SEGCBLIST_SOFTIRQ_ONLY
+		 * is set.
+		 */
+		if (!rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY))
+			return true;
+	}
+
+	return false;
 }
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8086c0467c15..7cfc2e8a2500 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -83,6 +83,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
 	.dynticks_nesting = 1,
 	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
 	.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+#ifdef CONFIG_RCU_NOCB_CPU
+	.cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
+#endif
 };
 static struct rcu_state rcu_state = {
 	.level = { &rcu_state.node[0] },
-- 
cgit v1.2.3


From d97b078182406c0bd0aacd36fc0a693e118e608f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 13 Nov 2020 13:13:19 +0100
Subject: rcu/nocb: De-offloading CB kthread

To de-offload callback processing back onto a CPU, it is necessary to
clear SEGCBLIST_OFFLOAD and notify the nocb CB kthread, which will then
clear its own bit flag and go to sleep to stop handling callbacks.  This
commit makes that change.  It will also be necessary to notify the nocb
GP kthread in this same way, which is the subject of a follow-on commit.

Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Inspired-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
[ paulmck: Add export per kernel test robot feedback. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h   |   2 +
 kernel/rcu/rcu_segcblist.c |  10 ++--
 kernel/rcu/rcu_segcblist.h |   2 +-
 kernel/rcu/tree.h          |   1 +
 kernel/rcu/tree_plugin.h   | 130 ++++++++++++++++++++++++++++++++++++++-------
 5 files changed, 123 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index de0826411311..40266eb418b6 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -104,8 +104,10 @@ static inline void rcu_user_exit(void) { }
 
 #ifdef CONFIG_RCU_NOCB_CPU
 void rcu_init_nohz(void);
+int rcu_nocb_cpu_deoffload(int cpu);
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
 static inline void rcu_init_nohz(void) { }
+static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
 /**
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 7fc6362625b2..7f181c9675f7 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -264,10 +264,14 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
  * Mark the specified rcu_segcblist structure as offloaded.  This
  * structure must be empty.
  */
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
 {
-	rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY);
-	rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED);
+	if (offload) {
+		rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY);
+		rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED);
+	} else {
+		rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
+	}
 }
 
 /*
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index e05952ab9b87..28c9a5225afc 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -109,7 +109,7 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
 void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v);
 void rcu_segcblist_init(struct rcu_segcblist *rsclp);
 void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp);
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload);
 bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
 bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
 struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7708ed161f4a..e0deb4829847 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -201,6 +201,7 @@ struct rcu_data {
 	/* 5) Callback offloading. */
 #ifdef CONFIG_RCU_NOCB_CPU
 	struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
+	struct swait_queue_head nocb_state_wq; /* For offloading state changes */
 	struct task_struct *nocb_gp_kthread;
 	raw_spinlock_t nocb_lock;	/* Guard following pair of fields. */
 	atomic_t nocb_lock_contended;	/* Contention experienced. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7e291ce0a1d6..1b870d0d2445 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2081,16 +2081,29 @@ static int rcu_nocb_gp_kthread(void *arg)
 	return 0;
 }
 
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+	u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+	return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+	return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
 /*
  * Invoke any ready callbacks from the corresponding no-CBs CPU,
  * then, if there are no more, wait for more to appear.
  */
 static void nocb_cb_wait(struct rcu_data *rdp)
 {
+	struct rcu_segcblist *cblist = &rdp->cblist;
+	struct rcu_node *rnp = rdp->mynode;
+	bool needwake_state = false;
+	bool needwake_gp = false;
 	unsigned long cur_gp_seq;
 	unsigned long flags;
-	bool needwake_gp = false;
-	struct rcu_node *rnp = rdp->mynode;
 
 	local_irq_save(flags);
 	rcu_momentary_dyntick_idle();
@@ -2100,32 +2113,50 @@ static void nocb_cb_wait(struct rcu_data *rdp)
 	local_bh_enable();
 	lockdep_assert_irqs_enabled();
 	rcu_nocb_lock_irqsave(rdp, flags);
-	if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+	if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
 	    rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
 	    raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
 		needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
 		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
 	}
-	if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
-		rcu_nocb_unlock_irqrestore(rdp, flags);
-		if (needwake_gp)
-			rcu_gp_kthread_wake();
-		return;
-	}
 
-	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
 	WRITE_ONCE(rdp->nocb_cb_sleep, true);
+
+	if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+		if (rcu_segcblist_ready_cbs(cblist))
+			WRITE_ONCE(rdp->nocb_cb_sleep, false);
+	} else {
+		/*
+		 * De-offloading. Clear our flag and notify the de-offload worker.
+		 * We won't touch the callbacks and keep sleeping until we ever
+		 * get re-offloaded.
+		 */
+		WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+		rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+		if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+			needwake_state = true;
+	}
+
+	if (rdp->nocb_cb_sleep)
+		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
 	rcu_nocb_unlock_irqrestore(rdp, flags);
 	if (needwake_gp)
 		rcu_gp_kthread_wake();
-	swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
-				 !READ_ONCE(rdp->nocb_cb_sleep));
-	if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
+
+	if (needwake_state)
+		swake_up_one(&rdp->nocb_state_wq);
+
+	do {
+		swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+						    nocb_cb_wait_cond(rdp));
+
 		/* ^^^ Ensure CB invocation follows _sleep test. */
-		return;
-	}
-	WARN_ON(signal_pending(current));
-	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+		if (smp_load_acquire(&rdp->nocb_cb_sleep)) {
+			WARN_ON(signal_pending(current));
+			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+		}
+	} while (!nocb_cb_can_run(rdp));
 }
 
 /*
@@ -2187,6 +2218,67 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
 		do_nocb_deferred_wakeup_common(rdp);
 }
 
+static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
+{
+	struct rcu_segcblist *cblist = &rdp->cblist;
+	bool wake_cb = false;
+	unsigned long flags;
+
+	printk("De-offloading %d\n", rdp->cpu);
+
+	rcu_nocb_lock_irqsave(rdp, flags);
+	rcu_segcblist_offload(cblist, false);
+
+	if (rdp->nocb_cb_sleep) {
+		rdp->nocb_cb_sleep = false;
+		wake_cb = true;
+	}
+	rcu_nocb_unlock_irqrestore(rdp, flags);
+
+	if (wake_cb)
+		swake_up_one(&rdp->nocb_cb_wq);
+
+	swait_event_exclusive(rdp->nocb_state_wq,
+			      !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+
+	return 0;
+}
+
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+	struct rcu_data *rdp = arg;
+
+	WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+	return __rcu_nocb_rdp_deoffload(rdp);
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	int ret = 0;
+
+	if (rdp == rdp->nocb_gp_rdp) {
+		pr_info("Can't deoffload an rdp GP leader (yet)\n");
+		return -EINVAL;
+	}
+	mutex_lock(&rcu_state.barrier_mutex);
+	cpus_read_lock();
+	if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
+		if (cpu_online(cpu)) {
+			ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+		} else {
+			ret = __rcu_nocb_rdp_deoffload(rdp);
+		}
+		if (!ret)
+			cpumask_clear_cpu(cpu, rcu_nocb_mask);
+	}
+	cpus_read_unlock();
+	mutex_unlock(&rcu_state.barrier_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
 void __init rcu_init_nohz(void)
 {
 	int cpu;
@@ -2229,7 +2321,8 @@ void __init rcu_init_nohz(void)
 		rdp = per_cpu_ptr(&rcu_data, cpu);
 		if (rcu_segcblist_empty(&rdp->cblist))
 			rcu_segcblist_init(&rdp->cblist);
-		rcu_segcblist_offload(&rdp->cblist);
+		rcu_segcblist_offload(&rdp->cblist, true);
+		rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
 	}
 	rcu_organize_nocb_kthreads();
 }
@@ -2239,6 +2332,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 {
 	init_swait_queue_head(&rdp->nocb_cb_wq);
 	init_swait_queue_head(&rdp->nocb_gp_wq);
+	init_swait_queue_head(&rdp->nocb_state_wq);
 	raw_spin_lock_init(&rdp->nocb_lock);
 	raw_spin_lock_init(&rdp->nocb_bypass_lock);
 	raw_spin_lock_init(&rdp->nocb_gp_lock);
-- 
cgit v1.2.3


From 254e11efde66ca0a0ce0c99a62c377314b5984ff Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 13 Nov 2020 13:13:22 +0100
Subject: rcu/nocb: Re-offload support

To re-offload the callback processing off of a CPU, it is necessary to
clear SEGCBLIST_SOFTIRQ_ONLY, set SEGCBLIST_OFFLOADED, and then notify
both the CB and GP kthreads so that they both set their own bit flag and
start processing the callbacks remotely.  The re-offloading worker is
then notified that it can stop the RCU_SOFTIRQ handler (or rcuc kthread,
as the case may be) from processing the callbacks locally.

Ordering must be carefully enforced so that the callbacks that used to be
processed locally without locking will have the same ordering properties
when they are invoked by the nocb CB and GP kthreads.

This commit makes this change.

Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Inspired-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
[ paulmck: Export rcu_nocb_cpu_offload(). ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h |   2 +
 kernel/rcu/tree_plugin.h | 158 ++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 138 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 40266eb418b6..e0ee52e2756d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -104,9 +104,11 @@ static inline void rcu_user_exit(void) { }
 
 #ifdef CONFIG_RCU_NOCB_CPU
 void rcu_init_nohz(void);
+int rcu_nocb_cpu_offload(int cpu);
 int rcu_nocb_cpu_deoffload(int cpu);
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
 static inline void rcu_init_nohz(void) { }
+static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
 static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fe46e7008667..03ae1ce4790f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1928,6 +1928,20 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
 	__call_rcu_nocb_wake(rdp, true, flags);
 }
 
+/*
+ * Check if we ignore this rdp.
+ *
+ * We check that without holding the nocb lock but
+ * we make sure not to miss a freshly offloaded rdp
+ * with the current ordering:
+ *
+ *  rdp_offload_toggle()        nocb_gp_enabled_cb()
+ * -------------------------   ----------------------------
+ *    WRITE flags                 LOCK nocb_gp_lock
+ *    LOCK nocb_gp_lock           READ/WRITE nocb_gp_sleep
+ *    READ/WRITE nocb_gp_sleep    UNLOCK nocb_gp_lock
+ *    UNLOCK nocb_gp_lock         READ flags
+ */
 static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
 {
 	u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
@@ -1940,6 +1954,11 @@ static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_sta
 	struct rcu_segcblist *cblist = &rdp->cblist;
 
 	if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+		if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+			rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+			if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+				*needwake_state = true;
+		}
 		return true;
 	} else {
 		/*
@@ -2003,6 +2022,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
 			rcu_nocb_unlock_irqrestore(rdp, flags);
+			if (needwake_state)
+				swake_up_one(&rdp->nocb_state_wq);
 			continue; /* No callbacks here, try next. */
 		}
 		if (bypass_ncbs) {
@@ -2054,6 +2075,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 		}
 		if (needwake_gp)
 			rcu_gp_kthread_wake();
+		if (needwake_state)
+			swake_up_one(&rdp->nocb_state_wq);
 	}
 
 	my_rdp->nocb_gp_bypass = bypass;
@@ -2159,6 +2182,11 @@ static void nocb_cb_wait(struct rcu_data *rdp)
 	WRITE_ONCE(rdp->nocb_cb_sleep, true);
 
 	if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+		if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+			rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+			if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+				needwake_state = true;
+		}
 		if (rcu_segcblist_ready_cbs(cblist))
 			WRITE_ONCE(rdp->nocb_cb_sleep, false);
 	} else {
@@ -2254,35 +2282,25 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
 		do_nocb_deferred_wakeup_common(rdp);
 }
 
-static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
+static int rdp_offload_toggle(struct rcu_data *rdp,
+			       bool offload, unsigned long flags)
+	__releases(rdp->nocb_lock)
 {
 	struct rcu_segcblist *cblist = &rdp->cblist;
 	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-	bool wake_cb = false, wake_gp = false;
-	unsigned long flags;
-
-	printk("De-offloading %d\n", rdp->cpu);
-
-	rcu_nocb_lock_irqsave(rdp, flags);
-	/*
-	 * If there are still pending work offloaded, the offline
-	 * CPU won't help much handling them.
-	 */
-	if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) {
-		rcu_nocb_unlock_irqrestore(rdp, flags);
-		return -EBUSY;
-	}
+	bool wake_gp = false;
 
-	rcu_segcblist_offload(cblist, false);
+	rcu_segcblist_offload(cblist, offload);
 
-	if (rdp->nocb_cb_sleep) {
+	if (rdp->nocb_cb_sleep)
 		rdp->nocb_cb_sleep = false;
-		wake_cb = true;
-	}
 	rcu_nocb_unlock_irqrestore(rdp, flags);
 
-	if (wake_cb)
-		swake_up_one(&rdp->nocb_cb_wq);
+	/*
+	 * Ignore former value of nocb_cb_sleep and force wake up as it could
+	 * have been spuriously set to false already.
+	 */
+	swake_up_one(&rdp->nocb_cb_wq);
 
 	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 	if (rdp_gp->nocb_gp_sleep) {
@@ -2294,10 +2312,32 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
 	if (wake_gp)
 		wake_up_process(rdp_gp->nocb_gp_kthread);
 
+	return 0;
+}
+
+static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
+{
+	struct rcu_segcblist *cblist = &rdp->cblist;
+	unsigned long flags;
+	int ret;
+
+	printk("De-offloading %d\n", rdp->cpu);
+
+	rcu_nocb_lock_irqsave(rdp, flags);
+	/*
+	 * If there are still pending work offloaded, the offline
+	 * CPU won't help much handling them.
+	 */
+	if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) {
+		rcu_nocb_unlock_irqrestore(rdp, flags);
+		return -EBUSY;
+	}
+
+	ret = rdp_offload_toggle(rdp, false, flags);
 	swait_event_exclusive(rdp->nocb_state_wq,
 			      !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
 							SEGCBLIST_KTHREAD_GP));
-	return 0;
+	return ret;
 }
 
 static long rcu_nocb_rdp_deoffload(void *arg)
@@ -2335,6 +2375,80 @@ int rcu_nocb_cpu_deoffload(int cpu)
 }
 EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
 
+static int __rcu_nocb_rdp_offload(struct rcu_data *rdp)
+{
+	struct rcu_segcblist *cblist = &rdp->cblist;
+	unsigned long flags;
+	int ret;
+
+	/*
+	 * For now we only support re-offload, ie: the rdp must have been
+	 * offloaded on boot first.
+	 */
+	if (!rdp->nocb_gp_rdp)
+		return -EINVAL;
+
+	printk("Offloading %d\n", rdp->cpu);
+	/*
+	 * Can't use rcu_nocb_lock_irqsave() while we are in
+	 * SEGCBLIST_SOFTIRQ_ONLY mode.
+	 */
+	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+	/*
+	 * We didn't take the nocb lock while working on the
+	 * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
+	 * Every modifications that have been done previously on
+	 * rdp->cblist must be visible remotely by the nocb kthreads
+	 * upon wake up after reading the cblist flags.
+	 *
+	 * The layout against nocb_lock enforces that ordering:
+	 *
+	 *  __rcu_nocb_rdp_offload()   nocb_cb_wait()/nocb_gp_wait()
+	 * -------------------------   ----------------------------
+	 *      WRITE callbacks           rcu_nocb_lock()
+	 *      rcu_nocb_lock()           READ flags
+	 *      WRITE flags               READ callbacks
+	 *      rcu_nocb_unlock()         rcu_nocb_unlock()
+	 */
+	ret = rdp_offload_toggle(rdp, true, flags);
+	swait_event_exclusive(rdp->nocb_state_wq,
+			      rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+			      rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+	return ret;
+}
+
+static long rcu_nocb_rdp_offload(void *arg)
+{
+	struct rcu_data *rdp = arg;
+
+	WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+	return __rcu_nocb_rdp_offload(rdp);
+}
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	int ret = 0;
+
+	mutex_lock(&rcu_state.barrier_mutex);
+	cpus_read_lock();
+	if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
+		if (cpu_online(cpu)) {
+			ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+		} else {
+			ret = __rcu_nocb_rdp_offload(rdp);
+		}
+		if (!ret)
+			cpumask_set_cpu(cpu, rcu_nocb_mask);
+	}
+	cpus_read_unlock();
+	mutex_unlock(&rcu_state.barrier_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+
 void __init rcu_init_nohz(void)
 {
 	int cpu;
-- 
cgit v1.2.3


From 43759fe5a137389e94ed6d4680c3c63c17273158 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 11 Nov 2020 23:53:13 +0100
Subject: cpu/hotplug: Add lockdep_is_cpus_held()

This commit adds a lockdep_is_cpus_held() function to verify that the
proper locks are held and that various operations are running in the
correct context.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/cpu.h | 2 ++
 kernel/cpu.c        | 7 +++++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d6428aaf67e7..3aaa0687e8df 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -111,6 +111,8 @@ static inline void cpu_maps_update_done(void)
 #endif /* CONFIG_SMP */
 extern struct bus_type cpu_subsys;
 
+extern int lockdep_is_cpus_held(void);
+
 #ifdef CONFIG_HOTPLUG_CPU
 extern void cpus_write_lock(void);
 extern void cpus_write_unlock(void);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 4e11e91010e1..1b6302ecbabe 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -330,6 +330,13 @@ void lockdep_assert_cpus_held(void)
 	percpu_rwsem_assert_held(&cpu_hotplug_lock);
 }
 
+#ifdef CONFIG_LOCKDEP
+int lockdep_is_cpus_held(void)
+{
+	return percpu_rwsem_is_held(&cpu_hotplug_lock);
+}
+#endif
+
 static void lockdep_acquire_cpus_lock(void)
 {
 	rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
-- 
cgit v1.2.3


From dcd42591ebb8a25895b551a5297ea9c24414ba54 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 13 Nov 2020 13:13:33 +0100
Subject: timer: Add timer_curr_running()

This commit adds a timer_curr_running() function that verifies that the
current code is running in the context of the specified timer's handler.

Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/timer.h |  2 ++
 kernel/time/timer.c   | 13 +++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index fda13c9d1256..4118a97e62fb 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -192,6 +192,8 @@ extern int try_to_del_timer_sync(struct timer_list *timer);
 
 #define del_singleshot_timer_sync(t) del_timer_sync(t)
 
+extern bool timer_curr_running(struct timer_list *timer);
+
 extern void init_timers(void);
 struct hrtimer;
 extern enum hrtimer_restart it_real_fn(struct hrtimer *);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 8dbc008f8942..f9b2096456e5 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1237,6 +1237,19 @@ int try_to_del_timer_sync(struct timer_list *timer)
 }
 EXPORT_SYMBOL(try_to_del_timer_sync);
 
+bool timer_curr_running(struct timer_list *timer)
+{
+	int i;
+
+	for (i = 0; i < NR_BASES; i++) {
+		struct timer_base *base = this_cpu_ptr(&timer_bases[i]);
+		if (base->running_timer == timer)
+			return true;
+	}
+
+	return false;
+}
+
 #ifdef CONFIG_PREEMPT_RT
 static __init void timer_base_init_expiry_lock(struct timer_base *base)
 {
-- 
cgit v1.2.3


From ae19aaafae95a5487469433e9cae4c208f8d15cd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 17 Nov 2020 11:30:18 -0800
Subject: torture: Add fuzzed hrtimer-based sleep functions

This commit adds torture_hrtimeout_ns(), torture_hrtimeout_us(),
torture_hrtimeout_ms(), torture_hrtimeout_jiffies(), and
torture_hrtimeout_s(), each of which uses hrtimers to block for a fuzzed
time interval.  These functions are intended to be used by the various
torture tests to decouple wakeups from the timer wheel, thus providing
more opportunity for Murphy to insert destructive race conditions.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/torture.h |  7 +++++
 kernel/torture.c        | 75 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index 7f65bd1dd307..32941f8106b2 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -61,6 +61,13 @@ static inline void torture_random_init(struct torture_random_state *trsp)
 	trsp->trs_count = 0;
 }
 
+/* Definitions for high-resolution-timer sleeps. */
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp);
+int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp);
+int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp);
+int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp);
+int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp);
+
 /* Task shuffler, which causes CPUs to occasionally go idle. */
 void torture_shuffle_task_register(struct task_struct *tp);
 int torture_shuffle_init(long shuffint);
diff --git a/kernel/torture.c b/kernel/torture.c
index 8562ac18d2eb..7548634e8a89 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -58,6 +58,81 @@ static int verbose;
 static int fullstop = FULLSTOP_RMMOD;
 static DEFINE_MUTEX(fullstop_mutex);
 
+/*
+ * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit
+ * nanosecond random fuzz.  This function and its friends desynchronize
+ * testing from the timer wheel.
+ */
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp)
+{
+	ktime_t hto = baset_ns;
+
+	if (trsp)
+		hto += (torture_random(trsp) >> 3) % fuzzt_ns;
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	return schedule_hrtimeout(&hto, HRTIMER_MODE_REL);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_ns);
+
+/*
+ * Schedule a high-resolution-timer sleep in microseconds, with a 32-bit
+ * nanosecond (not microsecond!) random fuzz.
+ */
+int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp)
+{
+	ktime_t baset_ns = baset_us * NSEC_PER_USEC;
+
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_us);
+
+/*
+ * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit
+ * microsecond (not millisecond!) random fuzz.
+ */
+int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp)
+{
+	ktime_t baset_ns = baset_ms * NSEC_PER_MSEC;
+	u32 fuzzt_ns;
+
+	if ((u32)~0U / NSEC_PER_USEC < fuzzt_us)
+		fuzzt_ns = (u32)~0U;
+	else
+		fuzzt_ns = fuzzt_us * NSEC_PER_USEC;
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_ms);
+
+/*
+ * Schedule a high-resolution-timer sleep in jiffies, with an
+ * implied one-jiffy random fuzz.  This is intended to replace calls to
+ * schedule_timeout_interruptible() and friends.
+ */
+int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp)
+{
+	ktime_t baset_ns = jiffies_to_nsecs(baset_j);
+
+	return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies);
+
+/*
+ * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit
+ * millisecond (not second!) random fuzz.
+ */
+int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp)
+{
+	ktime_t baset_ns = baset_s * NSEC_PER_SEC;
+	u32 fuzzt_ns;
+
+	if ((u32)~0U / NSEC_PER_MSEC < fuzzt_ms)
+		fuzzt_ns = (u32)~0U;
+	else
+		fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC;
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+}
+EXPORT_SYMBOL_GPL(torture_hrtimeout_s);
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 /*
-- 
cgit v1.2.3


From 8a67a20bf257ca378d6e5588fbe4382966395ac8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 25 Nov 2020 13:00:04 -0800
Subject: torture: Throttle VERBOSE_TOROUT_*() output

This commit adds kernel boot parameters torture.verbose_sleep_frequency
and torture.verbose_sleep_duration, which allow VERBOSE_TOROUT_*() output
to be throttled with periodic sleeps on large systems.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  8 ++++++++
 include/linux/torture.h                         | 15 +++++++++++++--
 kernel/torture.c                                | 20 ++++++++++++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3244f9e04a64..18f3aa759e54 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5337,6 +5337,14 @@
 			are running concurrently, especially on systems
 			with rotating-rust storage.
 
+	torture.verbose_sleep_frequency= [KNL]
+			Specifies how many verbose printk()s should be
+			emitted between each sleep.  The default of zero
+			disables verbose-printk() sleeping.
+
+	torture.verbose_sleep_duration= [KNL]
+			Duration of each verbose-printk() sleep in jiffies.
+
 	tp720=		[HW,PS2]
 
 	tpm_suspend_pcr=[HW,TPM]
diff --git a/include/linux/torture.h b/include/linux/torture.h
index 32941f8106b2..d62d13c8c69a 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -32,9 +32,20 @@
 #define TOROUT_STRING(s) \
 	pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s)
 #define VERBOSE_TOROUT_STRING(s) \
-	do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0)
+do {										\
+	if (verbose) {								\
+		verbose_torout_sleep();						\
+		pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s);		\
+	}									\
+} while (0)
 #define VERBOSE_TOROUT_ERRSTRING(s) \
-	do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0)
+do {										\
+	if (verbose) {								\
+		verbose_torout_sleep();						\
+		pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s);	\
+	}									\
+} while (0)
+void verbose_torout_sleep(void);
 
 /* Definitions for online/offline exerciser. */
 typedef void torture_ofl_func(void);
diff --git a/kernel/torture.c b/kernel/torture.c
index 7ad5c9681580..93eeeb2b3d88 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -48,6 +48,12 @@ module_param(disable_onoff_at_boot, bool, 0444);
 static bool ftrace_dump_at_shutdown;
 module_param(ftrace_dump_at_shutdown, bool, 0444);
 
+static int verbose_sleep_frequency;
+module_param(verbose_sleep_frequency, int, 0444);
+
+static int verbose_sleep_duration = 1;
+module_param(verbose_sleep_duration, int, 0444);
+
 static char *torture_type;
 static int verbose;
 
@@ -58,6 +64,20 @@ static int verbose;
 static int fullstop = FULLSTOP_RMMOD;
 static DEFINE_MUTEX(fullstop_mutex);
 
+static atomic_t verbose_sleep_counter;
+
+/*
+ * Sleep if needed from VERBOSE_TOROUT*().
+ */
+void verbose_torout_sleep(void)
+{
+	if (verbose_sleep_frequency > 0 &&
+	    verbose_sleep_duration > 0 &&
+	    !(atomic_inc_return(&verbose_sleep_counter) % verbose_sleep_frequency))
+		schedule_timeout_uninterruptible(verbose_sleep_duration);
+}
+EXPORT_SYMBOL_GPL(verbose_torout_sleep);
+
 /*
  * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit
  * nanosecond random fuzz.  This function and its friends desynchronize
-- 
cgit v1.2.3


From 1afb95fee0342b8d9e05b0433e8e44a6dfd7c4a3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sat, 19 Dec 2020 07:34:35 -0800
Subject: torture: Maintain torture-specific set of CPUs-online books

The TREE01 rcutorture scenario intentionally creates confusion as to the
number of available CPUs by specifying the "maxcpus=8 nr_cpus=43" kernel
boot parameters.  This can disable rcutorture's load shedding, which
currently uses num_online_cpus(), which would count the extra 35 CPUs.
However, the rcutorture guest OS will be provisioned with only 8 CPUs,
which means that rcutorture will present full load even when all but one
of the original 8 CPUs are offline.  This can result in spurious errors
due to extreme overloading of that single remaining CPU.

This commit therefore keeps a separate set of books on the number of
usable online CPUs, so that torture_num_online_cpus() is used for load
shedding instead of num_online_cpus().  Note that initial sizing must
use num_online_cpus() because torture_num_online_cpus() will return
NR_CPUS until shortly after torture_onoff_init() is invoked.

Reported-by: Frederic Weisbecker <frederic@kernel.org>
[ paulmck: Apply feedback from kernel test robot. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/torture.h |  5 +++++
 kernel/rcu/rcutorture.c |  4 ++--
 kernel/torture.c        | 16 ++++++++++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index d62d13c8c69a..0910c5803f35 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -48,6 +48,11 @@ do {										\
 void verbose_torout_sleep(void);
 
 /* Definitions for online/offline exerciser. */
+#ifdef CONFIG_HOTPLUG_CPU
+int torture_num_online_cpus(void);
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+static inline int torture_num_online_cpus(void) { return 1; }
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
 typedef void torture_ofl_func(void);
 bool torture_offline(int cpu, long *n_onl_attempts, long *n_onl_successes,
 		     unsigned long *sum_offl, int *min_onl, int *max_onl);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 76c838696366..a816df4e86e0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1338,7 +1338,7 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
 					struct torture_random_state *trsp)
 {
 	unsigned long loops;
-	int noc = num_online_cpus();
+	int noc = torture_num_online_cpus();
 	int rdrchked;
 	int rdrchker;
 	struct rcu_torture_reader_check *rtrcp; // Me.
@@ -1658,7 +1658,7 @@ rcu_torture_reader(void *arg)
 			torture_hrtimeout_us(500, 1000, &rand);
 			lastsleep = jiffies + 10;
 		}
-		while (num_online_cpus() < mynumonline && !torture_must_stop())
+		while (torture_num_online_cpus() < mynumonline && !torture_must_stop())
 			schedule_timeout_interruptible(HZ / 5);
 		stutter_wait("rcu_torture_reader");
 	} while (!torture_must_stop());
diff --git a/kernel/torture.c b/kernel/torture.c
index 507a20be6950..01e336f1e5b2 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -175,6 +175,19 @@ static unsigned long sum_online;
 static int min_online = -1;
 static int max_online;
 
+static int torture_online_cpus = NR_CPUS;
+
+/*
+ * Some torture testing leverages confusion as to the number of online
+ * CPUs.  This function returns the torture-testing view of this number,
+ * which allows torture tests to load-balance appropriately.
+ */
+int torture_num_online_cpus(void)
+{
+	return READ_ONCE(torture_online_cpus);
+}
+EXPORT_SYMBOL_GPL(torture_num_online_cpus);
+
 /*
  * Attempt to take a CPU offline.  Return false if the CPU is already
  * offline or if it is not subject to CPU-hotplug operations.  The
@@ -229,6 +242,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
 			*min_offl = delta;
 		if (*max_offl < delta)
 			*max_offl = delta;
+		WRITE_ONCE(torture_online_cpus, torture_online_cpus - 1);
+		WARN_ON_ONCE(torture_online_cpus <= 0);
 	}
 
 	return true;
@@ -285,6 +300,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes,
 			*min_onl = delta;
 		if (*max_onl < delta)
 			*max_onl = delta;
+		WRITE_ONCE(torture_online_cpus, torture_online_cpus + 1);
 	}
 
 	return true;
-- 
cgit v1.2.3


From 9326eecd9365a5b82220a4011592f7c0209566fc Mon Sep 17 00:00:00 2001
From: Xu Yilun <yilun.xu@intel.com>
Date: Wed, 6 Jan 2021 20:37:10 -0800
Subject: fpga: dfl: move dfl_device_id to mod_devicetable.h

In order to support MODULE_DEVICE_TABLE() for dfl device driver, this
patch moves struct dfl_device_id to mod_devicetable.h

Some brief description for DFL (Device Feature List) is added to make
the DFL known to the whole kernel.

Reviewed-by: Tom Rix <trix@redhat.com>
Acked-by: Wu Hao <hao.wu@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Wu Hao <hao.wu@intel.com>
Signed-off-by: Matthew Gerlach <matthew.gerlach@linux.intel.com>
Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
Link: https://lore.kernel.org/r/20210107043714.991646-5-mdf@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/fpga/dfl.h              | 13 +------------
 include/linux/mod_devicetable.h | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index ac373b1fcff9..549c7900dcfd 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -22,6 +22,7 @@
 #include <linux/interrupt.h>
 #include <linux/iopoll.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/mod_devicetable.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/uuid.h>
@@ -525,18 +526,6 @@ enum dfl_id_type {
 	DFL_ID_MAX,
 };
 
-/**
- * struct dfl_device_id -  dfl device identifier
- * @type: DFL FIU type of the device. See enum dfl_id_type.
- * @feature_id: feature identifier local to its DFL FIU type.
- * @driver_data: driver specific data.
- */
-struct dfl_device_id {
-	u16 type;
-	u16 feature_id;
-	unsigned long driver_data;
-};
-
 /**
  * struct dfl_device - represent an dfl device on dfl bus
  *
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index c425290b21e2..b8dae34eca10 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -846,4 +846,28 @@ struct auxiliary_device_id {
 	kernel_ulong_t driver_data;
 };
 
+/*
+ * DFL (Device Feature List)
+ *
+ * DFL defines a linked list of feature headers within the device MMIO space to
+ * provide an extensible way of adding features. Software can walk through these
+ * predefined data structures to enumerate features. It is now used in the FPGA.
+ * See Documentation/fpga/dfl.rst for more information.
+ *
+ * The dfl bus type is introduced to match the individual feature devices (dfl
+ * devices) for specific dfl drivers.
+ */
+
+/**
+ * struct dfl_device_id -  dfl device identifier
+ * @type: DFL FIU type of the device. See enum dfl_id_type.
+ * @feature_id: feature identifier local to its DFL FIU type.
+ * @driver_data: driver specific data.
+ */
+struct dfl_device_id {
+	__u16 type;
+	__u16 feature_id;
+	kernel_ulong_t driver_data;
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
-- 
cgit v1.2.3


From ecc1641aca658ed4140751748f84985ffb6cce28 Mon Sep 17 00:00:00 2001
From: Xu Yilun <yilun.xu@intel.com>
Date: Wed, 6 Jan 2021 20:37:12 -0800
Subject: fpga: dfl: move dfl bus related APIs to include/linux/dfl.h

Now the dfl drivers could be made as independent modules and put in
different folders according to their functionalities. In order for
scattered dfl device drivers to include dfl bus APIs, move the
dfl bus APIs to a new header file in the public folder.

[mdf@kernel.org: Fixed up header guards to match filename]

Reviewed-by: Tom Rix <trix@redhat.com>
Acked-by: Wu Hao <hao.wu@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
Link: https://lore.kernel.org/r/20210107043714.991646-7-mdf@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS         |  1 +
 drivers/fpga/dfl.c  |  1 +
 drivers/fpga/dfl.h  | 72 --------------------------------------------
 include/linux/dfl.h | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 88 insertions(+), 72 deletions(-)
 create mode 100644 include/linux/dfl.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 546aa66428c9..a044b58eb9ac 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6957,6 +6957,7 @@ S:	Maintained
 F:	Documentation/ABI/testing/sysfs-bus-dfl
 F:	Documentation/fpga/dfl.rst
 F:	drivers/fpga/dfl*
+F:	include/linux/dfl.h
 F:	include/uapi/linux/fpga-dfl.h
 
 FPGA MANAGER FRAMEWORK
diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c
index 5a6ba3b2fa05..511b20ff35a3 100644
--- a/drivers/fpga/dfl.c
+++ b/drivers/fpga/dfl.c
@@ -10,6 +10,7 @@
  *   Wu Hao <hao.wu@intel.com>
  *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
  */
+#include <linux/dfl.h>
 #include <linux/fpga-dfl.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
index 549c7900dcfd..2b82c96ba56c 100644
--- a/drivers/fpga/dfl.h
+++ b/drivers/fpga/dfl.h
@@ -517,76 +517,4 @@ long dfl_feature_ioctl_set_irq(struct platform_device *pdev,
 			       struct dfl_feature *feature,
 			       unsigned long arg);
 
-/**
- * enum dfl_id_type - define the DFL FIU types
- */
-enum dfl_id_type {
-	FME_ID = 0,
-	PORT_ID = 1,
-	DFL_ID_MAX,
-};
-
-/**
- * struct dfl_device - represent an dfl device on dfl bus
- *
- * @dev: generic device interface.
- * @id: id of the dfl device.
- * @type: type of DFL FIU of the device. See enum dfl_id_type.
- * @feature_id: feature identifier local to its DFL FIU type.
- * @mmio_res: mmio resource of this dfl device.
- * @irqs: list of Linux IRQ numbers of this dfl device.
- * @num_irqs: number of IRQs supported by this dfl device.
- * @cdev: pointer to DFL FPGA container device this dfl device belongs to.
- * @id_entry: matched id entry in dfl driver's id table.
- */
-struct dfl_device {
-	struct device dev;
-	int id;
-	u16 type;
-	u16 feature_id;
-	struct resource mmio_res;
-	int *irqs;
-	unsigned int num_irqs;
-	struct dfl_fpga_cdev *cdev;
-	const struct dfl_device_id *id_entry;
-};
-
-/**
- * struct dfl_driver - represent an dfl device driver
- *
- * @drv: driver model structure.
- * @id_table: pointer to table of device IDs the driver is interested in.
- *	      { } member terminated.
- * @probe: mandatory callback for device binding.
- * @remove: callback for device unbinding.
- */
-struct dfl_driver {
-	struct device_driver drv;
-	const struct dfl_device_id *id_table;
-
-	int (*probe)(struct dfl_device *dfl_dev);
-	void (*remove)(struct dfl_device *dfl_dev);
-};
-
-#define to_dfl_dev(d) container_of(d, struct dfl_device, dev)
-#define to_dfl_drv(d) container_of(d, struct dfl_driver, drv)
-
-/*
- * use a macro to avoid include chaining to get THIS_MODULE.
- */
-#define dfl_driver_register(drv) \
-	__dfl_driver_register(drv, THIS_MODULE)
-int __dfl_driver_register(struct dfl_driver *dfl_drv, struct module *owner);
-void dfl_driver_unregister(struct dfl_driver *dfl_drv);
-
-/*
- * module_dfl_driver() - Helper macro for drivers that don't do
- * anything special in module init/exit.  This eliminates a lot of
- * boilerplate.  Each module may only use this macro once, and
- * calling it replaces module_init() and module_exit().
- */
-#define module_dfl_driver(__dfl_driver) \
-	module_driver(__dfl_driver, dfl_driver_register, \
-		      dfl_driver_unregister)
-
 #endif /* __FPGA_DFL_H */
diff --git a/include/linux/dfl.h b/include/linux/dfl.h
new file mode 100644
index 000000000000..6cc10982351a
--- /dev/null
+++ b/include/linux/dfl.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Header file for DFL driver and device API
+ *
+ * Copyright (C) 2020 Intel Corporation, Inc.
+ */
+
+#ifndef __LINUX_DFL_H
+#define __LINUX_DFL_H
+
+#include <linux/device.h>
+#include <linux/mod_devicetable.h>
+
+/**
+ * enum dfl_id_type - define the DFL FIU types
+ */
+enum dfl_id_type {
+	FME_ID = 0,
+	PORT_ID = 1,
+	DFL_ID_MAX,
+};
+
+/**
+ * struct dfl_device - represent an dfl device on dfl bus
+ *
+ * @dev: generic device interface.
+ * @id: id of the dfl device.
+ * @type: type of DFL FIU of the device. See enum dfl_id_type.
+ * @feature_id: feature identifier local to its DFL FIU type.
+ * @mmio_res: mmio resource of this dfl device.
+ * @irqs: list of Linux IRQ numbers of this dfl device.
+ * @num_irqs: number of IRQs supported by this dfl device.
+ * @cdev: pointer to DFL FPGA container device this dfl device belongs to.
+ * @id_entry: matched id entry in dfl driver's id table.
+ */
+struct dfl_device {
+	struct device dev;
+	int id;
+	u16 type;
+	u16 feature_id;
+	struct resource mmio_res;
+	int *irqs;
+	unsigned int num_irqs;
+	struct dfl_fpga_cdev *cdev;
+	const struct dfl_device_id *id_entry;
+};
+
+/**
+ * struct dfl_driver - represent an dfl device driver
+ *
+ * @drv: driver model structure.
+ * @id_table: pointer to table of device IDs the driver is interested in.
+ *	      { } member terminated.
+ * @probe: mandatory callback for device binding.
+ * @remove: callback for device unbinding.
+ */
+struct dfl_driver {
+	struct device_driver drv;
+	const struct dfl_device_id *id_table;
+
+	int (*probe)(struct dfl_device *dfl_dev);
+	void (*remove)(struct dfl_device *dfl_dev);
+};
+
+#define to_dfl_dev(d) container_of(d, struct dfl_device, dev)
+#define to_dfl_drv(d) container_of(d, struct dfl_driver, drv)
+
+/*
+ * use a macro to avoid include chaining to get THIS_MODULE.
+ */
+#define dfl_driver_register(drv) \
+	__dfl_driver_register(drv, THIS_MODULE)
+int __dfl_driver_register(struct dfl_driver *dfl_drv, struct module *owner);
+void dfl_driver_unregister(struct dfl_driver *dfl_drv);
+
+/*
+ * module_dfl_driver() - Helper macro for drivers that don't do
+ * anything special in module init/exit.  This eliminates a lot of
+ * boilerplate.  Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dfl_driver(__dfl_driver) \
+	module_driver(__dfl_driver, dfl_driver_register, \
+		      dfl_driver_unregister)
+
+#endif /* __LINUX_DFL_H */
-- 
cgit v1.2.3


From 63f24a7fafd44899f001b8467b38fac5a534f63e Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 5 Jan 2021 13:02:28 +0100
Subject: vt: move set_leds to keyboard.c

set_leds and compute_shiftstate are called from a single place in vt.c.
Let's combine these two into vt_set_leds_compute_shiftstate. This allows
for making keyboard_tasklet local in the next patch.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210105120239.28031-1-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/keyboard.c | 11 ++++++++++-
 drivers/tty/vt/vt.c       |  3 +--
 include/linux/kbd_kern.h  |  8 +-------
 3 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 52922d21a49f..32ec4242b1f2 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -372,6 +372,12 @@ static void to_utf8(struct vc_data *vc, uint c)
 	}
 }
 
+/* FIXME: review locking for vt.c callers */
+static void set_leds(void)
+{
+	tasklet_schedule(&keyboard_tasklet);
+}
+
 /*
  * Called after returning from RAW mode or when changing consoles - recompute
  * shift_down[] and shift_state from key_down[] maybe called when keymap is
@@ -401,9 +407,12 @@ static void do_compute_shiftstate(void)
 }
 
 /* We still have to export this method to vt.c */
-void compute_shiftstate(void)
+void vt_set_leds_compute_shiftstate(void)
 {
 	unsigned long flags;
+
+	set_leds();
+
 	spin_lock_irqsave(&kbd_event_lock, flags);
 	do_compute_shiftstate();
 	spin_unlock_irqrestore(&kbd_event_lock, flags);
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index d04a162939a4..fe4fedbc0386 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -1036,8 +1036,7 @@ void redraw_screen(struct vc_data *vc, int is_switch)
 	}
 	set_cursor(vc);
 	if (is_switch) {
-		set_leds();
-		compute_shiftstate();
+		vt_set_leds_compute_shiftstate();
 		notify_update(vc);
 	}
 }
diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h
index 82f29aa35062..adf98004624b 100644
--- a/include/linux/kbd_kern.h
+++ b/include/linux/kbd_kern.h
@@ -71,12 +71,6 @@ extern void (*kbd_ledfunc)(unsigned int led);
 extern int set_console(int nr);
 extern void schedule_console_callback(void);
 
-/* FIXME: review locking for vt.c callers */
-static inline void set_leds(void)
-{
-	tasklet_schedule(&keyboard_tasklet);
-}
-
 static inline int vc_kbd_mode(struct kbd_struct * kbd, int flag)
 {
 	return ((kbd->modeflags >> flag) & 1);
@@ -135,7 +129,7 @@ static inline void chg_vc_kbd_led(struct kbd_struct * kbd, int flag)
 
 struct console;
 
-void compute_shiftstate(void);
+void vt_set_leds_compute_shiftstate(void);
 
 /* defkeymap.c */
 
-- 
cgit v1.2.3


From a18a9da82c57804ce5977ea0e01b72ee3f40a51b Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 5 Jan 2021 13:02:29 +0100
Subject: vt: keyboard, make keyboard_tasklet local

Now that the last extern user of the tasklet (set_leds) is in
keyboard.c, we can make keyboard_tasklet local to this unit too.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210105120239.28031-2-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/keyboard.c | 5 +++--
 include/linux/kbd_kern.h  | 2 --
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 32ec4242b1f2..9f2eaa104ebc 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -131,6 +131,9 @@ static const unsigned char max_vals[] = {
 
 static const int NR_TYPES = ARRAY_SIZE(max_vals);
 
+static void kbd_bh(unsigned long dummy);
+static DECLARE_TASKLET_DISABLED_OLD(keyboard_tasklet, kbd_bh);
+
 static struct input_handler kbd_handler;
 static DEFINE_SPINLOCK(kbd_event_lock);
 static DEFINE_SPINLOCK(led_lock);
@@ -1258,8 +1261,6 @@ static void kbd_bh(unsigned long dummy)
 	}
 }
 
-DECLARE_TASKLET_DISABLED_OLD(keyboard_tasklet, kbd_bh);
-
 #if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_ALPHA) ||\
     defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) ||\
     defined(CONFIG_PARISC) || defined(CONFIG_SUPERH) ||\
diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h
index adf98004624b..c40811d79769 100644
--- a/include/linux/kbd_kern.h
+++ b/include/linux/kbd_kern.h
@@ -6,8 +6,6 @@
 #include <linux/interrupt.h>
 #include <linux/keyboard.h>
 
-extern struct tasklet_struct keyboard_tasklet;
-
 extern char *func_table[MAX_NR_FUNC];
 
 /*
-- 
cgit v1.2.3


From ff2047fb755d4415ec3c70ac799889371151796d Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 5 Jan 2021 13:02:35 +0100
Subject: vt: drop old FONT ioctls

Drop support for these ioctls:
* PIO_FONT, PIO_FONTX
* GIO_FONT, GIO_FONTX
* PIO_FONTRESET

As was demonstrated by commit 90bfdeef83f1 (tty: make FONTX ioctl use
the tty pointer they were actually passed), these ioctls are not used
from userspace, as:
1) they used to be broken (set up font on current console, not the open
   one) and racy (before the commit above)
2) KDFONTOP ioctl is used for years instead

Note that PIO_FONTRESET is defunct on most systems as VGA_CONSOLE is set
on them for ages. That turns on BROKEN_GRAPHICS_PROGRAMS which makes
PIO_FONTRESET just return an error.

We are removing KD_FONT_FLAG_OLD here as it was used only by these
removed ioctls. kd.h header exists both in kernel and uapi headers, so
we can remove the kernel one completely. Everyone includeing kd.h will
now automatically get the uapi one.

There are now unused definitions of the ioctl numbers and "struct
consolefontdesc" in kd.h, but as it is a uapi header, I am not touching
these.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210105120239.28031-8-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/vt.c       |  39 +-----------
 drivers/tty/vt/vt_ioctl.c | 151 ----------------------------------------------
 include/linux/kd.h        |   8 ---
 3 files changed, 3 insertions(+), 195 deletions(-)
 delete mode 100644 include/linux/kd.h

(limited to 'include/linux')

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index fe4fedbc0386..284b07224c55 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -4583,16 +4583,8 @@ static int con_font_get(struct vc_data *vc, struct console_font_op *op)
 
 	if (op->data && font.charcount > op->charcount)
 		rc = -ENOSPC;
-	if (!(op->flags & KD_FONT_FLAG_OLD)) {
-		if (font.width > op->width || font.height > op->height) 
-			rc = -ENOSPC;
-	} else {
-		if (font.width != 8)
-			rc = -EIO;
-		else if ((op->height && font.height > op->height) ||
-			 font.height > 32)
-			rc = -ENOSPC;
-	}
+	if (font.width > op->width || font.height > op->height)
+		rc = -ENOSPC;
 	if (rc)
 		goto out;
 
@@ -4620,7 +4612,7 @@ static int con_font_set(struct vc_data *vc, struct console_font_op *op)
 		return -EINVAL;
 	if (op->charcount > 512)
 		return -EINVAL;
-	if (op->width <= 0 || op->width > 32 || op->height > 32)
+	if (op->width <= 0 || op->width > 32 || !op->height || op->height > 32)
 		return -EINVAL;
 	size = (op->width+7)/8 * 32 * op->charcount;
 	if (size > max_font_size)
@@ -4630,31 +4622,6 @@ static int con_font_set(struct vc_data *vc, struct console_font_op *op)
 	if (IS_ERR(font.data))
 		return PTR_ERR(font.data);
 
-	if (!op->height) {		/* Need to guess font height [compat] */
-		int h, i;
-		u8 *charmap = font.data;
-
-		/*
-		 * If from KDFONTOP ioctl, don't allow things which can be done
-		 * in userland,so that we can get rid of this soon
-		 */
-		if (!(op->flags & KD_FONT_FLAG_OLD)) {
-			kfree(font.data);
-			return -EINVAL;
-		}
-
-		for (h = 32; h > 0; h--)
-			for (i = 0; i < op->charcount; i++)
-				if (charmap[32*i+h-1])
-					goto nonzero;
-
-		kfree(font.data);
-		return -EINVAL;
-
-	nonzero:
-		op->height = h;
-	}
-
 	font.charcount = op->charcount;
 	font.width = op->width;
 	font.height = op->height;
diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c
index 3813c40f1b48..4a4cbd4a5f37 100644
--- a/drivers/tty/vt/vt_ioctl.c
+++ b/drivers/tty/vt/vt_ioctl.c
@@ -484,70 +484,6 @@ static int vt_k_ioctl(struct tty_struct *tty, unsigned int cmd,
 	return 0;
 }
 
-static inline int do_fontx_ioctl(struct vc_data *vc, int cmd,
-		struct consolefontdesc __user *user_cfd,
-		struct console_font_op *op)
-{
-	struct consolefontdesc cfdarg;
-	int i;
-
-	if (copy_from_user(&cfdarg, user_cfd, sizeof(struct consolefontdesc)))
-		return -EFAULT;
-
-	switch (cmd) {
-	case PIO_FONTX:
-		op->op = KD_FONT_OP_SET;
-		op->flags = KD_FONT_FLAG_OLD;
-		op->width = 8;
-		op->height = cfdarg.charheight;
-		op->charcount = cfdarg.charcount;
-		op->data = cfdarg.chardata;
-		return con_font_op(vc, op);
-
-	case GIO_FONTX:
-		op->op = KD_FONT_OP_GET;
-		op->flags = KD_FONT_FLAG_OLD;
-		op->width = 8;
-		op->height = cfdarg.charheight;
-		op->charcount = cfdarg.charcount;
-		op->data = cfdarg.chardata;
-		i = con_font_op(vc, op);
-		if (i)
-			return i;
-		cfdarg.charheight = op->height;
-		cfdarg.charcount = op->charcount;
-		if (copy_to_user(user_cfd, &cfdarg, sizeof(struct consolefontdesc)))
-			return -EFAULT;
-		return 0;
-	}
-	return -EINVAL;
-}
-
-static int vt_io_fontreset(struct vc_data *vc, struct console_font_op *op)
-{
-	int ret;
-
-	if (__is_defined(BROKEN_GRAPHICS_PROGRAMS)) {
-		/*
-		 * With BROKEN_GRAPHICS_PROGRAMS defined, the default font is
-		 * not saved.
-		 */
-		return -ENOSYS;
-	}
-
-	op->op = KD_FONT_OP_SET_DEFAULT;
-	op->data = NULL;
-	ret = con_font_op(vc, op);
-	if (ret)
-		return ret;
-
-	console_lock();
-	con_set_default_unimap(vc);
-	console_unlock();
-
-	return 0;
-}
-
 static inline int do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud,
 		bool perm, struct vc_data *vc)
 {
@@ -572,29 +508,7 @@ static inline int do_unimap_ioctl(int cmd, struct unimapdesc __user *user_ud,
 static int vt_io_ioctl(struct vc_data *vc, unsigned int cmd, void __user *up,
 		bool perm)
 {
-	struct console_font_op op;	/* used in multiple places here */
-
 	switch (cmd) {
-	case PIO_FONT:
-		if (!perm)
-			return -EPERM;
-		op.op = KD_FONT_OP_SET;
-		op.flags = KD_FONT_FLAG_OLD | KD_FONT_FLAG_DONT_RECALC;	/* Compatibility */
-		op.width = 8;
-		op.height = 0;
-		op.charcount = 256;
-		op.data = up;
-		return con_font_op(vc, &op);
-
-	case GIO_FONT:
-		op.op = KD_FONT_OP_GET;
-		op.flags = KD_FONT_FLAG_OLD;
-		op.width = 8;
-		op.height = 32;
-		op.charcount = 256;
-		op.data = up;
-		return con_font_op(vc, &op);
-
 	case PIO_CMAP:
 		if (!perm)
 			return -EPERM;
@@ -603,20 +517,6 @@ static int vt_io_ioctl(struct vc_data *vc, unsigned int cmd, void __user *up,
 	case GIO_CMAP:
 		return con_get_cmap(up);
 
-	case PIO_FONTX:
-		if (!perm)
-			return -EPERM;
-
-		fallthrough;
-	case GIO_FONTX:
-		return do_fontx_ioctl(vc, cmd, up, &op);
-
-	case PIO_FONTRESET:
-		if (!perm)
-			return -EPERM;
-
-		return vt_io_fontreset(vc, &op);
-
 	case PIO_SCRNMAP:
 		if (!perm)
 			return -EPERM;
@@ -1059,54 +959,6 @@ void vc_SAK(struct work_struct *work)
 
 #ifdef CONFIG_COMPAT
 
-struct compat_consolefontdesc {
-	unsigned short charcount;       /* characters in font (256 or 512) */
-	unsigned short charheight;      /* scan lines per character (1-32) */
-	compat_caddr_t chardata;	/* font data in expanded form */
-};
-
-static inline int
-compat_fontx_ioctl(struct vc_data *vc, int cmd,
-		   struct compat_consolefontdesc __user *user_cfd,
-		   int perm, struct console_font_op *op)
-{
-	struct compat_consolefontdesc cfdarg;
-	int i;
-
-	if (copy_from_user(&cfdarg, user_cfd, sizeof(struct compat_consolefontdesc)))
-		return -EFAULT;
-
-	switch (cmd) {
-	case PIO_FONTX:
-		if (!perm)
-			return -EPERM;
-		op->op = KD_FONT_OP_SET;
-		op->flags = KD_FONT_FLAG_OLD;
-		op->width = 8;
-		op->height = cfdarg.charheight;
-		op->charcount = cfdarg.charcount;
-		op->data = compat_ptr(cfdarg.chardata);
-		return con_font_op(vc, op);
-
-	case GIO_FONTX:
-		op->op = KD_FONT_OP_GET;
-		op->flags = KD_FONT_FLAG_OLD;
-		op->width = 8;
-		op->height = cfdarg.charheight;
-		op->charcount = cfdarg.charcount;
-		op->data = compat_ptr(cfdarg.chardata);
-		i = con_font_op(vc, op);
-		if (i)
-			return i;
-		cfdarg.charheight = op->height;
-		cfdarg.charcount = op->charcount;
-		if (copy_to_user(user_cfd, &cfdarg, sizeof(struct compat_consolefontdesc)))
-			return -EFAULT;
-		return 0;
-	}
-	return -EINVAL;
-}
-
 struct compat_console_font_op {
 	compat_uint_t op;        /* operation code KD_FONT_OP_* */
 	compat_uint_t flags;     /* KD_FONT_FLAG_* */
@@ -1183,9 +1035,6 @@ long vt_compat_ioctl(struct tty_struct *tty,
 	/*
 	 * these need special handlers for incompatible data structures
 	 */
-	case PIO_FONTX:
-	case GIO_FONTX:
-		return compat_fontx_ioctl(vc, cmd, up, perm, &op);
 
 	case KDFONTOP:
 		return compat_kdfontop_ioctl(up, perm, &op, vc);
diff --git a/include/linux/kd.h b/include/linux/kd.h
deleted file mode 100644
index b130a18f860f..000000000000
--- a/include/linux/kd.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_KD_H
-#define _LINUX_KD_H
-
-#include <uapi/linux/kd.h>
-
-#define KD_FONT_FLAG_OLD		0x80000000	/* Invoked via old interface [compat] */
-#endif /* _LINUX_KD_H */
-- 
cgit v1.2.3


From cac8a63063e33606c4b8419ef34ef7644d7c2fc4 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 5 Jan 2021 13:02:36 +0100
Subject: vgacon: drop BROKEN_GRAPHICS_PROGRAMS

BROKEN_GRAPHICS_PROGRAMS is defined when CONFIG_VGA_CONSOLE=y. And
vgacon.c is built exclusively in that case too. So the check for
BROKEN_GRAPHICS_PROGRAMS is pointless in vgacon.c as it is always true.
So remove the test and BROKEN_GRAPHICS_PROGRAMS completely.

This also eliminates the need for vga_font_is_default global as it is
only set and never read.

Cc: dri-devel@lists.freedesktop.org
Cc: linux-fbdev@vger.kernel.org
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210105120239.28031-9-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/video/console/vgacon.c | 19 -------------------
 include/linux/vt_kern.h        | 12 ------------
 2 files changed, 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 17876f0179b5..962c12be9774 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -90,7 +90,6 @@ static unsigned int	vga_video_num_lines;			/* Number of text lines */
 static bool		vga_can_do_color;			/* Do we support colors? */
 static unsigned int	vga_default_font_height __read_mostly;	/* Height of default screen font */
 static unsigned char	vga_video_type		__read_mostly;	/* Card type */
-static bool		vga_font_is_default = true;
 static int		vga_vesa_blanked;
 static bool 		vga_palette_blanked;
 static bool 		vga_is_gfx;
@@ -878,7 +877,6 @@ static int vgacon_do_font_op(struct vgastate *state, char *arg, int set,
 		beg = 0x0a;
 	}
 
-#ifdef BROKEN_GRAPHICS_PROGRAMS
 	/*
 	 * All fonts are loaded in slot 0 (0:1 for 512 ch)
 	 */
@@ -886,24 +884,7 @@ static int vgacon_do_font_op(struct vgastate *state, char *arg, int set,
 	if (!arg)
 		return -EINVAL;	/* Return to default font not supported */
 
-	vga_font_is_default = false;
 	font_select = ch512 ? 0x04 : 0x00;
-#else
-	/*
-	 * The default font is kept in slot 0 and is never touched.
-	 * A custom font is loaded in slot 2 (256 ch) or 2:3 (512 ch)
-	 */
-
-	if (set) {
-		vga_font_is_default = !arg;
-		if (!arg)
-			ch512 = false;	/* Default font is always 256 */
-		font_select = arg ? (ch512 ? 0x0e : 0x0a) : 0x00;
-	}
-
-	if (!vga_font_is_default)
-		charmap += 4 * cmapsz;
-#endif
 
 	raw_spin_lock_irq(&vga_lock);
 	/* First, the Sequencer */
diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index 349e39c3ab60..94e7a315479c 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -16,18 +16,6 @@
 #include <linux/consolemap.h>
 #include <linux/notifier.h>
 
-/*
- * Presently, a lot of graphics programs do not restore the contents of
- * the higher font pages.  Defining this flag will avoid use of them, but
- * will lose support for PIO_FONTRESET.  Note that many font operations are
- * not likely to work with these programs anyway; they need to be
- * fixed.  The linux/Documentation directory includes a code snippet
- * to save and restore the text font.
- */
-#ifdef CONFIG_VGA_CONSOLE
-#define BROKEN_GRAPHICS_PROGRAMS 1
-#endif
-
 void kd_mksound(unsigned int hz, unsigned int ticks);
 int kbd_rate(struct kbd_repeat *rep);
 
-- 
cgit v1.2.3


From 0bc1bd092af3c7c0b025ece93c3a86916f89f3ca Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 5 Jan 2021 13:02:38 +0100
Subject: tty_port: drop last traces of low_latency

The main purpose of tty_port::low_latency was removed in commit
a9c3f68f3cd8 (tty: Fix low_latency BUG) back in 2014. It was left in
place for drivers as an optional tune knob. But only one driver has been
using it until the previous commit. So remove this misconcept
completely, given there are no users.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210105120239.28031-11-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/networking/caif/caif.rst | 1 -
 drivers/char/pcmcia/synclink_cs.c      | 2 --
 drivers/net/caif/caif_serial.c         | 3 +--
 drivers/s390/char/con3215.c            | 1 -
 drivers/s390/char/sclp_tty.c           | 1 -
 drivers/s390/char/sclp_vt220.c         | 1 -
 drivers/s390/char/tty3270.c            | 2 --
 drivers/tty/amiserial.c                | 3 ---
 drivers/tty/hvc/hvcs.c                 | 2 +-
 drivers/tty/ipwireless/tty.c           | 1 -
 drivers/tty/mxser.c                    | 1 -
 drivers/tty/serial/ifx6x60.c           | 3 ---
 drivers/tty/serial/max3100.c           | 3 ---
 drivers/tty/serial/serial_core.c       | 3 ---
 drivers/tty/synclink_gt.c              | 1 -
 include/linux/tty.h                    | 3 +--
 16 files changed, 3 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/caif/caif.rst b/Documentation/networking/caif/caif.rst
index a07213030ccf..81a14373d780 100644
--- a/Documentation/networking/caif/caif.rst
+++ b/Documentation/networking/caif/caif.rst
@@ -68,7 +68,6 @@ There are debugfs parameters provided for serial communication.
 * tty_status: Prints the bit-mask tty status information
 
   - 0x01 - tty->warned is on.
-  - 0x02 - tty->low_latency is on.
   - 0x04 - tty->packed is on.
   - 0x08 - tty->flow_stopped is on.
   - 0x10 - tty->hw_stopped is on.
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index e342daa73d1b..2be8d9a8eec5 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -2494,8 +2494,6 @@ static int mgslpc_open(struct tty_struct *tty, struct file * filp)
 		printk("%s(%d):mgslpc_open(%s), old ref count = %d\n",
 			 __FILE__, __LINE__, tty->driver->name, port->count);
 
-	port->low_latency = (port->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
-
 	spin_lock_irqsave(&info->netlock, flags);
 	if (info->netcount) {
 		retval = -EBUSY;
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index bcc14c5875bf..8215cd77301f 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -89,8 +89,7 @@ static inline void update_tty_status(struct ser_device *ser)
 	ser->tty_status =
 		ser->tty->stopped << 5 |
 		ser->tty->flow_stopped << 3 |
-		ser->tty->packet << 2 |
-		ser->tty->port->low_latency << 1;
+		ser->tty->packet << 2;
 }
 static inline void debugfs_init(struct ser_device *ser, struct tty_struct *tty)
 {
diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c
index 1354c42d95aa..671efee612af 100644
--- a/drivers/s390/char/con3215.c
+++ b/drivers/s390/char/con3215.c
@@ -914,7 +914,6 @@ static int tty3215_open(struct tty_struct *tty, struct file * filp)
 
 	tty_port_tty_set(&raw->port, tty);
 
-	raw->port.low_latency = 0; /* don't use bottom half for pushing chars */
 	/*
 	 * Start up 3215 device
 	 */
diff --git a/drivers/s390/char/sclp_tty.c b/drivers/s390/char/sclp_tty.c
index 5aff8b684eb2..013bcc331305 100644
--- a/drivers/s390/char/sclp_tty.c
+++ b/drivers/s390/char/sclp_tty.c
@@ -65,7 +65,6 @@ sclp_tty_open(struct tty_struct *tty, struct file *filp)
 {
 	tty_port_tty_set(&sclp_port, tty);
 	tty->driver_data = NULL;
-	sclp_port.low_latency = 0;
 	return 0;
 }
 
diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c
index 3f9a6ef650fa..047f812d1a1c 100644
--- a/drivers/s390/char/sclp_vt220.c
+++ b/drivers/s390/char/sclp_vt220.c
@@ -560,7 +560,6 @@ sclp_vt220_open(struct tty_struct *tty, struct file *filp)
 {
 	if (tty->count == 1) {
 		tty_port_tty_set(&sclp_vt220_port, tty);
-		sclp_vt220_port.low_latency = 0;
 		if (!tty->winsize.ws_row && !tty->winsize.ws_col) {
 			tty->winsize.ws_row = 24;
 			tty->winsize.ws_col = 80;
diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c
index aec996de44d9..15692449a1c3 100644
--- a/drivers/s390/char/tty3270.c
+++ b/drivers/s390/char/tty3270.c
@@ -967,7 +967,6 @@ static int tty3270_install(struct tty_driver *driver, struct tty_struct *tty)
 		tty->driver_data = tp;
 		tty->winsize.ws_row = tp->view.rows - 2;
 		tty->winsize.ws_col = tp->view.cols;
-		tp->port.low_latency = 0;
 		tp->inattr = TF_INPUT;
 		goto port_install;
 	}
@@ -996,7 +995,6 @@ static int tty3270_install(struct tty_driver *driver, struct tty_struct *tty)
 		return rc;
 	}
 
-	tp->port.low_latency = 0;
 	tty->winsize.ws_row = tp->view.rows - 2;
 	tty->winsize.ws_col = tp->view.cols;
 
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index 13f63c01c589..18b78ea110ef 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -998,7 +998,6 @@ static int set_serial_info(struct tty_struct *tty, struct serial_struct *ss)
 	state->custom_divisor = ss->custom_divisor;
 	port->close_delay = ss->close_delay * HZ/100;
 	port->closing_wait = ss->closing_wait * HZ/100;
-	port->low_latency = (port->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
 
 check_and_exit:
 	if (tty_port_initialized(port)) {
@@ -1386,8 +1385,6 @@ static int rs_open(struct tty_struct *tty, struct file * filp)
 	tty->driver_data = info;
 	tty->port = port;
 
-	port->low_latency = (port->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
-
 	retval = startup(tty, info);
 	if (retval) {
 		return retval;
diff --git a/drivers/tty/hvc/hvcs.c b/drivers/tty/hvc/hvcs.c
index 509d1042825a..dfe02283ed23 100644
--- a/drivers/tty/hvc/hvcs.c
+++ b/drivers/tty/hvc/hvcs.c
@@ -605,7 +605,7 @@ static int hvcs_io(struct hvcs_struct *hvcsd)
 		hvcsd->todo_mask |= HVCS_QUICK_READ;
 
 	spin_unlock_irqrestore(&hvcsd->lock, flags);
-	/* This is synch because tty->low_latency == 1 */
+	/* This is synch -- FIXME :js: it is not! */
 	if(got)
 		tty_flip_buffer_push(&hvcsd->port);
 
diff --git a/drivers/tty/ipwireless/tty.c b/drivers/tty/ipwireless/tty.c
index 23584769fc29..6dacbc5e286c 100644
--- a/drivers/tty/ipwireless/tty.c
+++ b/drivers/tty/ipwireless/tty.c
@@ -101,7 +101,6 @@ static int ipw_open(struct tty_struct *linux_tty, struct file *filp)
 
 	tty->port.tty = linux_tty;
 	linux_tty->driver_data = tty;
-	tty->port.low_latency = 1;
 
 	if (tty->tty_type == TTYTYPE_MODEM)
 		ipwireless_ppp_open(tty->network);
diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c
index 3703987c4666..4203b64bccdb 100644
--- a/drivers/tty/mxser.c
+++ b/drivers/tty/mxser.c
@@ -1273,7 +1273,6 @@ static int mxser_set_serial_info(struct tty_struct *tty,
 				(ss->flags & ASYNC_FLAGS));
 		port->close_delay = ss->close_delay * HZ / 100;
 		port->closing_wait = ss->closing_wait * HZ / 100;
-		port->low_latency = (port->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
 		if ((port->flags & ASYNC_SPD_MASK) == ASYNC_SPD_CUST &&
 				(ss->baud_base != info->baud_base ||
 				ss->custom_divisor !=
diff --git a/drivers/tty/serial/ifx6x60.c b/drivers/tty/serial/ifx6x60.c
index 182e0ccd60b2..d4ef88ee22d0 100644
--- a/drivers/tty/serial/ifx6x60.c
+++ b/drivers/tty/serial/ifx6x60.c
@@ -565,9 +565,6 @@ static int ifx_port_activate(struct tty_port *port, struct tty_struct *tty)
 	/* put port data into this tty */
 	tty->driver_data = ifx_dev;
 
-	/* allows flip string push from int context */
-	port->low_latency = 1;
-
 	/* set flag to allows data transfer */
 	set_bit(IFX_SPI_STATE_IO_AVAILABLE, &ifx_dev->flags);
 
diff --git a/drivers/tty/serial/max3100.c b/drivers/tty/serial/max3100.c
index 371569a0fd00..3c92d4e01488 100644
--- a/drivers/tty/serial/max3100.c
+++ b/drivers/tty/serial/max3100.c
@@ -521,9 +521,6 @@ max3100_set_termios(struct uart_port *port, struct ktermios *termios,
 			MAX3100_STATUS_PE | MAX3100_STATUS_FE |
 			MAX3100_STATUS_OE;
 
-	/* we are sending char from a workqueue so enable */
-	s->port.state->port.low_latency = 1;
-
 	if (s->poll_time > 0)
 		del_timer_sync(&s->timer);
 
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 828f9ad1be49..7dacdb6a8534 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -975,7 +975,6 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port,
 	port->closing_wait    = closing_wait;
 	if (new_info->xmit_fifo_size)
 		uport->fifosize = new_info->xmit_fifo_size;
-	port->low_latency = (uport->flags & UPF_LOW_LATENCY) ? 1 : 0;
 
  check_and_exit:
 	retval = 0;
@@ -1795,8 +1794,6 @@ static int uart_port_activate(struct tty_port *port, struct tty_struct *tty)
 	if (!uport || uport->flags & UPF_DEAD)
 		return -ENXIO;
 
-	port->low_latency = (uport->flags & UPF_LOW_LATENCY) ? 1 : 0;
-
 	/*
 	 * Start up the serial port.
 	 */
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index c0b384e3ed4d..644173786bf0 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -672,7 +672,6 @@ static int open(struct tty_struct *tty, struct file *filp)
 	DBGINFO(("%s open, old ref count = %d\n", info->device_name, info->port.count));
 
 	mutex_lock(&info->port.mutex);
-	info->port.low_latency = (info->port.flags & ASYNC_LOW_LATENCY) ? 1 : 0;
 
 	spin_lock_irqsave(&info->netlock, flags);
 	if (info->netcount) {
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 12be8b16cdef..b57f6812b3ba 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -240,8 +240,7 @@ struct tty_port {
 	wait_queue_head_t	delta_msr_wait;	/* Modem status change */
 	unsigned long		flags;		/* User TTY flags ASYNC_ */
 	unsigned long		iflags;		/* Internal flags TTY_PORT_ */
-	unsigned char		console:1,	/* port is a console */
-				low_latency:1;	/* optional: tune for latency */
+	unsigned char		console:1;	/* port is a console */
 	struct mutex		mutex;		/* Locking */
 	struct mutex		buf_mutex;	/* Buffer alloc lock */
 	unsigned char		*xmit_buf;	/* Optional buffer */
-- 
cgit v1.2.3


From f446776ebffb9d3fb145b8d84f7f358f64cb54d6 Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard@mvista.com>
Date: Mon, 23 Nov 2020 18:49:01 -0600
Subject: tty: Export redirect release

This will be required by the pty code when it removes tty_vhangup() on
master close.

Signed-off-by: Corey Minyard <cminyard@mvista.com>
Link: https://lore.kernel.org/r/20201124004902.1398477-2-minyard@acm.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_io.c | 32 ++++++++++++++++++++++++--------
 include/linux/tty.h  |  1 +
 2 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 95ba028ef668..225f4933fbde 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -540,6 +540,28 @@ void tty_wakeup(struct tty_struct *tty)
 
 EXPORT_SYMBOL_GPL(tty_wakeup);
 
+/**
+ *	tty_release_redirect	-	Release a redirect on a pty if present
+ *	@tty: tty device
+ *
+ *	This is available to the pty code so if the master closes, if the
+ *	slave is a redirect it can release the redirect.
+ */
+struct file *tty_release_redirect(struct tty_struct *tty)
+{
+	struct file *f = NULL;
+
+	spin_lock(&redirect_lock);
+	if (redirect && file_tty(redirect) == tty) {
+		f = redirect;
+		redirect = NULL;
+	}
+	spin_unlock(&redirect_lock);
+
+	return f;
+}
+EXPORT_SYMBOL_GPL(tty_release_redirect);
+
 /**
  *	__tty_hangup		-	actual handler for hangup events
  *	@tty: tty device
@@ -566,7 +588,7 @@ EXPORT_SYMBOL_GPL(tty_wakeup);
 static void __tty_hangup(struct tty_struct *tty, int exit_session)
 {
 	struct file *cons_filp = NULL;
-	struct file *filp, *f = NULL;
+	struct file *filp, *f;
 	struct tty_file_private *priv;
 	int    closecount = 0, n;
 	int refs;
@@ -574,13 +596,7 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session)
 	if (!tty)
 		return;
 
-
-	spin_lock(&redirect_lock);
-	if (redirect && file_tty(redirect) == tty) {
-		f = redirect;
-		redirect = NULL;
-	}
-	spin_unlock(&redirect_lock);
+	f = tty_release_redirect(tty);
 
 	tty_lock(tty);
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index b57f6812b3ba..dd6ded6138f7 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -421,6 +421,7 @@ extern void tty_kclose(struct tty_struct *tty);
 extern int tty_dev_name_to_number(const char *name, dev_t *number);
 extern int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout);
 extern void tty_ldisc_unlock(struct tty_struct *tty);
+extern struct file *tty_release_redirect(struct tty_struct *tty);
 #else
 static inline void tty_kref_put(struct tty_struct *tty)
 { }
-- 
cgit v1.2.3


From 04f111130e9afa41c10d7bcec14e00e3be8b6977 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Thu, 10 Dec 2020 13:15:14 +0100
Subject: thermal/core: Remove notify ops

With the removal of the notifys user in a previous patches, the ops is no
longer needed, remove it.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20201210121514.25760-5-daniel.lezcano@linaro.org
---
 drivers/thermal/thermal_core.c | 3 ---
 include/linux/thermal.h        | 2 --
 2 files changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 4a291d205d5c..567bc6f254c0 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -412,9 +412,6 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
 
 	trace_thermal_zone_trip(tz, trip, trip_type);
 
-	if (tz->ops->notify)
-		tz->ops->notify(tz, trip, trip_type);
-
 	if (trip_type == THERMAL_TRIP_HOT && tz->ops->hot)
 		tz->ops->hot(tz);
 	else if (trip_type == THERMAL_TRIP_CRITICAL)
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 31b84404f047..c80032322158 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -77,8 +77,6 @@ struct thermal_zone_device_ops {
 	int (*set_emul_temp) (struct thermal_zone_device *, int);
 	int (*get_trend) (struct thermal_zone_device *, int,
 			  enum thermal_trend *);
-	int (*notify) (struct thermal_zone_device *, int,
-		       enum thermal_trip_type);
 	void (*hot)(struct thermal_zone_device *);
 	void (*critical)(struct thermal_zone_device *);
 };
-- 
cgit v1.2.3


From 4b9bbb29baf6c948d24eaeff892815b8a403b64c Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Thu, 17 Dec 2020 19:17:00 -0800
Subject: driver core: Add device link support for INFERRED flag

This flag can never be added to a device link that already exists and
doesn't have the flag set. It can only be added when a device link is
created for the first time or it can be maintained if the device link
already has the it set.

This flag will be used for marking device links created ONLY by
inferring dependencies from data and NOT from explicit action by device
drivers/frameworks.  This will be useful in the future when we need to
deal with cycles in dependencies inferred from firmware.

Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20201218031703.3053753-3-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 15 +++++++++++----
 include/linux/device.h |  2 ++
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index fe8601197b84..5827dbff7f21 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -229,7 +229,8 @@ int device_is_dependent(struct device *dev, void *target)
 		return ret;
 
 	list_for_each_entry(link, &dev->links.consumers, s_node) {
-		if (link->flags == (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
+		if ((link->flags & ~DL_FLAG_INFERRED) ==
+		    (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
 			continue;
 
 		if (link->consumer == target)
@@ -302,7 +303,8 @@ static int device_reorder_to_tail(struct device *dev, void *not_used)
 
 	device_for_each_child(dev, NULL, device_reorder_to_tail);
 	list_for_each_entry(link, &dev->links.consumers, s_node) {
-		if (link->flags == (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
+		if ((link->flags & ~DL_FLAG_INFERRED) ==
+		    (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
 			continue;
 		device_reorder_to_tail(link->consumer, NULL);
 	}
@@ -546,7 +548,8 @@ postcore_initcall(devlink_class_init);
 #define DL_MANAGED_LINK_FLAGS (DL_FLAG_AUTOREMOVE_CONSUMER | \
 			       DL_FLAG_AUTOREMOVE_SUPPLIER | \
 			       DL_FLAG_AUTOPROBE_CONSUMER  | \
-			       DL_FLAG_SYNC_STATE_ONLY)
+			       DL_FLAG_SYNC_STATE_ONLY | \
+			       DL_FLAG_INFERRED)
 
 #define DL_ADD_VALID_FLAGS (DL_MANAGED_LINK_FLAGS | DL_FLAG_STATELESS | \
 			    DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE)
@@ -615,7 +618,7 @@ struct device_link *device_link_add(struct device *consumer,
 	if (!consumer || !supplier || flags & ~DL_ADD_VALID_FLAGS ||
 	    (flags & DL_FLAG_STATELESS && flags & DL_MANAGED_LINK_FLAGS) ||
 	    (flags & DL_FLAG_SYNC_STATE_ONLY &&
-	     flags != DL_FLAG_SYNC_STATE_ONLY) ||
+	     (flags & ~DL_FLAG_INFERRED) != DL_FLAG_SYNC_STATE_ONLY) ||
 	    (flags & DL_FLAG_AUTOPROBE_CONSUMER &&
 	     flags & (DL_FLAG_AUTOREMOVE_CONSUMER |
 		      DL_FLAG_AUTOREMOVE_SUPPLIER)))
@@ -671,6 +674,10 @@ struct device_link *device_link_add(struct device *consumer,
 		if (link->consumer != consumer)
 			continue;
 
+		if (link->flags & DL_FLAG_INFERRED &&
+		    !(flags & DL_FLAG_INFERRED))
+			link->flags &= ~DL_FLAG_INFERRED;
+
 		if (flags & DL_FLAG_PM_RUNTIME) {
 			if (!(link->flags & DL_FLAG_PM_RUNTIME)) {
 				pm_runtime_new_link(consumer);
diff --git a/include/linux/device.h b/include/linux/device.h
index 89bb8b84173e..cb5eb2e58c25 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -323,6 +323,7 @@ enum device_link_state {
  * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds.
  * MANAGED: The core tracks presence of supplier/consumer drivers (internal).
  * SYNC_STATE_ONLY: Link only affects sync_state() behavior.
+ * INFERRED: Inferred from data (eg: firmware) and not from driver actions.
  */
 #define DL_FLAG_STATELESS		BIT(0)
 #define DL_FLAG_AUTOREMOVE_CONSUMER	BIT(1)
@@ -332,6 +333,7 @@ enum device_link_state {
 #define DL_FLAG_AUTOPROBE_CONSUMER	BIT(5)
 #define DL_FLAG_MANAGED			BIT(6)
 #define DL_FLAG_SYNC_STATE_ONLY		BIT(7)
+#define DL_FLAG_INFERRED		BIT(8)
 
 /**
  * enum dl_dev_state - Device driver presence tracking information.
-- 
cgit v1.2.3


From 30bfce109420912f201d4f295f9130ff44f04b41 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 6 Jan 2021 13:06:36 -0800
Subject: net: remove ndo_udp_tunnel_* callbacks

All UDP tunnel port management is now routed via udp_tunnel_nic
infra directly. Remove the old callbacks.

Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c            |  2 --
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c    |  2 --
 drivers/net/ethernet/broadcom/bnxt/bnxt.c           |  2 --
 drivers/net/ethernet/cavium/liquidio/lio_main.c     |  2 --
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c  |  2 --
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c     |  2 --
 drivers/net/ethernet/cisco/enic/enic_main.c         |  4 ----
 drivers/net/ethernet/emulex/benet/be_main.c         |  2 --
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c     |  2 --
 drivers/net/ethernet/intel/i40e/i40e_main.c         |  2 --
 drivers/net/ethernet/intel/ice/ice_main.c           |  2 --
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c       |  2 --
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c      |  4 ----
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   |  2 --
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c    |  2 --
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c |  2 --
 drivers/net/ethernet/qlogic/qede/qede_main.c        |  6 ------
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c    |  2 --
 drivers/net/ethernet/sfc/efx.c                      |  2 --
 drivers/net/netdevsim/netdev.c                      |  2 --
 include/linux/netdevice.h                           | 17 -----------------
 21 files changed, 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 2709a2db5657..99b6d5a9f1d9 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -2295,8 +2295,6 @@ static const struct net_device_ops xgbe_netdev_ops = {
 	.ndo_setup_tc		= xgbe_setup_tc,
 	.ndo_fix_features	= xgbe_fix_features,
 	.ndo_set_features	= xgbe_set_features,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= xgbe_features_check,
 };
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 28069b290862..b652ed72a621 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -13071,8 +13071,6 @@ static const struct net_device_ops bnx2x_netdev_ops = {
 	.ndo_get_phys_port_id	= bnx2x_get_phys_port_id,
 	.ndo_set_vf_link_state	= bnx2x_set_vf_link_state,
 	.ndo_features_check	= bnx2x_features_check,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 };
 
 static int bnx2x_set_coherency_mask(struct bnx2x *bp)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index bbd2a07dc329..d31a5ad7522a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -12091,8 +12091,6 @@ static const struct net_device_ops bnxt_netdev_ops = {
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= bnxt_rx_flow_steer,
 #endif
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_bpf		= bnxt_xdp,
 	.ndo_xdp_xmit		= bnxt_xdp_xmit,
 	.ndo_bridge_getlink	= bnxt_bridge_getlink,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 7d00d3a8ded4..7c5af4beedc6 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -3219,8 +3219,6 @@ static const struct net_device_ops lionetdevops = {
 	.ndo_do_ioctl		= liquidio_ioctl,
 	.ndo_fix_features	= liquidio_fix_features,
 	.ndo_set_features	= liquidio_set_features,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_set_vf_mac		= liquidio_set_vf_mac,
 	.ndo_set_vf_vlan	= liquidio_set_vf_vlan,
 	.ndo_get_vf_config	= liquidio_get_vf_config,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 103440f97bc8..516f166ceff8 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1879,8 +1879,6 @@ static const struct net_device_ops lionetdevops = {
 	.ndo_do_ioctl		= liquidio_ioctl,
 	.ndo_fix_features	= liquidio_fix_features,
 	.ndo_set_features	= liquidio_set_features,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 };
 
 static int lio_nic_info(struct octeon_recv_info *recv_info, void *buf)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 7fd264a6d085..15542661e3d2 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3882,8 +3882,6 @@ static const struct net_device_ops cxgb4_netdev_ops = {
 #endif /* CONFIG_CHELSIO_T4_FCOE */
 	.ndo_set_tx_maxrate   = cxgb_set_tx_maxrate,
 	.ndo_setup_tc         = cxgb_setup_tc,
-	.ndo_udp_tunnel_add   = udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del   = udp_tunnel_nic_del_port,
 	.ndo_features_check   = cxgb_features_check,
 	.ndo_fix_features     = cxgb_fix_features,
 };
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index fb269d587b74..f04ec53544ae 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -2509,8 +2509,6 @@ static const struct net_device_ops enic_netdev_dynamic_ops = {
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= enic_rx_flow_steer,
 #endif
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= enic_features_check,
 };
 
@@ -2535,8 +2533,6 @@ static const struct net_device_ops enic_netdev_ops = {
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= enic_rx_flow_steer,
 #endif
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= enic_features_check,
 };
 
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index d402d83d9edd..b6eba29d8e99 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -5179,8 +5179,6 @@ static const struct net_device_ops be_netdev_ops = {
 #endif
 	.ndo_bridge_setlink	= be_ndo_bridge_setlink,
 	.ndo_bridge_getlink	= be_ndo_bridge_getlink,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= be_features_check,
 	.ndo_get_phys_port_id   = be_get_phys_port_id,
 };
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 5c19ff452558..2fb52bd6fc0e 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1531,8 +1531,6 @@ static const struct net_device_ops fm10k_netdev_ops = {
 	.ndo_set_vf_rate	= fm10k_ndo_set_vf_bw,
 	.ndo_get_vf_config	= fm10k_ndo_get_vf_config,
 	.ndo_get_vf_stats	= fm10k_ndo_get_vf_stats,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_dfwd_add_station	= fm10k_dfwd_add_station,
 	.ndo_dfwd_del_station	= fm10k_dfwd_del_station,
 	.ndo_features_check	= fm10k_features_check,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 1db482d310c2..521ea9df38d5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -12804,8 +12804,6 @@ static const struct net_device_ops i40e_netdev_ops = {
 	.ndo_set_vf_link_state	= i40e_ndo_set_vf_link_state,
 	.ndo_set_vf_spoofchk	= i40e_ndo_set_vf_spoofchk,
 	.ndo_set_vf_trust	= i40e_ndo_set_vf_trust,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_get_phys_port_id	= i40e_get_phys_port_id,
 	.ndo_fdb_add		= i40e_ndo_fdb_add,
 	.ndo_features_check	= i40e_features_check,
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index c52b9bb0e3ab..6e251dfffc91 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -6790,6 +6790,4 @@ static const struct net_device_ops ice_netdev_ops = {
 	.ndo_bpf = ice_xdp,
 	.ndo_xdp_xmit = ice_xdp_xmit,
 	.ndo_xsk_wakeup = ice_xsk_wakeup,
-	.ndo_udp_tunnel_add = udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del = udp_tunnel_nic_del_port,
 };
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 393d1c2cd853..6cbbe09ce8a0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10278,8 +10278,6 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_bridge_getlink	= ixgbe_ndo_bridge_getlink,
 	.ndo_dfwd_add_station	= ixgbe_fwd_add,
 	.ndo_dfwd_del_station	= ixgbe_fwd_del,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= ixgbe_features_check,
 	.ndo_bpf		= ixgbe_xdp,
 	.ndo_xdp_xmit		= ixgbe_xdp_xmit,
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 32aad4d32b88..51b9700fce83 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2839,8 +2839,6 @@ static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
 #endif
 	.ndo_get_phys_port_id	= mlx4_en_get_phys_port_id,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= mlx4_en_features_check,
 	.ndo_set_tx_maxrate	= mlx4_en_set_tx_maxrate,
 	.ndo_bpf		= mlx4_xdp,
@@ -2873,8 +2871,6 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
 #endif
 	.ndo_get_phys_port_id	= mlx4_en_get_phys_port_id,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= mlx4_en_features_check,
 	.ndo_set_tx_maxrate	= mlx4_en_set_tx_maxrate,
 	.ndo_bpf		= mlx4_xdp,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7a79d330c075..f27f509ab028 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4621,8 +4621,6 @@ const struct net_device_ops mlx5e_netdev_ops = {
 	.ndo_change_mtu          = mlx5e_change_nic_mtu,
 	.ndo_do_ioctl            = mlx5e_ioctl,
 	.ndo_set_tx_maxrate      = mlx5e_set_tx_maxrate,
-	.ndo_udp_tunnel_add      = udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del      = udp_tunnel_nic_del_port,
 	.ndo_features_check      = mlx5e_features_check,
 	.ndo_tx_timeout          = mlx5e_tx_timeout,
 	.ndo_bpf		 = mlx5e_xdp,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 989c70c1eda3..cfa0e8552975 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -653,8 +653,6 @@ static const struct net_device_ops mlx5e_netdev_ops_uplink_rep = {
 	.ndo_has_offload_stats	 = mlx5e_rep_has_offload_stats,
 	.ndo_get_offload_stats	 = mlx5e_rep_get_offload_stats,
 	.ndo_change_mtu          = mlx5e_uplink_rep_change_mtu,
-	.ndo_udp_tunnel_add      = udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del      = udp_tunnel_nic_del_port,
 	.ndo_features_check      = mlx5e_features_check,
 	.ndo_set_vf_mac          = mlx5e_set_vf_mac,
 	.ndo_set_vf_rate         = mlx5e_set_vf_rate,
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index f21fb573ea3e..7ba8f4c7f26d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3656,8 +3656,6 @@ const struct net_device_ops nfp_net_netdev_ops = {
 	.ndo_set_features	= nfp_net_set_features,
 	.ndo_features_check	= nfp_net_features_check,
 	.ndo_get_phys_port_name	= nfp_net_get_phys_port_name,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_bpf		= nfp_net_xdp,
 	.ndo_get_devlink_port	= nfp_devlink_get_devlink_port,
 };
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 9cf960a6d007..4bf94797aac5 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -663,8 +663,6 @@ static const struct net_device_ops qede_netdev_ops = {
 	.ndo_get_vf_config	= qede_get_vf_config,
 	.ndo_set_vf_rate	= qede_set_vf_rate,
 #endif
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= qede_features_check,
 	.ndo_bpf		= qede_xdp,
 #ifdef CONFIG_RFS_ACCEL
@@ -688,8 +686,6 @@ static const struct net_device_ops qede_netdev_vf_ops = {
 	.ndo_fix_features	= qede_fix_features,
 	.ndo_set_features	= qede_set_features,
 	.ndo_get_stats64	= qede_get_stats64,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= qede_features_check,
 };
 
@@ -707,8 +703,6 @@ static const struct net_device_ops qede_netdev_vf_xdp_ops = {
 	.ndo_fix_features	= qede_fix_features,
 	.ndo_set_features	= qede_set_features,
 	.ndo_get_stats64	= qede_get_stats64,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= qede_features_check,
 	.ndo_bpf		= qede_xdp,
 	.ndo_xdp_xmit		= qede_xdp_transmit,
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index c2faf96fcade..96b947fde646 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -520,8 +520,6 @@ static const struct net_device_ops qlcnic_netdev_ops = {
 	.ndo_fdb_del		= qlcnic_fdb_del,
 	.ndo_fdb_dump		= qlcnic_fdb_dump,
 	.ndo_get_phys_port_id	= qlcnic_get_phys_port_id,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_features_check	= qlcnic_features_check,
 #ifdef CONFIG_QLCNIC_SRIOV
 	.ndo_set_vf_mac		= qlcnic_sriov_set_vf_mac,
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 718308076341..36c8625a6fd7 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -612,8 +612,6 @@ static const struct net_device_ops efx_netdev_ops = {
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= efx_filter_rfs,
 #endif
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_xdp_xmit		= efx_xdp_xmit,
 	.ndo_bpf		= efx_xdp
 };
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 7178468302c8..aec92440eef1 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -258,8 +258,6 @@ static const struct net_device_ops nsim_netdev_ops = {
 	.ndo_setup_tc		= nsim_setup_tc,
 	.ndo_set_features	= nsim_set_features,
 	.ndo_bpf		= nsim_bpf,
-	.ndo_udp_tunnel_add	= udp_tunnel_nic_add_port,
-	.ndo_udp_tunnel_del	= udp_tunnel_nic_del_port,
 	.ndo_get_devlink_port	= nsim_get_devlink_port,
 };
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 259be67644e3..1ec3ac5d5bbf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1213,19 +1213,6 @@ struct netdev_net_notifier {
  *				 struct netdev_phys_item_id *ppid)
  *	Called to get the parent ID of the physical port of this device.
  *
- * void (*ndo_udp_tunnel_add)(struct net_device *dev,
- *			      struct udp_tunnel_info *ti);
- *	Called by UDP tunnel to notify a driver about the UDP port and socket
- *	address family that a UDP tunnel is listnening to. It is called only
- *	when a new port starts listening. The operation is protected by the
- *	RTNL.
- *
- * void (*ndo_udp_tunnel_del)(struct net_device *dev,
- *			      struct udp_tunnel_info *ti);
- *	Called by UDP tunnel to notify the driver about a UDP port and socket
- *	address family that the UDP tunnel is not listening to anymore. The
- *	operation is protected by the RTNL.
- *
  * void* (*ndo_dfwd_add_station)(struct net_device *pdev,
  *				 struct net_device *dev)
  *	Called by upper layer devices to accelerate switching or other
@@ -1464,10 +1451,6 @@ struct net_device_ops {
 							  struct netdev_phys_item_id *ppid);
 	int			(*ndo_get_phys_port_name)(struct net_device *dev,
 							  char *name, size_t len);
-	void			(*ndo_udp_tunnel_add)(struct net_device *dev,
-						      struct udp_tunnel_info *ti);
-	void			(*ndo_udp_tunnel_del)(struct net_device *dev,
-						      struct udp_tunnel_info *ti);
 	void*			(*ndo_dfwd_add_station)(struct net_device *pdev,
 							struct net_device *dev);
 	void			(*ndo_dfwd_del_station)(struct net_device *pdev,
-- 
cgit v1.2.3


From 8b86850bf9ef259e194ce49c776aded3b8c281fb Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 6 Jan 2021 09:09:44 -0800
Subject: net: phy: bcm7xxx: Add an entry for BCM72116

BCM72116 features a 28nm integrated EPHY, add an entry to match this PHY
OUI.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20210106170944.1253046-1-f.fainelli@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/bcm7xxx.c | 2 ++
 include/linux/brcmphy.h   | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c
index 15812001b3ff..e79297a4bae8 100644
--- a/drivers/net/phy/bcm7xxx.c
+++ b/drivers/net/phy/bcm7xxx.c
@@ -612,6 +612,7 @@ static void bcm7xxx_28nm_remove(struct phy_device *phydev)
 
 static struct phy_driver bcm7xxx_driver[] = {
 	BCM7XXX_28NM_EPHY(PHY_ID_BCM72113, "Broadcom BCM72113"),
+	BCM7XXX_28NM_EPHY(PHY_ID_BCM72116, "Broadcom BCM72116"),
 	BCM7XXX_28NM_GPHY(PHY_ID_BCM7250, "Broadcom BCM7250"),
 	BCM7XXX_28NM_EPHY(PHY_ID_BCM7255, "Broadcom BCM7255"),
 	BCM7XXX_28NM_EPHY(PHY_ID_BCM7260, "Broadcom BCM7260"),
@@ -633,6 +634,7 @@ static struct phy_driver bcm7xxx_driver[] = {
 
 static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = {
 	{ PHY_ID_BCM72113, 0xfffffff0 },
+	{ PHY_ID_BCM72116, 0xfffffff0, },
 	{ PHY_ID_BCM7250, 0xfffffff0, },
 	{ PHY_ID_BCM7255, 0xfffffff0, },
 	{ PHY_ID_BCM7260, 0xfffffff0, },
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index d0bd226d6bd9..de9430d55c90 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -31,6 +31,7 @@
 #define PHY_ID_BCM89610			0x03625cd0
 
 #define PHY_ID_BCM72113			0x35905310
+#define PHY_ID_BCM72116			0x35905350
 #define PHY_ID_BCM7250			0xae025280
 #define PHY_ID_BCM7255			0xae025120
 #define PHY_ID_BCM7260			0xae025190
-- 
cgit v1.2.3


From f46b9b8ee89b52f6ee1f4da49d392e609746ab10 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 7 Jan 2021 03:24:00 +0200
Subject: net: dsa: move the Broadcom tag information in a separate header file

It is a bit strange to see something as specific as Broadcom SYSTEMPORT
bits in the main DSA include file. Move these away into a separate
header, and have the tagger and the SYSTEMPORT driver include them.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS                                |  1 +
 drivers/net/ethernet/broadcom/bcmsysport.c |  1 +
 include/linux/dsa/brcm.h                   | 16 ++++++++++++++++
 include/net/dsa.h                          |  6 ------
 net/dsa/tag_brcm.c                         |  1 +
 5 files changed, 19 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/dsa/brcm.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 7c1e45c416b1..3854aca806f5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3400,6 +3400,7 @@ L:	openwrt-devel@lists.openwrt.org (subscribers-only)
 S:	Supported
 F:	Documentation/devicetree/bindings/net/dsa/brcm,b53.yaml
 F:	drivers/net/dsa/b53/*
+F:	include/linux/dsa/brcm.h
 F:	include/linux/platform_data/b53.h
 
 BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index b1ae9eb8f247..9ef743d6ba67 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/dsa/brcm.h>
 #include <linux/etherdevice.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
diff --git a/include/linux/dsa/brcm.h b/include/linux/dsa/brcm.h
new file mode 100644
index 000000000000..47545a948784
--- /dev/null
+++ b/include/linux/dsa/brcm.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ * Copyright (C) 2014 Broadcom Corporation
+ */
+
+/* Included by drivers/net/ethernet/broadcom/bcmsysport.c and
+ * net/dsa/tag_brcm.c
+ */
+#ifndef _NET_DSA_BRCM_H
+#define _NET_DSA_BRCM_H
+
+/* Broadcom tag specific helpers to insert and extract queue/port number */
+#define BRCM_TAG_SET_PORT_QUEUE(p, q)	((p) << 8 | q)
+#define BRCM_TAG_GET_PORT(v)		((v) >> 8)
+#define BRCM_TAG_GET_QUEUE(v)		((v) & 0xff)
+
+#endif
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 6b74690bd8d4..e92a09c77f96 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -878,12 +878,6 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev,
 }
 #endif
 
-/* Broadcom tag specific helpers to insert and extract queue/port number */
-#define BRCM_TAG_SET_PORT_QUEUE(p, q)	((p) << 8 | q)
-#define BRCM_TAG_GET_PORT(v)		((v) >> 8)
-#define BRCM_TAG_GET_QUEUE(v)		((v) & 0xff)
-
-
 netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev);
 int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data);
 int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data);
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index e934dace3922..e2577a7dcbca 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2014 Broadcom Corporation
  */
 
+#include <linux/dsa/brcm.h>
 #include <linux/etherdevice.h>
 #include <linux/list.h>
 #include <linux/slab.h>
-- 
cgit v1.2.3


From 424f481f06dc7f1528447cc3224f5fd3c5e11f65 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:29 -0800
Subject: skbuff: remove unused skb_zcopy_abort function

skb_zcopy_abort() has no in-tree consumers, remove it.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 333bcdc39635..3ca8d7c7b30c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1490,17 +1490,6 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
 	}
 }
 
-/* Abort a zerocopy operation and revert zckey on error in send syscall */
-static inline void skb_zcopy_abort(struct sk_buff *skb)
-{
-	struct ubuf_info *uarg = skb_zcopy(skb);
-
-	if (uarg) {
-		sock_zerocopy_put_abort(uarg, false);
-		skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
-	}
-}
-
 static inline void skb_mark_not_on_list(struct sk_buff *skb)
 {
 	skb->next = NULL;
-- 
cgit v1.2.3


From 75518851a2a0c047def521aaf87db401de098352 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:31 -0800
Subject: skbuff: Push status and refcounts into sock_zerocopy_callback

Before this change, the caller of sock_zerocopy_callback would
need to save the zerocopy status, decrement and check the refcount,
and then call the callback function - the callback was only invoked
when the refcount reached zero.

Now, the caller just passes the status into the callback function,
which saves the status and handles its own refcounts.

This makes the behavior of the sock_zerocopy_callback identical
to the tpacket and vhost callbacks.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h |  3 ---
 net/core/skbuff.c      | 14 +++++++++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3ca8d7c7b30c..52e96c35f5af 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1479,9 +1479,6 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
 	if (uarg) {
 		if (skb_zcopy_is_nouarg(skb)) {
 			/* no notification callback */
-		} else if (uarg->callback == sock_zerocopy_callback) {
-			uarg->zerocopy = uarg->zerocopy && zerocopy;
-			sock_zerocopy_put(uarg);
 		} else {
 			uarg->callback(uarg, zerocopy);
 		}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d88963f47f7d..8c18940723ff 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1194,7 +1194,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
 	return true;
 }
 
-void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
+static void __sock_zerocopy_callback(struct ubuf_info *uarg)
 {
 	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
 	struct sock_exterr_skb *serr;
@@ -1222,7 +1222,7 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
 	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
 	serr->ee.ee_data = hi;
 	serr->ee.ee_info = lo;
-	if (!success)
+	if (!uarg->zerocopy)
 		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
 
 	q = &sk->sk_error_queue;
@@ -1241,11 +1241,19 @@ release:
 	consume_skb(skb);
 	sock_put(sk);
 }
+
+void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
+{
+	uarg->zerocopy = uarg->zerocopy & success;
+
+	if (refcount_dec_and_test(&uarg->refcnt))
+		__sock_zerocopy_callback(uarg);
+}
 EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
 
 void sock_zerocopy_put(struct ubuf_info *uarg)
 {
-	if (uarg && refcount_dec_and_test(&uarg->refcnt))
+	if (uarg)
 		uarg->callback(uarg, uarg->zerocopy);
 }
 EXPORT_SYMBOL_GPL(sock_zerocopy_put);
-- 
cgit v1.2.3


From 59776362b14b81dc45c6bbd4e7cb3871f390fc1b Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:32 -0800
Subject: skbuff: replace sock_zerocopy_put() with skb_zcopy_put()

Replace sock_zerocopy_put with the generic skb_zcopy_put()
function.  Pass 'true' as the success argument, as this
is identical to no change.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 7 ++++++-
 net/core/skbuff.c      | 9 +--------
 net/ipv4/tcp.c         | 2 +-
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 52e96c35f5af..a6c86839035b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -496,7 +496,6 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 	refcount_inc(&uarg->refcnt);
 }
 
-void sock_zerocopy_put(struct ubuf_info *uarg);
 void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
 void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
@@ -1471,6 +1470,12 @@ static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb)
 	return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL);
 }
 
+static inline void skb_zcopy_put(struct ubuf_info *uarg)
+{
+	if (uarg)
+		uarg->callback(uarg, true);
+}
+
 /* Release a reference on a zerocopy structure */
 static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8c18940723ff..0e028825367a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1251,13 +1251,6 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
 }
 EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
 
-void sock_zerocopy_put(struct ubuf_info *uarg)
-{
-	if (uarg)
-		uarg->callback(uarg, uarg->zerocopy);
-}
-EXPORT_SYMBOL_GPL(sock_zerocopy_put);
-
 void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 {
 	if (uarg) {
@@ -1267,7 +1260,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 		uarg->len--;
 
 		if (have_uref)
-			sock_zerocopy_put(uarg);
+			skb_zcopy_put(uarg);
 	}
 }
 EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ed42d2193c5c..298a1fae841c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1429,7 +1429,7 @@ out:
 		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 	}
 out_nopush:
-	sock_zerocopy_put(uarg);
+	skb_zcopy_put(uarg);
 	return copied + copied_syn;
 
 do_error:
-- 
cgit v1.2.3


From e76d46cfff8d2335f363f58bd7387de4059d871d Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:33 -0800
Subject: skbuff: replace sock_zerocopy_get with skb_zcopy_get

Rename the get routines for consistency.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 12 ++++++------
 net/core/skbuff.c      |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a6c86839035b..5b8a53ab51fd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -491,11 +491,6 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
 struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 					struct ubuf_info *uarg);
 
-static inline void sock_zerocopy_get(struct ubuf_info *uarg)
-{
-	refcount_inc(&uarg->refcnt);
-}
-
 void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
 void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
@@ -1441,6 +1436,11 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
+static inline void skb_zcopy_get(struct ubuf_info *uarg)
+{
+	refcount_inc(&uarg->refcnt);
+}
+
 static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
 				 bool *have_ref)
 {
@@ -1448,7 +1448,7 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
 		if (unlikely(have_ref && *have_ref))
 			*have_ref = false;
 		else
-			sock_zerocopy_get(uarg);
+			skb_zcopy_get(uarg);
 		skb_shinfo(skb)->destructor_arg = uarg;
 		skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
 	}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0e028825367a..00f195908e79 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1163,7 +1163,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 
 			/* no extra ref when appending to datagram (MSG_MORE) */
 			if (sk->sk_type == SOCK_STREAM)
-				sock_zerocopy_get(uarg);
+				skb_zcopy_get(uarg);
 
 			return uarg;
 		}
-- 
cgit v1.2.3


From 36177832f42d9c7b222ab039678398a9d4070fff Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:34 -0800
Subject: skbuff: Add skb parameter to the ubuf zerocopy callback

Add an optional skb parameter to the zerocopy callback parameter,
which is passed down from skb_zcopy_clear().  This gives access
to the original skb, which is needed for upcoming RX zero-copy
error handling.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/tap.c                 |  2 +-
 drivers/net/tun.c                 |  2 +-
 drivers/net/xen-netback/common.h  |  3 ++-
 drivers/net/xen-netback/netback.c |  5 +++--
 drivers/vhost/net.c               |  3 ++-
 include/linux/skbuff.h            | 17 ++++++++---------
 net/core/skbuff.c                 |  3 ++-
 7 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 1f4bdd94407a..3f51f3766d18 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -727,7 +727,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
 		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
 	} else if (msg_control) {
 		struct ubuf_info *uarg = msg_control;
-		uarg->callback(uarg, false);
+		uarg->callback(NULL, uarg, false);
 	}
 
 	if (tap) {
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 978ac0981d16..862c9063aa09 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1819,7 +1819,7 @@ drop:
 		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
 	} else if (msg_control) {
 		struct ubuf_info *uarg = msg_control;
-		uarg->callback(uarg, false);
+		uarg->callback(NULL, uarg, false);
 	}
 
 	skb_reset_network_header(skb);
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 8ee24e351bdc..4a16d6e33c09 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -399,7 +399,8 @@ void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb);
 void xenvif_carrier_on(struct xenvif *vif);
 
 /* Callback from stack when TX packet can be released */
-void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
+void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf,
+			      bool zerocopy_success);
 
 /* Unmap a pending page and release it back to the guest */
 void xenvif_idx_unmap(struct xenvif_queue *queue, u16 pending_idx);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index bc3421d14576..19a27dce79d2 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1091,7 +1091,7 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s
 	uarg = skb_shinfo(skb)->destructor_arg;
 	/* increase inflight counter to offset decrement in callback */
 	atomic_inc(&queue->inflight_packets);
-	uarg->callback(uarg, true);
+	uarg->callback(NULL, uarg, true);
 	skb_shinfo(skb)->destructor_arg = NULL;
 
 	/* Fill the skb with the new (local) frags. */
@@ -1228,7 +1228,8 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
 	return work_done;
 }
 
-void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
+void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf,
+			      bool zerocopy_success)
 {
 	unsigned long flags;
 	pending_ring_idx_t index;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c8784dfafdd7..f834a097895e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -381,7 +381,8 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
 	}
 }
 
-static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
+static void vhost_zerocopy_callback(struct sk_buff *skb,
+				    struct ubuf_info *ubuf, bool success)
 {
 	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
 	struct vhost_virtqueue *vq = ubufs->vq;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5b8a53ab51fd..b23c3b4b3209 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -461,7 +461,8 @@ enum {
  * The desc field is used to track userspace buffer index.
  */
 struct ubuf_info {
-	void (*callback)(struct ubuf_info *, bool zerocopy_success);
+	void (*callback)(struct sk_buff *, struct ubuf_info *,
+			 bool zerocopy_success);
 	union {
 		struct {
 			unsigned long desc;
@@ -493,7 +494,8 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 
 void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
-void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
+void sock_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
+			    bool success);
 
 int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
@@ -1473,20 +1475,17 @@ static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb)
 static inline void skb_zcopy_put(struct ubuf_info *uarg)
 {
 	if (uarg)
-		uarg->callback(uarg, true);
+		uarg->callback(NULL, uarg, true);
 }
 
 /* Release a reference on a zerocopy structure */
-static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
+static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 {
 	struct ubuf_info *uarg = skb_zcopy(skb);
 
 	if (uarg) {
-		if (skb_zcopy_is_nouarg(skb)) {
-			/* no notification callback */
-		} else {
-			uarg->callback(uarg, zerocopy);
-		}
+		if (!skb_zcopy_is_nouarg(skb))
+			uarg->callback(skb, uarg, zerocopy_success);
 
 		skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
 	}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 00f195908e79..89130b21d9f0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1242,7 +1242,8 @@ release:
 	sock_put(sk);
 }
 
-void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
+void sock_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
+			    bool success)
 {
 	uarg->zerocopy = uarg->zerocopy & success;
 
-- 
cgit v1.2.3


From 236a6b1cd585a408139550201343f3f16f9324b9 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:35 -0800
Subject: skbuff: Call sock_zerocopy_put_abort from skb_zcopy_put_abort

The sock_zerocopy_put_abort function contains logic which is
specific to the current zerocopy implementation.  Add a wrapper
which checks the callback and dispatches apppropriately.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 10 ++++++++++
 net/core/skbuff.c      | 12 +++++-------
 net/ipv4/ip_output.c   |  3 +--
 net/ipv4/tcp.c         |  2 +-
 net/ipv6/ip6_output.c  |  3 +--
 5 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b23c3b4b3209..9f7393167f0a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1478,6 +1478,16 @@ static inline void skb_zcopy_put(struct ubuf_info *uarg)
 		uarg->callback(NULL, uarg, true);
 }
 
+static inline void skb_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref)
+{
+	if (uarg) {
+		if (uarg->callback == sock_zerocopy_callback)
+			sock_zerocopy_put_abort(uarg, have_uref);
+		else if (have_uref)
+			skb_zcopy_put(uarg);
+	}
+}
+
 /* Release a reference on a zerocopy structure */
 static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 89130b21d9f0..5b9cd528d6a6 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1254,15 +1254,13 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
 
 void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 {
-	if (uarg) {
-		struct sock *sk = skb_from_uarg(uarg)->sk;
+	struct sock *sk = skb_from_uarg(uarg)->sk;
 
-		atomic_dec(&sk->sk_zckey);
-		uarg->len--;
+	atomic_dec(&sk->sk_zckey);
+	uarg->len--;
 
-		if (have_uref)
-			skb_zcopy_put(uarg);
-	}
+	if (have_uref)
+		sock_zerocopy_callback(NULL, uarg, true);
 }
 EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 89fff5f59eea..bae9b29e17a3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1230,8 +1230,7 @@ alloc_new_skb:
 error_efault:
 	err = -EFAULT;
 error:
-	if (uarg)
-		sock_zerocopy_put_abort(uarg, extra_uref);
+	skb_zcopy_put_abort(uarg, extra_uref);
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 298a1fae841c..fb58215972ba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1440,7 +1440,7 @@ do_fault:
 	if (copied + copied_syn)
 		goto out;
 out_err:
-	sock_zerocopy_put_abort(uarg, true);
+	skb_zcopy_put_abort(uarg, true);
 	err = sk_stream_error(sk, flags, err);
 	/* make sure we wake any epoll edge trigger waiter */
 	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 749ad72386b2..c8c87891533a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1715,8 +1715,7 @@ alloc_new_skb:
 error_efault:
 	err = -EFAULT;
 error:
-	if (uarg)
-		sock_zerocopy_put_abort(uarg, extra_uref);
+	skb_zcopy_put_abort(uarg, extra_uref);
 	cork->length -= length;
 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
-- 
cgit v1.2.3


From 8c793822c5803e01d03f71c431f59316f0b278b7 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:37 -0800
Subject: skbuff: rename sock_zerocopy_* to msg_zerocopy_*

At Willem's suggestion, rename the sock_zerocopy_* functions
so that they match the MSG_ZEROCOPY flag, which makes it clear
they are specific to this zerocopy implementation.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 18 +++++++++---------
 net/core/skbuff.c      | 30 +++++++++++++++---------------
 net/ipv4/ip_output.c   |  2 +-
 net/ipv4/tcp.c         |  2 +-
 net/ipv6/ip6_output.c  |  2 +-
 5 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9f7393167f0a..f6a3cee5c967 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -488,14 +488,14 @@ struct ubuf_info {
 int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
 void mm_unaccount_pinned_pages(struct mmpin *mmp);
 
-struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
-struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
-					struct ubuf_info *uarg);
+struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size);
+struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
+				       struct ubuf_info *uarg);
 
-void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
+void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
-void sock_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
-			    bool success);
+void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
+			   bool success);
 
 int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
@@ -1481,8 +1481,8 @@ static inline void skb_zcopy_put(struct ubuf_info *uarg)
 static inline void skb_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 {
 	if (uarg) {
-		if (uarg->callback == sock_zerocopy_callback)
-			sock_zerocopy_put_abort(uarg, have_uref);
+		if (uarg->callback == msg_zerocopy_callback)
+			msg_zerocopy_put_abort(uarg, have_uref);
 		else if (have_uref)
 			skb_zcopy_put(uarg);
 	}
@@ -2776,7 +2776,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
 	if (likely(!skb_zcopy(skb)))
 		return 0;
 	if (!skb_zcopy_is_nouarg(skb) &&
-	    skb_uarg(skb)->callback == sock_zerocopy_callback)
+	    skb_uarg(skb)->callback == msg_zerocopy_callback)
 		return 0;
 	return skb_copy_ubufs(skb, gfp_mask);
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6d031ed99182..bcd56763952e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1094,7 +1094,7 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
 }
 EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
 
-struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
+struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
 {
 	struct ubuf_info *uarg;
 	struct sk_buff *skb;
@@ -1114,7 +1114,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 		return NULL;
 	}
 
-	uarg->callback = sock_zerocopy_callback;
+	uarg->callback = msg_zerocopy_callback;
 	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
 	uarg->len = 1;
 	uarg->bytelen = size;
@@ -1124,15 +1124,15 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 
 	return uarg;
 }
-EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
+EXPORT_SYMBOL_GPL(msg_zerocopy_alloc);
 
 static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
 {
 	return container_of((void *)uarg, struct sk_buff, cb);
 }
 
-struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
-					struct ubuf_info *uarg)
+struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
+				       struct ubuf_info *uarg)
 {
 	if (uarg) {
 		const u32 byte_limit = 1 << 19;		/* limit to a few TSO */
@@ -1171,9 +1171,9 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 	}
 
 new_alloc:
-	return sock_zerocopy_alloc(sk, size);
+	return msg_zerocopy_alloc(sk, size);
 }
-EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
 
 static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
 {
@@ -1195,7 +1195,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
 	return true;
 }
 
-static void __sock_zerocopy_callback(struct ubuf_info *uarg)
+static void __msg_zerocopy_callback(struct ubuf_info *uarg)
 {
 	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
 	struct sock_exterr_skb *serr;
@@ -1243,17 +1243,17 @@ release:
 	sock_put(sk);
 }
 
-void sock_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
-			    bool success)
+void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
+			   bool success)
 {
 	uarg->zerocopy = uarg->zerocopy & success;
 
 	if (refcount_dec_and_test(&uarg->refcnt))
-		__sock_zerocopy_callback(uarg);
+		__msg_zerocopy_callback(uarg);
 }
-EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
+EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
 
-void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
+void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 {
 	struct sock *sk = skb_from_uarg(uarg)->sk;
 
@@ -1261,9 +1261,9 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 	uarg->len--;
 
 	if (have_uref)
-		sock_zerocopy_callback(NULL, uarg, true);
+		msg_zerocopy_callback(NULL, uarg, true);
 }
-EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
+EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
 
 int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
 {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index bae9b29e17a3..ffee03729285 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1018,7 +1018,7 @@ static int __ip_append_data(struct sock *sk,
 		csummode = CHECKSUM_PARTIAL;
 
 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
-		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
 		if (!uarg)
 			return -ENOBUFS;
 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fb58215972ba..2882d520f5b1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1217,7 +1217,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 
 	if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
 		skb = tcp_write_queue_tail(sk);
-		uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
+		uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
 		if (!uarg) {
 			err = -ENOBUFS;
 			goto out_err;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c8c87891533a..f59cfa39686a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1471,7 +1471,7 @@ emsgsize:
 		csummode = CHECKSUM_PARTIAL;
 
 	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
-		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
 		if (!uarg)
 			return -ENOBUFS;
 		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
-- 
cgit v1.2.3


From 06b4feb37e64e543714c971a4162a75e2e4024d4 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:38 -0800
Subject: net: group skb_shinfo zerocopy related bits together.

In preparation for expanded zerocopy (TX and RX), move
the zerocopy related bits out of tx_flags into their own
flag word.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/tap.c                   |  3 +--
 drivers/net/tun.c                   |  3 +--
 drivers/net/xen-netback/interface.c |  4 ++--
 include/linux/skbuff.h              | 38 ++++++++++++++++++++-----------------
 net/core/skbuff.c                   |  9 ++++-----
 net/ipv4/tcp.c                      |  2 +-
 net/kcm/kcmsock.c                   |  4 ++--
 7 files changed, 32 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 3f51f3766d18..f7a19d9b7c27 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -723,8 +723,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
 	/* copy skb_ubuf_info for callback when skb has no error */
 	if (zerocopy) {
 		skb_shinfo(skb)->destructor_arg = msg_control;
-		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
-		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+		skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
 	} else if (msg_control) {
 		struct ubuf_info *uarg = msg_control;
 		uarg->callback(NULL, uarg, false);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 862c9063aa09..d8f1c3d34612 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1815,8 +1815,7 @@ drop:
 	/* copy skb_ubuf_info for callback when skb has no error */
 	if (zerocopy) {
 		skb_shinfo(skb)->destructor_arg = msg_control;
-		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
-		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+		skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
 	} else if (msg_control) {
 		struct ubuf_info *uarg = msg_control;
 		uarg->callback(NULL, uarg, false);
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index acb786d8b1d8..08b0e3d0b7eb 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -47,7 +47,7 @@
 /* Number of bytes allowed on the internal guest Rx queue. */
 #define XENVIF_RX_QUEUE_BYTES (XEN_NETIF_RX_RING_SIZE/2 * PAGE_SIZE)
 
-/* This function is used to set SKBTX_DEV_ZEROCOPY as well as
+/* This function is used to set SKBFL_ZEROCOPY_ENABLE as well as
  * increasing the inflight counter. We need to increase the inflight
  * counter because core driver calls into xenvif_zerocopy_callback
  * which calls xenvif_skb_zerocopy_complete.
@@ -55,7 +55,7 @@
 void xenvif_skb_zerocopy_prepare(struct xenvif_queue *queue,
 				 struct sk_buff *skb)
 {
-	skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+	skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_ENABLE;
 	atomic_inc(&queue->inflight_packets);
 }
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f6a3cee5c967..7b5adc511ac6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -430,28 +430,32 @@ enum {
 	/* device driver is going to provide hardware time stamp */
 	SKBTX_IN_PROGRESS = 1 << 2,
 
-	/* device driver supports TX zero-copy buffers */
-	SKBTX_DEV_ZEROCOPY = 1 << 3,
-
 	/* generate wifi status information (where possible) */
 	SKBTX_WIFI_STATUS = 1 << 4,
 
-	/* This indicates at least one fragment might be overwritten
-	 * (as in vmsplice(), sendfile() ...)
-	 * If we need to compute a TX checksum, we'll need to copy
-	 * all frags to avoid possible bad checksum
-	 */
-	SKBTX_SHARED_FRAG = 1 << 5,
-
 	/* generate software time stamp when entering packet scheduling */
 	SKBTX_SCHED_TSTAMP = 1 << 6,
 };
 
-#define SKBTX_ZEROCOPY_FRAG	(SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG)
 #define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP    | \
 				 SKBTX_SCHED_TSTAMP)
 #define SKBTX_ANY_TSTAMP	(SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
 
+/* Definitions for flags in struct skb_shared_info */
+enum {
+	/* use zcopy routines */
+	SKBFL_ZEROCOPY_ENABLE = BIT(0),
+
+	/* This indicates at least one fragment might be overwritten
+	 * (as in vmsplice(), sendfile() ...)
+	 * If we need to compute a TX checksum, we'll need to copy
+	 * all frags to avoid possible bad checksum
+	 */
+	SKBFL_SHARED_FRAG = BIT(1),
+};
+
+#define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
+
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
  * lower device, the skb last reference should be 0 when calling this.
@@ -506,7 +510,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
  * the end of the header data, ie. at skb->end.
  */
 struct skb_shared_info {
-	__u8		__unused;
+	__u8		flags;
 	__u8		meta_len;
 	__u8		nr_frags;
 	__u8		tx_flags;
@@ -1433,7 +1437,7 @@ static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
 
 static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 {
-	bool is_zcopy = skb && skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY;
+	bool is_zcopy = skb && skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE;
 
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
@@ -1452,14 +1456,14 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
 		else
 			skb_zcopy_get(uarg);
 		skb_shinfo(skb)->destructor_arg = uarg;
-		skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+		skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
 	}
 }
 
 static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
 {
 	skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL);
-	skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+	skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
 }
 
 static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb)
@@ -1497,7 +1501,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 		if (!skb_zcopy_is_nouarg(skb))
 			uarg->callback(skb, uarg, zerocopy_success);
 
-		skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
+		skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG;
 	}
 }
 
@@ -3323,7 +3327,7 @@ static inline int skb_linearize(struct sk_buff *skb)
 static inline bool skb_has_shared_frag(const struct sk_buff *skb)
 {
 	return skb_is_nonlinear(skb) &&
-	       skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
+	       skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
 }
 
 /**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bcd56763952e..5b9e52cbd087 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1327,7 +1327,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
  *	@skb: the skb to modify
  *	@gfp_mask: allocation priority
  *
- *	This must be called on SKBTX_DEV_ZEROCOPY skb.
+ *	This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
  *	It will copy all frags into kernel and drop the reference
  *	to userspace pages.
  *
@@ -3264,8 +3264,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 {
 	int pos = skb_headlen(skb);
 
-	skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
-				      SKBTX_SHARED_FRAG;
+	skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
 	skb_zerocopy_clone(skb1, skb, 0);
 	if (len < pos)	/* Split line is inside header. */
 		skb_split_inside_header(skb, skb1, len, pos);
@@ -3954,8 +3953,8 @@ normal:
 		skb_copy_from_linear_data_offset(head_skb, offset,
 						 skb_put(nskb, hsize), hsize);
 
-		skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
-					      SKBTX_SHARED_FRAG;
+		skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
+					   SKBFL_SHARED_FRAG;
 
 		if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
 		    skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2882d520f5b1..1954190b33c7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1010,7 +1010,7 @@ new_segment:
 	}
 
 	if (!(flags & MSG_NO_SHARED_FRAGS))
-		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+		skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
 
 	skb->len += copy;
 	skb->data_len += copy;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index a9eb616f5521..d0b56ffbb057 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -786,7 +786,7 @@ static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
 
 		if (skb_can_coalesce(skb, i, page, offset)) {
 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
-			skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+			skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
 			goto coalesced;
 		}
 
@@ -834,7 +834,7 @@ static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
 
 	get_page(page);
 	skb_fill_page_desc(skb, i, page, offset, size);
-	skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+	skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
 
 coalesced:
 	skb->len += size;
-- 
cgit v1.2.3


From 04c2d33eabdc76df730e1e356f6b2c656381d7d5 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:39 -0800
Subject: skbuff: add flags to ubuf_info for ubuf setup

Currently, when an ubuf is attached to a new skb, the shared
flags word is initialized to a fixed value.  Instead of doing
this, set the default flags in the ubuf, and have new skbs
inherit from this default.

This is needed when setting up different zerocopy types.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 3 ++-
 net/core/skbuff.c      | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7b5adc511ac6..a7bc71d2debb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -480,6 +480,7 @@ struct ubuf_info {
 		};
 	};
 	refcount_t refcnt;
+	u8 flags;
 
 	struct mmpin {
 		struct user_struct *user;
@@ -1456,7 +1457,7 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
 		else
 			skb_zcopy_get(uarg);
 		skb_shinfo(skb)->destructor_arg = uarg;
-		skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
+		skb_shinfo(skb)->flags |= uarg->flags;
 	}
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5b9e52cbd087..a7bcbbff99e3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1119,6 +1119,7 @@ struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg->len = 1;
 	uarg->bytelen = size;
 	uarg->zerocopy = 1;
+	uarg->flags = SKBFL_ZEROCOPY_FRAG;
 	refcount_set(&uarg->refcnt, 1);
 	sock_hold(sk);
 
-- 
cgit v1.2.3


From 9ee5e5ade033875191a2d2e470033e9cdde44a6a Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:40 -0800
Subject: tap/tun: add skb_zcopy_init() helper for initialization.

Replace direct assignments with skb_zcopy_init() for zerocopy
cases where a new skb is initialized, without changing the
reference counts.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/tap.c      | 3 +--
 drivers/net/tun.c      | 3 +--
 drivers/vhost/net.c    | 1 +
 include/linux/skbuff.h | 9 +++++++--
 4 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index f7a19d9b7c27..3c652c8ac5ba 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -722,8 +722,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
 	tap = rcu_dereference(q->tap);
 	/* copy skb_ubuf_info for callback when skb has no error */
 	if (zerocopy) {
-		skb_shinfo(skb)->destructor_arg = msg_control;
-		skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
+		skb_zcopy_init(skb, msg_control);
 	} else if (msg_control) {
 		struct ubuf_info *uarg = msg_control;
 		uarg->callback(NULL, uarg, false);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d8f1c3d34612..02a93cfdb6b1 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1814,8 +1814,7 @@ drop:
 
 	/* copy skb_ubuf_info for callback when skb has no error */
 	if (zerocopy) {
-		skb_shinfo(skb)->destructor_arg = msg_control;
-		skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
+		skb_zcopy_init(skb, msg_control);
 	} else if (msg_control) {
 		struct ubuf_info *uarg = msg_control;
 		uarg->callback(NULL, uarg, false);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f834a097895e..3b744031ec8f 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -903,6 +903,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
+			ubuf->flags = SKBFL_ZEROCOPY_FRAG;
 			refcount_set(&ubuf->refcnt, 1);
 			msg.msg_control = &ctl;
 			ctl.type = TUN_MSG_UBUF;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a7bc71d2debb..68ad45a98810 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1448,6 +1448,12 @@ static inline void skb_zcopy_get(struct ubuf_info *uarg)
 	refcount_inc(&uarg->refcnt);
 }
 
+static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg)
+{
+	skb_shinfo(skb)->destructor_arg = uarg;
+	skb_shinfo(skb)->flags |= uarg->flags;
+}
+
 static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
 				 bool *have_ref)
 {
@@ -1456,8 +1462,7 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
 			*have_ref = false;
 		else
 			skb_zcopy_get(uarg);
-		skb_shinfo(skb)->destructor_arg = uarg;
-		skb_shinfo(skb)->flags |= uarg->flags;
+		skb_zcopy_init(skb, uarg);
 	}
 }
 
-- 
cgit v1.2.3


From 8e0449172497a915e79da66b222255dc9b4a5f31 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Wed, 6 Jan 2021 14:18:41 -0800
Subject: skbuff: Rename skb_zcopy_{get|put} to net_zcopy_{get|put}

Unlike the rest of the skb_zcopy_ functions, these routines
operate on a 'struct ubuf', not a skb.  Remove the 'skb_'
prefix from the naming to make things clearer.

Suggested-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 10 +++++-----
 net/core/skbuff.c      |  2 +-
 net/ipv4/ip_output.c   |  2 +-
 net/ipv4/tcp.c         |  4 ++--
 net/ipv6/ip6_output.c  |  2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 68ad45a98810..7a057b1f1eb8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1443,7 +1443,7 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
-static inline void skb_zcopy_get(struct ubuf_info *uarg)
+static inline void net_zcopy_get(struct ubuf_info *uarg)
 {
 	refcount_inc(&uarg->refcnt);
 }
@@ -1461,7 +1461,7 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
 		if (unlikely(have_ref && *have_ref))
 			*have_ref = false;
 		else
-			skb_zcopy_get(uarg);
+			net_zcopy_get(uarg);
 		skb_zcopy_init(skb, uarg);
 	}
 }
@@ -1482,19 +1482,19 @@ static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb)
 	return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL);
 }
 
-static inline void skb_zcopy_put(struct ubuf_info *uarg)
+static inline void net_zcopy_put(struct ubuf_info *uarg)
 {
 	if (uarg)
 		uarg->callback(NULL, uarg, true);
 }
 
-static inline void skb_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref)
+static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 {
 	if (uarg) {
 		if (uarg->callback == msg_zerocopy_callback)
 			msg_zerocopy_put_abort(uarg, have_uref);
 		else if (have_uref)
-			skb_zcopy_put(uarg);
+			net_zcopy_put(uarg);
 	}
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a7bcbbff99e3..7626a33cce59 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1165,7 +1165,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
 
 			/* no extra ref when appending to datagram (MSG_MORE) */
 			if (sk->sk_type == SOCK_STREAM)
-				skb_zcopy_get(uarg);
+				net_zcopy_get(uarg);
 
 			return uarg;
 		}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ffee03729285..102b1998ba3c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1230,7 +1230,7 @@ alloc_new_skb:
 error_efault:
 	err = -EFAULT;
 error:
-	skb_zcopy_put_abort(uarg, extra_uref);
+	net_zcopy_put_abort(uarg, extra_uref);
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1954190b33c7..2267d21c73a6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1429,7 +1429,7 @@ out:
 		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 	}
 out_nopush:
-	skb_zcopy_put(uarg);
+	net_zcopy_put(uarg);
 	return copied + copied_syn;
 
 do_error:
@@ -1440,7 +1440,7 @@ do_fault:
 	if (copied + copied_syn)
 		goto out;
 out_err:
-	skb_zcopy_put_abort(uarg, true);
+	net_zcopy_put_abort(uarg, true);
 	err = sk_stream_error(sk, flags, err);
 	/* make sure we wake any epoll edge trigger waiter */
 	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f59cfa39686a..072ce9678616 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1715,7 +1715,7 @@ alloc_new_skb:
 error_efault:
 	err = -EFAULT;
 error:
-	skb_zcopy_put_abort(uarg, extra_uref);
+	net_zcopy_put_abort(uarg, extra_uref);
 	cork->length -= length;
 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
-- 
cgit v1.2.3


From 33cb6d1ed311af2c1dfd107fa334cfb51113ef35 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Mon, 28 Dec 2020 21:30:21 +0100
Subject: dmaengine: at_hdmac: remove platform data header

linux/platform_data/dma-atmel.h is only used by the at_hdmac driver. Move
the CFG bits definitions back in at_hdmac_regs.h and the remaining
definitions in the driver.

Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20201228203022.2674133-1-alexandre.belloni@bootlin.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 MAINTAINERS                             |  1 -
 drivers/dma/at_hdmac.c                  | 19 ++++++++++
 drivers/dma/at_hdmac_regs.h             | 28 +++++++++++++--
 include/linux/platform_data/dma-atmel.h | 61 ---------------------------------
 4 files changed, 44 insertions(+), 65 deletions(-)
 delete mode 100644 include/linux/platform_data/dma-atmel.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 546aa66428c9..0d62310a31f8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11604,7 +11604,6 @@ F:	drivers/dma/at_hdmac.c
 F:	drivers/dma/at_hdmac_regs.h
 F:	drivers/dma/at_xdmac.c
 F:	include/dt-bindings/dma/at91.h
-F:	include/linux/platform_data/dma-atmel.h
 
 MICROCHIP AT91 SERIAL DRIVER
 M:	Richard Genoud <richard.genoud@gmail.com>
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index 7eaee5b705b1..30ae36124b1d 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -54,6 +54,25 @@ module_param(init_nr_desc_per_channel, uint, 0644);
 MODULE_PARM_DESC(init_nr_desc_per_channel,
 		 "initial descriptors per channel (default: 64)");
 
+/**
+ * struct at_dma_platform_data - Controller configuration parameters
+ * @nr_channels: Number of channels supported by hardware (max 8)
+ * @cap_mask: dma_capability flags supported by the platform
+ */
+struct at_dma_platform_data {
+	unsigned int	nr_channels;
+	dma_cap_mask_t  cap_mask;
+};
+
+/**
+ * struct at_dma_slave - Controller-specific information about a slave
+ * @dma_dev: required DMA master device
+ * @cfg: Platform-specific initializer for the CFG register
+ */
+struct at_dma_slave {
+	struct device		*dma_dev;
+	u32			cfg;
+};
 
 /* prototypes */
 static dma_cookie_t atc_tx_submit(struct dma_async_tx_descriptor *tx);
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
index 80fc2fe8c77e..4d1ebc040031 100644
--- a/drivers/dma/at_hdmac_regs.h
+++ b/drivers/dma/at_hdmac_regs.h
@@ -7,8 +7,6 @@
 #ifndef AT_HDMAC_REGS_H
 #define	AT_HDMAC_REGS_H
 
-#include <linux/platform_data/dma-atmel.h>
-
 #define	AT_DMA_MAX_NR_CHANNELS	8
 
 
@@ -148,7 +146,31 @@
 #define	ATC_AUTO		(0x1 << 31)	/* Auto multiple buffer tx enable */
 
 /* Bitfields in CFG */
-/* are in at_hdmac.h */
+#define ATC_PER_MSB(h)	((0x30U & (h)) >> 4)	/* Extract most significant bits of a handshaking identifier */
+
+#define	ATC_SRC_PER(h)		(0xFU & (h))	/* Channel src rq associated with periph handshaking ifc h */
+#define	ATC_DST_PER(h)		((0xFU & (h)) <<  4)	/* Channel dst rq associated with periph handshaking ifc h */
+#define	ATC_SRC_REP		(0x1 <<  8)	/* Source Replay Mod */
+#define	ATC_SRC_H2SEL		(0x1 <<  9)	/* Source Handshaking Mod */
+#define		ATC_SRC_H2SEL_SW	(0x0 <<  9)
+#define		ATC_SRC_H2SEL_HW	(0x1 <<  9)
+#define	ATC_SRC_PER_MSB(h)	(ATC_PER_MSB(h) << 10)	/* Channel src rq (most significant bits) */
+#define	ATC_DST_REP		(0x1 << 12)	/* Destination Replay Mod */
+#define	ATC_DST_H2SEL		(0x1 << 13)	/* Destination Handshaking Mod */
+#define		ATC_DST_H2SEL_SW	(0x0 << 13)
+#define		ATC_DST_H2SEL_HW	(0x1 << 13)
+#define	ATC_DST_PER_MSB(h)	(ATC_PER_MSB(h) << 14)	/* Channel dst rq (most significant bits) */
+#define	ATC_SOD			(0x1 << 16)	/* Stop On Done */
+#define	ATC_LOCK_IF		(0x1 << 20)	/* Interface Lock */
+#define	ATC_LOCK_B		(0x1 << 21)	/* AHB Bus Lock */
+#define	ATC_LOCK_IF_L		(0x1 << 22)	/* Master Interface Arbiter Lock */
+#define		ATC_LOCK_IF_L_CHUNK	(0x0 << 22)
+#define		ATC_LOCK_IF_L_BUFFER	(0x1 << 22)
+#define	ATC_AHB_PROT_MASK	(0x7 << 24)	/* AHB Protection */
+#define	ATC_FIFOCFG_MASK	(0x3 << 28)	/* FIFO Request Configuration */
+#define		ATC_FIFOCFG_LARGESTBURST	(0x0 << 28)
+#define		ATC_FIFOCFG_HALFFIFO		(0x1 << 28)
+#define		ATC_FIFOCFG_ENOUGHSPACE		(0x2 << 28)
 
 /* Bitfields in SPIP */
 #define	ATC_SPIP_HOLE(x)	(0xFFFFU & (x))
diff --git a/include/linux/platform_data/dma-atmel.h b/include/linux/platform_data/dma-atmel.h
deleted file mode 100644
index 069637e6004f..000000000000
--- a/include/linux/platform_data/dma-atmel.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Header file for the Atmel AHB DMA Controller driver
- *
- * Copyright (C) 2008 Atmel Corporation
- */
-#ifndef AT_HDMAC_H
-#define AT_HDMAC_H
-
-#include <linux/dmaengine.h>
-
-/**
- * struct at_dma_platform_data - Controller configuration parameters
- * @nr_channels: Number of channels supported by hardware (max 8)
- * @cap_mask: dma_capability flags supported by the platform
- */
-struct at_dma_platform_data {
-	unsigned int	nr_channels;
-	dma_cap_mask_t  cap_mask;
-};
-
-/**
- * struct at_dma_slave - Controller-specific information about a slave
- * @dma_dev: required DMA master device
- * @cfg: Platform-specific initializer for the CFG register
- */
-struct at_dma_slave {
-	struct device		*dma_dev;
-	u32			cfg;
-};
-
-
-/* Platform-configurable bits in CFG */
-#define ATC_PER_MSB(h)	((0x30U & (h)) >> 4)	/* Extract most significant bits of a handshaking identifier */
-
-#define	ATC_SRC_PER(h)		(0xFU & (h))	/* Channel src rq associated with periph handshaking ifc h */
-#define	ATC_DST_PER(h)		((0xFU & (h)) <<  4)	/* Channel dst rq associated with periph handshaking ifc h */
-#define	ATC_SRC_REP		(0x1 <<  8)	/* Source Replay Mod */
-#define	ATC_SRC_H2SEL		(0x1 <<  9)	/* Source Handshaking Mod */
-#define		ATC_SRC_H2SEL_SW	(0x0 <<  9)
-#define		ATC_SRC_H2SEL_HW	(0x1 <<  9)
-#define	ATC_SRC_PER_MSB(h)	(ATC_PER_MSB(h) << 10)	/* Channel src rq (most significant bits) */
-#define	ATC_DST_REP		(0x1 << 12)	/* Destination Replay Mod */
-#define	ATC_DST_H2SEL		(0x1 << 13)	/* Destination Handshaking Mod */
-#define		ATC_DST_H2SEL_SW	(0x0 << 13)
-#define		ATC_DST_H2SEL_HW	(0x1 << 13)
-#define	ATC_DST_PER_MSB(h)	(ATC_PER_MSB(h) << 14)	/* Channel dst rq (most significant bits) */
-#define	ATC_SOD			(0x1 << 16)	/* Stop On Done */
-#define	ATC_LOCK_IF		(0x1 << 20)	/* Interface Lock */
-#define	ATC_LOCK_B		(0x1 << 21)	/* AHB Bus Lock */
-#define	ATC_LOCK_IF_L		(0x1 << 22)	/* Master Interface Arbiter Lock */
-#define		ATC_LOCK_IF_L_CHUNK	(0x0 << 22)
-#define		ATC_LOCK_IF_L_BUFFER	(0x1 << 22)
-#define	ATC_AHB_PROT_MASK	(0x7 << 24)	/* AHB Protection */
-#define	ATC_FIFOCFG_MASK	(0x3 << 28)	/* FIFO Request Configuration */
-#define		ATC_FIFOCFG_LARGESTBURST	(0x0 << 28)
-#define		ATC_FIFOCFG_HALFFIFO		(0x1 << 28)
-#define		ATC_FIFOCFG_ENOUGHSPACE		(0x2 << 28)
-
-
-#endif /* AT_HDMAC_H */
-- 
cgit v1.2.3


From 994122211665301080cd232c06ae219b681dcb57 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Thu, 7 Jan 2021 17:34:01 -0600
Subject: remoteproc: qcom: expose types for COMPILE_TEST

Stub functions are defined for SSR notifier registration in case
QCOM_RPROC_COMMON is not configured.  As a result, code that uses
these functions can link successfully even if the common remoteproc
code is not built.

Code that registers an SSR notifier function likely needs the
types defined in "qcom_rproc.h", but those are only exposed if
QCOM_RPROC_COMMON is enabled.

Rearrange the conditional definition so the qcom_ssr_notify_data
structure and qcom_ssr_notify_type enumerated type are defined
whether or not QCOM_RPROC_COMMON is enabled.

Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Alex Elder <elder@linaro.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/remoteproc/qcom_rproc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/remoteproc/qcom_rproc.h b/include/linux/remoteproc/qcom_rproc.h
index 647051662174..82b211518136 100644
--- a/include/linux/remoteproc/qcom_rproc.h
+++ b/include/linux/remoteproc/qcom_rproc.h
@@ -3,8 +3,6 @@
 
 struct notifier_block;
 
-#if IS_ENABLED(CONFIG_QCOM_RPROC_COMMON)
-
 /**
  * enum qcom_ssr_notify_type - Startup/Shutdown events related to a remoteproc
  * processor.
@@ -26,6 +24,8 @@ struct qcom_ssr_notify_data {
 	bool crashed;
 };
 
+#if IS_ENABLED(CONFIG_QCOM_RPROC_COMMON)
+
 void *qcom_register_ssr_notifier(const char *name, struct notifier_block *nb);
 int qcom_unregister_ssr_notifier(void *notify, struct notifier_block *nb);
 
-- 
cgit v1.2.3


From ce2ceb9b1cff777c7d8beb368eddc3b8e45381f6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Thu, 7 Jan 2021 17:34:02 -0600
Subject: soc: qcom: mdt_loader: define stubs for COMPILE_TEST

Define stub functions for the exposed MDT functions in case
QCOM_MDT_LOADER is not configured.  This allows users of these
functions to link correctly for COMPILE_TEST builds without
QCOM_SCM enabled.

Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Alex Elder <elder@linaro.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/soc/qcom/mdt_loader.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/mdt_loader.h b/include/linux/soc/qcom/mdt_loader.h
index e600baec6825..afd47217996b 100644
--- a/include/linux/soc/qcom/mdt_loader.h
+++ b/include/linux/soc/qcom/mdt_loader.h
@@ -11,6 +11,8 @@
 struct device;
 struct firmware;
 
+#if IS_ENABLED(CONFIG_QCOM_MDT_LOADER)
+
 ssize_t qcom_mdt_get_size(const struct firmware *fw);
 int qcom_mdt_load(struct device *dev, const struct firmware *fw,
 		  const char *fw_name, int pas_id, void *mem_region,
@@ -23,4 +25,37 @@ int qcom_mdt_load_no_init(struct device *dev, const struct firmware *fw,
 			  phys_addr_t *reloc_base);
 void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len);
 
+#else /* !IS_ENABLED(CONFIG_QCOM_MDT_LOADER) */
+
+static inline ssize_t qcom_mdt_get_size(const struct firmware *fw)
+{
+	return -ENODEV;
+}
+
+static inline int qcom_mdt_load(struct device *dev, const struct firmware *fw,
+				const char *fw_name, int pas_id,
+				void *mem_region, phys_addr_t mem_phys,
+				size_t mem_size, phys_addr_t *reloc_base)
+{
+	return -ENODEV;
+}
+
+static inline int qcom_mdt_load_no_init(struct device *dev,
+					const struct firmware *fw,
+					const char *fw_name, int pas_id,
+					void *mem_region, phys_addr_t mem_phys,
+					size_t mem_size,
+					phys_addr_t *reloc_base)
+{
+	return -ENODEV;
+}
+
+static inline void *qcom_mdt_read_metadata(const struct firmware *fw,
+					   size_t *data_len)
+{
+	return ERR_PTR(-ENODEV);
+}
+
+#endif /* !IS_ENABLED(CONFIG_QCOM_MDT_LOADER) */
+
 #endif
-- 
cgit v1.2.3


From 1d11fa696733ffb9ac24771716b1b1b9953e5a48 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 8 Jan 2021 03:39:03 -0800
Subject: net-gro: remove GRO_DROP

GRO_DROP can only be returned from napi_gro_frags()
if the skb has not been allocated by a prior napi_get_frags()

Since drivers must use napi_get_frags() and test its result
before populating the skb with metadata, we can safely remove
GRO_DROP since it offers no practical use.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Acked-by: Edward Cree <ecree.xilinx@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h |  1 -
 net/core/dev.c            | 11 -----------
 2 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1ec3ac5d5bbf..5b949076ed23 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -376,7 +376,6 @@ enum gro_result {
 	GRO_MERGED_FREE,
 	GRO_HELD,
 	GRO_NORMAL,
-	GRO_DROP,
 	GRO_CONSUMED,
 };
 typedef enum gro_result gro_result_t;
diff --git a/net/core/dev.c b/net/core/dev.c
index 7afbb642e203..e4d77c8abe76 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6070,10 +6070,6 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
 		gro_normal_one(napi, skb);
 		break;
 
-	case GRO_DROP:
-		kfree_skb(skb);
-		break;
-
 	case GRO_MERGED_FREE:
 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 			napi_skb_free_stolen_head(skb);
@@ -6158,10 +6154,6 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
 			gro_normal_one(napi, skb);
 		break;
 
-	case GRO_DROP:
-		napi_reuse_skb(napi, skb);
-		break;
-
 	case GRO_MERGED_FREE:
 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 			napi_skb_free_stolen_head(skb);
@@ -6223,9 +6215,6 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
 	gro_result_t ret;
 	struct sk_buff *skb = napi_frags_skb(napi);
 
-	if (!skb)
-		return GRO_DROP;
-
 	trace_napi_gro_frags_entry(skb);
 
 	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
-- 
cgit v1.2.3


From 8fbdbb66f8c10bb359ad20fff95628e5deac88f4 Mon Sep 17 00:00:00 2001
From: Darren Salt <devspam@moreofthesa.me.uk>
Date: Tue, 5 Jan 2021 14:44:01 +0100
Subject: PCI: Export pci_rebar_get_possible_sizes()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export pci_rebar_get_possible_sizes() for use by modular drivers.

Signed-off-by: Darren Salt <devspam@moreofthesa.me.uk>
Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patchwork.kernel.org/project/dri-devel/patch/20210107175017.15893-2-nirmoy.das@amd.com
---
 drivers/pci/pci.c   | 1 +
 drivers/pci/pci.h   | 1 -
 include/linux/pci.h | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b9fecc25d213..8d1aec39ecc8 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -3605,6 +3605,7 @@ u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar)
 	pci_read_config_dword(pdev, pos + PCI_REBAR_CAP, &cap);
 	return (cap & PCI_REBAR_CAP_SIZES) >> 4;
 }
+EXPORT_SYMBOL(pci_rebar_get_possible_sizes);
 
 /**
  * pci_rebar_get_current_size - get the current size of a BAR
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 5c59365092fa..8f130f9531b3 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -630,7 +630,6 @@ int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
 			  struct resource *res);
 #endif
 
-u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
 int pci_rebar_get_current_size(struct pci_dev *pdev, int bar);
 int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size);
 static inline u64 pci_rebar_size_to_bytes(int size)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b32126d26997..5f25a0ec48c1 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1232,6 +1232,7 @@ void pci_update_resource(struct pci_dev *dev, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
 void pci_release_resource(struct pci_dev *dev, int resno);
+u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
 int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
 bool pci_device_is_present(struct pci_dev *pdev);
-- 
cgit v1.2.3


From 192f1bf7559e895d51f81c3976c5892c8b1e0601 Mon Sep 17 00:00:00 2001
From: Nirmoy Das <nirmoy.das@amd.com>
Date: Thu, 7 Jan 2021 14:30:34 +0100
Subject: PCI: Add pci_rebar_bytes_to_size()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Users of pci_resize_resource() need a way to calculate BAR size
from desired bytes. Add a helper function and export it so that
modular drivers can use it.

Signed-off-by: Darren Salt <devspam@moreofthesa.me.uk>
Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patchwork.kernel.org/project/dri-devel/patch/20210107175017.15893-3-nirmoy.das@amd.com
---
 drivers/pci/pci.c   | 2 +-
 include/linux/pci.h | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8d1aec39ecc8..99ef2774b9d0 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1644,7 +1644,7 @@ static void pci_restore_rebar_state(struct pci_dev *pdev)
 		pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
 		bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX;
 		res = pdev->resource + bar_idx;
-		size = ilog2(resource_size(res)) - 20;
+		size = pci_rebar_bytes_to_size(resource_size(res));
 		ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
 		ctrl |= size << PCI_REBAR_CTRL_BAR_SHIFT;
 		pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 5f25a0ec48c1..53f4904ee83d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1232,6 +1232,14 @@ void pci_update_resource(struct pci_dev *dev, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
 void pci_release_resource(struct pci_dev *dev, int resno);
+static inline int pci_rebar_bytes_to_size(u64 bytes)
+{
+	bytes = roundup_pow_of_two(bytes);
+
+	/* Return BAR size as defined in the resizable BAR specification */
+	return max(ilog2(bytes), 20) - 20;
+}
+
 u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar);
 int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
-- 
cgit v1.2.3


From 398cb92495cc1972014ffa81ff9cea1e5167b8df Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Fri, 13 Nov 2020 09:48:27 +0000
Subject: csky: Fixup perf probe failed

Current perf init will failed with:
[    1.452433] csky-pmu: probe of soc:pmu failed with error -16

This patch fix it up with adding CPUHP_AP_PERF_CSKY_ONLINE in
cpuhotplug.h.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
---
 arch/csky/kernel/perf_event.c | 2 +-
 include/linux/cpuhotplug.h    | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/csky/kernel/perf_event.c b/arch/csky/kernel/perf_event.c
index 1a29f1157449..55d5a5379483 100644
--- a/arch/csky/kernel/perf_event.c
+++ b/arch/csky/kernel/perf_event.c
@@ -1319,7 +1319,7 @@ int csky_pmu_device_probe(struct platform_device *pdev,
 		pr_notice("[perf] PMU request irq fail!\n");
 	}
 
-	ret = cpuhp_setup_state(CPUHP_AP_PERF_ONLINE, "AP_PERF_ONLINE",
+	ret = cpuhp_setup_state(CPUHP_AP_PERF_CSKY_ONLINE, "AP_PERF_ONLINE",
 				csky_pmu_starting_cpu,
 				csky_pmu_dying_cpu);
 	if (ret) {
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0042ef362511..d3e26dc81494 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -185,6 +185,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
 	CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
 	CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE,
+	CPUHP_AP_PERF_CSKY_ONLINE,
 	CPUHP_AP_WATCHDOG_ONLINE,
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
-- 
cgit v1.2.3


From 04769cb1c45a919cd94b931e6494d2a9afc32914 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 27 Nov 2020 17:41:19 +0100
Subject: mm/frame-vector: Use FOLL_LONGTERM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is used by media/videbuf2 for persistent dma mappings, not just
for a single dma operation and then freed again, so needs
FOLL_LONGTERM.

Unfortunately current pup_locked doesn't support FOLL_LONGTERM due to
locking issues. Rework the code to pull the pup path out from the
mmap_sem critical section as suggested by Jason.

By relying entirely on the vma checks in pin_user_pages and follow_pfn
(for vm_flags and vma_is_fsdax) we can also streamline the code a lot.

Note that pin_user_pages_fast is a safe replacement despite the
seeming lack of checking for vma->vm_flasg & (VM_IO | VM_PFNMAP). Such
ptes are marked with pte_mkspecial (which pup_fast rejects in the
fastpath), and only architectures supporting that support the
pin_user_pages_fast fastpath.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Pawel Osciak <pawel@osciak.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Tomasz Figa <tfiga@chromium.org>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: linux-mm@kvack.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-samsung-soc@vger.kernel.org
Cc: linux-media@vger.kernel.org
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20201127164131.2244124-6-daniel.vetter@ffwll.ch
---
 drivers/media/common/videobuf2/videobuf2-memops.c |  3 +-
 include/linux/mm.h                                |  2 +-
 mm/frame_vector.c                                 | 53 ++++++++---------------
 3 files changed, 19 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/common/videobuf2/videobuf2-memops.c b/drivers/media/common/videobuf2/videobuf2-memops.c
index 6e9e05153f4e..9dd6c27162f4 100644
--- a/drivers/media/common/videobuf2/videobuf2-memops.c
+++ b/drivers/media/common/videobuf2/videobuf2-memops.c
@@ -40,7 +40,6 @@ struct frame_vector *vb2_create_framevec(unsigned long start,
 	unsigned long first, last;
 	unsigned long nr;
 	struct frame_vector *vec;
-	unsigned int flags = FOLL_FORCE | FOLL_WRITE;
 
 	first = start >> PAGE_SHIFT;
 	last = (start + length - 1) >> PAGE_SHIFT;
@@ -48,7 +47,7 @@ struct frame_vector *vb2_create_framevec(unsigned long start,
 	vec = frame_vector_create(nr);
 	if (!vec)
 		return ERR_PTR(-ENOMEM);
-	ret = get_vaddr_frames(start & PAGE_MASK, nr, flags, vec);
+	ret = get_vaddr_frames(start & PAGE_MASK, nr, vec);
 	if (ret < 0)
 		goto out_destroy;
 	/* We accept only complete set of PFNs */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecdf8a8cd6ae..2e7b1d048df9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1768,7 +1768,7 @@ struct frame_vector {
 struct frame_vector *frame_vector_create(unsigned int nr_frames);
 void frame_vector_destroy(struct frame_vector *vec);
 int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
-		     unsigned int gup_flags, struct frame_vector *vec);
+		     struct frame_vector *vec);
 void put_vaddr_frames(struct frame_vector *vec);
 int frame_vector_to_pages(struct frame_vector *vec);
 void frame_vector_to_pfns(struct frame_vector *vec);
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index 10f82d5643b6..f8c34b895c76 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -32,13 +32,12 @@
  * This function takes care of grabbing mmap_lock as necessary.
  */
 int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
-		     unsigned int gup_flags, struct frame_vector *vec)
+		     struct frame_vector *vec)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	int ret = 0;
 	int err;
-	int locked;
 
 	if (nr_frames == 0)
 		return 0;
@@ -48,40 +47,26 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
 
 	start = untagged_addr(start);
 
-	mmap_read_lock(mm);
-	locked = 1;
-	vma = find_vma_intersection(mm, start, start + 1);
-	if (!vma) {
-		ret = -EFAULT;
-		goto out;
-	}
-
-	/*
-	 * While get_vaddr_frames() could be used for transient (kernel
-	 * controlled lifetime) pinning of memory pages all current
-	 * users establish long term (userspace controlled lifetime)
-	 * page pinning. Treat get_vaddr_frames() like
-	 * get_user_pages_longterm() and disallow it for filesystem-dax
-	 * mappings.
-	 */
-	if (vma_is_fsdax(vma)) {
-		ret = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
+	ret = pin_user_pages_fast(start, nr_frames,
+				  FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM,
+				  (struct page **)(vec->ptrs));
+	if (ret > 0) {
 		vec->got_ref = true;
 		vec->is_pfns = false;
-		ret = pin_user_pages_locked(start, nr_frames,
-			gup_flags, (struct page **)(vec->ptrs), &locked);
-		goto out;
+		goto out_unlocked;
 	}
 
+	mmap_read_lock(mm);
 	vec->got_ref = false;
 	vec->is_pfns = true;
+	ret = 0;
 	do {
 		unsigned long *nums = frame_vector_pfns(vec);
 
+		vma = find_vma_intersection(mm, start, start + 1);
+		if (!vma)
+			break;
+
 		while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
 			err = follow_pfn(vma, start, &nums[ret]);
 			if (err) {
@@ -92,17 +77,13 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
 			start += PAGE_SIZE;
 			ret++;
 		}
-		/*
-		 * We stop if we have enough pages or if VMA doesn't completely
-		 * cover the tail page.
-		 */
-		if (ret >= nr_frames || start < vma->vm_end)
+		/* Bail out if VMA doesn't completely cover the tail page. */
+		if (start < vma->vm_end)
 			break;
-		vma = find_vma_intersection(mm, start, start + 1);
-	} while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
+	} while (ret < nr_frames);
 out:
-	if (locked)
-		mmap_read_unlock(mm);
+	mmap_read_unlock(mm);
+out_unlocked:
 	if (!ret)
 		ret = -EFAULT;
 	if (ret > 0)
-- 
cgit v1.2.3


From eb83b8e3e6478845f8130622a2048ed4ec3b3be3 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 27 Nov 2020 17:41:20 +0100
Subject: media: videobuf2: Move frame_vector into media subsystem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's the only user. This also garbage collects the CONFIG_FRAME_VECTOR
symbol from all over the tree (well just one place, somehow omap media
driver still had this in its Kconfig, despite not using it).

Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Acked-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Acked-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Acked-by: Tomasz Figa <tfiga@chromium.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Pawel Osciak <pawel@osciak.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
Cc: Tomasz Figa <tfiga@chromium.org>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: linux-mm@kvack.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-samsung-soc@vger.kernel.org
Cc: linux-media@vger.kernel.org
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20201127164131.2244124-7-daniel.vetter@ffwll.ch
---
 drivers/media/common/videobuf2/Kconfig        |   1 -
 drivers/media/common/videobuf2/Makefile       |   1 +
 drivers/media/common/videobuf2/frame_vector.c | 223 ++++++++++++++++++++++++++
 drivers/media/platform/omap/Kconfig           |   1 -
 include/linux/mm.h                            |  42 -----
 include/media/frame_vector.h                  |  47 ++++++
 include/media/videobuf2-core.h                |   1 +
 mm/Kconfig                                    |   3 -
 mm/Makefile                                   |   1 -
 mm/frame_vector.c                             | 221 -------------------------
 10 files changed, 272 insertions(+), 269 deletions(-)
 create mode 100644 drivers/media/common/videobuf2/frame_vector.c
 create mode 100644 include/media/frame_vector.h
 delete mode 100644 mm/frame_vector.c

(limited to 'include/linux')

diff --git a/drivers/media/common/videobuf2/Kconfig b/drivers/media/common/videobuf2/Kconfig
index edbc99ebba87..d2223a12c95f 100644
--- a/drivers/media/common/videobuf2/Kconfig
+++ b/drivers/media/common/videobuf2/Kconfig
@@ -9,7 +9,6 @@ config VIDEOBUF2_V4L2
 
 config VIDEOBUF2_MEMOPS
 	tristate
-	select FRAME_VECTOR
 
 config VIDEOBUF2_DMA_CONTIG
 	tristate
diff --git a/drivers/media/common/videobuf2/Makefile b/drivers/media/common/videobuf2/Makefile
index 77bebe8b202f..54306f8d096c 100644
--- a/drivers/media/common/videobuf2/Makefile
+++ b/drivers/media/common/videobuf2/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 videobuf2-common-objs := videobuf2-core.o
+videobuf2-common-objs += frame_vector.o
 
 ifeq ($(CONFIG_TRACEPOINTS),y)
   videobuf2-common-objs += vb2-trace.o
diff --git a/drivers/media/common/videobuf2/frame_vector.c b/drivers/media/common/videobuf2/frame_vector.c
new file mode 100644
index 000000000000..a0e65481a201
--- /dev/null
+++ b/drivers/media/common/videobuf2/frame_vector.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+
+#include <media/frame_vector.h>
+
+/**
+ * get_vaddr_frames() - map virtual addresses to pfns
+ * @start:	starting user address
+ * @nr_frames:	number of pages / pfns from start to map
+ * @gup_flags:	flags modifying lookup behaviour
+ * @vec:	structure which receives pages / pfns of the addresses mapped.
+ *		It should have space for at least nr_frames entries.
+ *
+ * This function maps virtual addresses from @start and fills @vec structure
+ * with page frame numbers or page pointers to corresponding pages (choice
+ * depends on the type of the vma underlying the virtual address). If @start
+ * belongs to a normal vma, the function grabs reference to each of the pages
+ * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
+ * touch page structures and the caller must make sure pfns aren't reused for
+ * anything else while he is using them.
+ *
+ * The function returns number of pages mapped which may be less than
+ * @nr_frames. In particular we stop mapping if there are more vmas of
+ * different type underlying the specified range of virtual addresses.
+ * When the function isn't able to map a single page, it returns error.
+ *
+ * This function takes care of grabbing mmap_lock as necessary.
+ */
+int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
+		     struct frame_vector *vec)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int ret = 0;
+	int err;
+
+	if (nr_frames == 0)
+		return 0;
+
+	if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
+		nr_frames = vec->nr_allocated;
+
+	start = untagged_addr(start);
+
+	ret = pin_user_pages_fast(start, nr_frames,
+				  FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM,
+				  (struct page **)(vec->ptrs));
+	if (ret > 0) {
+		vec->got_ref = true;
+		vec->is_pfns = false;
+		goto out_unlocked;
+	}
+
+	mmap_read_lock(mm);
+	vec->got_ref = false;
+	vec->is_pfns = true;
+	ret = 0;
+	do {
+		unsigned long *nums = frame_vector_pfns(vec);
+
+		vma = find_vma_intersection(mm, start, start + 1);
+		if (!vma)
+			break;
+
+		while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
+			err = follow_pfn(vma, start, &nums[ret]);
+			if (err) {
+				if (ret == 0)
+					ret = err;
+				goto out;
+			}
+			start += PAGE_SIZE;
+			ret++;
+		}
+		/* Bail out if VMA doesn't completely cover the tail page. */
+		if (start < vma->vm_end)
+			break;
+	} while (ret < nr_frames);
+out:
+	mmap_read_unlock(mm);
+out_unlocked:
+	if (!ret)
+		ret = -EFAULT;
+	if (ret > 0)
+		vec->nr_frames = ret;
+	return ret;
+}
+EXPORT_SYMBOL(get_vaddr_frames);
+
+/**
+ * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
+ *			them
+ * @vec:	frame vector to put
+ *
+ * Drop references to pages if get_vaddr_frames() acquired them. We also
+ * invalidate the frame vector so that it is prepared for the next call into
+ * get_vaddr_frames().
+ */
+void put_vaddr_frames(struct frame_vector *vec)
+{
+	struct page **pages;
+
+	if (!vec->got_ref)
+		goto out;
+	pages = frame_vector_pages(vec);
+	/*
+	 * frame_vector_pages() might needed to do a conversion when
+	 * get_vaddr_frames() got pages but vec was later converted to pfns.
+	 * But it shouldn't really fail to convert pfns back...
+	 */
+	if (WARN_ON(IS_ERR(pages)))
+		goto out;
+
+	unpin_user_pages(pages, vec->nr_frames);
+	vec->got_ref = false;
+out:
+	vec->nr_frames = 0;
+}
+EXPORT_SYMBOL(put_vaddr_frames);
+
+/**
+ * frame_vector_to_pages - convert frame vector to contain page pointers
+ * @vec:	frame vector to convert
+ *
+ * Convert @vec to contain array of page pointers.  If the conversion is
+ * successful, return 0. Otherwise return an error. Note that we do not grab
+ * page references for the page structures.
+ */
+int frame_vector_to_pages(struct frame_vector *vec)
+{
+	int i;
+	unsigned long *nums;
+	struct page **pages;
+
+	if (!vec->is_pfns)
+		return 0;
+	nums = frame_vector_pfns(vec);
+	for (i = 0; i < vec->nr_frames; i++)
+		if (!pfn_valid(nums[i]))
+			return -EINVAL;
+	pages = (struct page **)nums;
+	for (i = 0; i < vec->nr_frames; i++)
+		pages[i] = pfn_to_page(nums[i]);
+	vec->is_pfns = false;
+	return 0;
+}
+EXPORT_SYMBOL(frame_vector_to_pages);
+
+/**
+ * frame_vector_to_pfns - convert frame vector to contain pfns
+ * @vec:	frame vector to convert
+ *
+ * Convert @vec to contain array of pfns.
+ */
+void frame_vector_to_pfns(struct frame_vector *vec)
+{
+	int i;
+	unsigned long *nums;
+	struct page **pages;
+
+	if (vec->is_pfns)
+		return;
+	pages = (struct page **)(vec->ptrs);
+	nums = (unsigned long *)pages;
+	for (i = 0; i < vec->nr_frames; i++)
+		nums[i] = page_to_pfn(pages[i]);
+	vec->is_pfns = true;
+}
+EXPORT_SYMBOL(frame_vector_to_pfns);
+
+/**
+ * frame_vector_create() - allocate & initialize structure for pinned pfns
+ * @nr_frames:	number of pfns slots we should reserve
+ *
+ * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns
+ * pfns.
+ */
+struct frame_vector *frame_vector_create(unsigned int nr_frames)
+{
+	struct frame_vector *vec;
+	int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
+
+	if (WARN_ON_ONCE(nr_frames == 0))
+		return NULL;
+	/*
+	 * This is absurdly high. It's here just to avoid strange effects when
+	 * arithmetics overflows.
+	 */
+	if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2))
+		return NULL;
+	/*
+	 * Avoid higher order allocations, use vmalloc instead. It should
+	 * be rare anyway.
+	 */
+	vec = kvmalloc(size, GFP_KERNEL);
+	if (!vec)
+		return NULL;
+	vec->nr_allocated = nr_frames;
+	vec->nr_frames = 0;
+	return vec;
+}
+EXPORT_SYMBOL(frame_vector_create);
+
+/**
+ * frame_vector_destroy() - free memory allocated to carry frame vector
+ * @vec:	Frame vector to free
+ *
+ * Free structure allocated by frame_vector_create() to carry frames.
+ */
+void frame_vector_destroy(struct frame_vector *vec)
+{
+	/* Make sure put_vaddr_frames() got called properly... */
+	VM_BUG_ON(vec->nr_frames > 0);
+	kvfree(vec);
+}
+EXPORT_SYMBOL(frame_vector_destroy);
diff --git a/drivers/media/platform/omap/Kconfig b/drivers/media/platform/omap/Kconfig
index f73b5893220d..de16de46c0f4 100644
--- a/drivers/media/platform/omap/Kconfig
+++ b/drivers/media/platform/omap/Kconfig
@@ -12,6 +12,5 @@ config VIDEO_OMAP2_VOUT
 	depends on VIDEO_V4L2
 	select VIDEOBUF2_DMA_CONTIG
 	select OMAP2_VRFB if ARCH_OMAP2 || ARCH_OMAP3
-	select FRAME_VECTOR
 	help
 	  V4L2 Display driver support for OMAP2/3 based boards.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e7b1d048df9..049439b65553 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1754,48 +1754,6 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
 int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
 			struct task_struct *task, bool bypass_rlim);
 
-/* Container for pinned pfns / pages */
-struct frame_vector {
-	unsigned int nr_allocated;	/* Number of frames we have space for */
-	unsigned int nr_frames;	/* Number of frames stored in ptrs array */
-	bool got_ref;		/* Did we pin pages by getting page ref? */
-	bool is_pfns;		/* Does array contain pages or pfns? */
-	void *ptrs[];		/* Array of pinned pfns / pages. Use
-				 * pfns_vector_pages() or pfns_vector_pfns()
-				 * for access */
-};
-
-struct frame_vector *frame_vector_create(unsigned int nr_frames);
-void frame_vector_destroy(struct frame_vector *vec);
-int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
-		     struct frame_vector *vec);
-void put_vaddr_frames(struct frame_vector *vec);
-int frame_vector_to_pages(struct frame_vector *vec);
-void frame_vector_to_pfns(struct frame_vector *vec);
-
-static inline unsigned int frame_vector_count(struct frame_vector *vec)
-{
-	return vec->nr_frames;
-}
-
-static inline struct page **frame_vector_pages(struct frame_vector *vec)
-{
-	if (vec->is_pfns) {
-		int err = frame_vector_to_pages(vec);
-
-		if (err)
-			return ERR_PTR(err);
-	}
-	return (struct page **)(vec->ptrs);
-}
-
-static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
-{
-	if (!vec->is_pfns)
-		frame_vector_to_pfns(vec);
-	return (unsigned long *)(vec->ptrs);
-}
-
 struct kvec;
 int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
 			struct page **pages);
diff --git a/include/media/frame_vector.h b/include/media/frame_vector.h
new file mode 100644
index 000000000000..bfed1710dc24
--- /dev/null
+++ b/include/media/frame_vector.h
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _MEDIA_FRAME_VECTOR_H
+#define _MEDIA_FRAME_VECTOR_H
+
+/* Container for pinned pfns / pages in frame_vector.c */
+struct frame_vector {
+	unsigned int nr_allocated;	/* Number of frames we have space for */
+	unsigned int nr_frames;	/* Number of frames stored in ptrs array */
+	bool got_ref;		/* Did we pin pages by getting page ref? */
+	bool is_pfns;		/* Does array contain pages or pfns? */
+	void *ptrs[];		/* Array of pinned pfns / pages. Use
+				 * pfns_vector_pages() or pfns_vector_pfns()
+				 * for access */
+};
+
+struct frame_vector *frame_vector_create(unsigned int nr_frames);
+void frame_vector_destroy(struct frame_vector *vec);
+int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
+		     struct frame_vector *vec);
+void put_vaddr_frames(struct frame_vector *vec);
+int frame_vector_to_pages(struct frame_vector *vec);
+void frame_vector_to_pfns(struct frame_vector *vec);
+
+static inline unsigned int frame_vector_count(struct frame_vector *vec)
+{
+	return vec->nr_frames;
+}
+
+static inline struct page **frame_vector_pages(struct frame_vector *vec)
+{
+	if (vec->is_pfns) {
+		int err = frame_vector_to_pages(vec);
+
+		if (err)
+			return ERR_PTR(err);
+	}
+	return (struct page **)(vec->ptrs);
+}
+
+static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
+{
+	if (!vec->is_pfns)
+		frame_vector_to_pfns(vec);
+	return (unsigned long *)(vec->ptrs);
+}
+
+#endif /* _MEDIA_FRAME_VECTOR_H */
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 61969402a0e3..799ba61b5b6f 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -18,6 +18,7 @@
 #include <linux/dma-buf.h>
 #include <linux/bitops.h>
 #include <media/media-request.h>
+#include <media/frame_vector.h>
 
 #define VB2_MAX_FRAME	(32)
 #define VB2_MAX_PLANES	(8)
diff --git a/mm/Kconfig b/mm/Kconfig
index f730605b8dcf..24c045b24b95 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -804,9 +804,6 @@ config DEVICE_PRIVATE
 config VMAP_PFN
 	bool
 
-config FRAME_VECTOR
-	bool
-
 config ARCH_USES_HIGH_VMA_FLAGS
 	bool
 config ARCH_HAS_PKEYS
diff --git a/mm/Makefile b/mm/Makefile
index b6cd2fffa492..135bbb65511a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -110,7 +110,6 @@ obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
 obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
-obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
 obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
 obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
deleted file mode 100644
index f8c34b895c76..000000000000
--- a/mm/frame_vector.c
+++ /dev/null
@@ -1,221 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/pagemap.h>
-#include <linux/sched.h>
-
-/**
- * get_vaddr_frames() - map virtual addresses to pfns
- * @start:	starting user address
- * @nr_frames:	number of pages / pfns from start to map
- * @gup_flags:	flags modifying lookup behaviour
- * @vec:	structure which receives pages / pfns of the addresses mapped.
- *		It should have space for at least nr_frames entries.
- *
- * This function maps virtual addresses from @start and fills @vec structure
- * with page frame numbers or page pointers to corresponding pages (choice
- * depends on the type of the vma underlying the virtual address). If @start
- * belongs to a normal vma, the function grabs reference to each of the pages
- * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
- * touch page structures and the caller must make sure pfns aren't reused for
- * anything else while he is using them.
- *
- * The function returns number of pages mapped which may be less than
- * @nr_frames. In particular we stop mapping if there are more vmas of
- * different type underlying the specified range of virtual addresses.
- * When the function isn't able to map a single page, it returns error.
- *
- * This function takes care of grabbing mmap_lock as necessary.
- */
-int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
-		     struct frame_vector *vec)
-{
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	int ret = 0;
-	int err;
-
-	if (nr_frames == 0)
-		return 0;
-
-	if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
-		nr_frames = vec->nr_allocated;
-
-	start = untagged_addr(start);
-
-	ret = pin_user_pages_fast(start, nr_frames,
-				  FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM,
-				  (struct page **)(vec->ptrs));
-	if (ret > 0) {
-		vec->got_ref = true;
-		vec->is_pfns = false;
-		goto out_unlocked;
-	}
-
-	mmap_read_lock(mm);
-	vec->got_ref = false;
-	vec->is_pfns = true;
-	ret = 0;
-	do {
-		unsigned long *nums = frame_vector_pfns(vec);
-
-		vma = find_vma_intersection(mm, start, start + 1);
-		if (!vma)
-			break;
-
-		while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
-			err = follow_pfn(vma, start, &nums[ret]);
-			if (err) {
-				if (ret == 0)
-					ret = err;
-				goto out;
-			}
-			start += PAGE_SIZE;
-			ret++;
-		}
-		/* Bail out if VMA doesn't completely cover the tail page. */
-		if (start < vma->vm_end)
-			break;
-	} while (ret < nr_frames);
-out:
-	mmap_read_unlock(mm);
-out_unlocked:
-	if (!ret)
-		ret = -EFAULT;
-	if (ret > 0)
-		vec->nr_frames = ret;
-	return ret;
-}
-EXPORT_SYMBOL(get_vaddr_frames);
-
-/**
- * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
- *			them
- * @vec:	frame vector to put
- *
- * Drop references to pages if get_vaddr_frames() acquired them. We also
- * invalidate the frame vector so that it is prepared for the next call into
- * get_vaddr_frames().
- */
-void put_vaddr_frames(struct frame_vector *vec)
-{
-	struct page **pages;
-
-	if (!vec->got_ref)
-		goto out;
-	pages = frame_vector_pages(vec);
-	/*
-	 * frame_vector_pages() might needed to do a conversion when
-	 * get_vaddr_frames() got pages but vec was later converted to pfns.
-	 * But it shouldn't really fail to convert pfns back...
-	 */
-	if (WARN_ON(IS_ERR(pages)))
-		goto out;
-
-	unpin_user_pages(pages, vec->nr_frames);
-	vec->got_ref = false;
-out:
-	vec->nr_frames = 0;
-}
-EXPORT_SYMBOL(put_vaddr_frames);
-
-/**
- * frame_vector_to_pages - convert frame vector to contain page pointers
- * @vec:	frame vector to convert
- *
- * Convert @vec to contain array of page pointers.  If the conversion is
- * successful, return 0. Otherwise return an error. Note that we do not grab
- * page references for the page structures.
- */
-int frame_vector_to_pages(struct frame_vector *vec)
-{
-	int i;
-	unsigned long *nums;
-	struct page **pages;
-
-	if (!vec->is_pfns)
-		return 0;
-	nums = frame_vector_pfns(vec);
-	for (i = 0; i < vec->nr_frames; i++)
-		if (!pfn_valid(nums[i]))
-			return -EINVAL;
-	pages = (struct page **)nums;
-	for (i = 0; i < vec->nr_frames; i++)
-		pages[i] = pfn_to_page(nums[i]);
-	vec->is_pfns = false;
-	return 0;
-}
-EXPORT_SYMBOL(frame_vector_to_pages);
-
-/**
- * frame_vector_to_pfns - convert frame vector to contain pfns
- * @vec:	frame vector to convert
- *
- * Convert @vec to contain array of pfns.
- */
-void frame_vector_to_pfns(struct frame_vector *vec)
-{
-	int i;
-	unsigned long *nums;
-	struct page **pages;
-
-	if (vec->is_pfns)
-		return;
-	pages = (struct page **)(vec->ptrs);
-	nums = (unsigned long *)pages;
-	for (i = 0; i < vec->nr_frames; i++)
-		nums[i] = page_to_pfn(pages[i]);
-	vec->is_pfns = true;
-}
-EXPORT_SYMBOL(frame_vector_to_pfns);
-
-/**
- * frame_vector_create() - allocate & initialize structure for pinned pfns
- * @nr_frames:	number of pfns slots we should reserve
- *
- * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns
- * pfns.
- */
-struct frame_vector *frame_vector_create(unsigned int nr_frames)
-{
-	struct frame_vector *vec;
-	int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
-
-	if (WARN_ON_ONCE(nr_frames == 0))
-		return NULL;
-	/*
-	 * This is absurdly high. It's here just to avoid strange effects when
-	 * arithmetics overflows.
-	 */
-	if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2))
-		return NULL;
-	/*
-	 * Avoid higher order allocations, use vmalloc instead. It should
-	 * be rare anyway.
-	 */
-	vec = kvmalloc(size, GFP_KERNEL);
-	if (!vec)
-		return NULL;
-	vec->nr_allocated = nr_frames;
-	vec->nr_frames = 0;
-	return vec;
-}
-EXPORT_SYMBOL(frame_vector_create);
-
-/**
- * frame_vector_destroy() - free memory allocated to carry frame vector
- * @vec:	Frame vector to free
- *
- * Free structure allocated by frame_vector_create() to carry frames.
- */
-void frame_vector_destroy(struct frame_vector *vec)
-{
-	/* Make sure put_vaddr_frames() got called properly... */
-	VM_BUG_ON(vec->nr_frames > 0);
-	kvfree(vec);
-}
-EXPORT_SYMBOL(frame_vector_destroy);
-- 
cgit v1.2.3


From 96667f8a4382db9ed042332ca6ee165ae9b91307 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 27 Nov 2020 17:41:21 +0100
Subject: mm: Close race in generic_access_phys
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Way back it was a reasonable assumptions that iomem mappings never
change the pfn range they point at. But this has changed:

- gpu drivers dynamically manage their memory nowadays, invalidating
  ptes with unmap_mapping_range when buffers get moved

- contiguous dma allocations have moved from dedicated carvetouts to
  cma regions. This means if we miss the unmap the pfn might contain
  pagecache or anon memory (well anything allocated with GFP_MOVEABLE)

- even /dev/mem now invalidates mappings when the kernel requests that
  iomem region when CONFIG_IO_STRICT_DEVMEM is set, see 3234ac664a87
  ("/dev/mem: Revoke mappings when a driver claims the region")

Accessing pfns obtained from ptes without holding all the locks is
therefore no longer a good idea. Fix this.

Since ioremap might need to manipulate pagetables too we need to drop
the pt lock and have a retry loop if we raced.

While at it, also add kerneldoc and improve the comment for the
vma_ops->access function. It's for accessing, not for moving the
memory from iomem to system memory, as the old comment seemed to
suggest.

References: 28b2ee20c7cb ("access_process_vm device memory infrastructure")
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Benjamin Herrensmidt <benh@kernel.crashing.org>
Cc: Dave Airlie <airlied@linux.ie>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: linux-mm@kvack.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-samsung-soc@vger.kernel.org
Cc: linux-media@vger.kernel.org
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20201127164131.2244124-8-daniel.vetter@ffwll.ch
---
 include/linux/mm.h |  3 ++-
 mm/memory.c        | 46 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 45 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 049439b65553..c9900aedc195 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -590,7 +590,8 @@ struct vm_operations_struct {
 	vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
 
 	/* called by access_process_vm when get_user_pages() fails, typically
-	 * for use by special VMAs that can switch between memory and hardware
+	 * for use by special VMAs. See also generic_access_phys() for a generic
+	 * implementation useful for any iomem mapping.
 	 */
 	int (*access)(struct vm_area_struct *vma, unsigned long addr,
 		      void *buf, int len, int write);
diff --git a/mm/memory.c b/mm/memory.c
index feff48e1465a..8079c181efff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4834,28 +4834,68 @@ out:
 	return ret;
 }
 
+/**
+ * generic_access_phys - generic implementation for iomem mmap access
+ * @vma: the vma to access
+ * @addr: userspace addres, not relative offset within @vma
+ * @buf: buffer to read/write
+ * @len: length of transfer
+ * @write: set to FOLL_WRITE when writing, otherwise reading
+ *
+ * This is a generic implementation for &vm_operations_struct.access for an
+ * iomem mapping. This callback is used by access_process_vm() when the @vma is
+ * not page based.
+ */
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write)
 {
 	resource_size_t phys_addr;
 	unsigned long prot = 0;
 	void __iomem *maddr;
-	int offset = addr & (PAGE_SIZE-1);
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	int offset = offset_in_page(addr);
+	int ret = -EINVAL;
+
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		return -EINVAL;
+
+retry:
+	if (follow_pte(vma->vm_mm, addr, NULL, &ptep, NULL, &ptl))
+		return -EINVAL;
+	pte = *ptep;
+	pte_unmap_unlock(ptep, ptl);
 
-	if (follow_phys(vma, addr, write, &prot, &phys_addr))
+	prot = pgprot_val(pte_pgprot(pte));
+	phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+
+	if ((write & FOLL_WRITE) && !pte_write(pte))
 		return -EINVAL;
 
 	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
 	if (!maddr)
 		return -ENOMEM;
 
+	if (follow_pte(vma->vm_mm, addr, NULL, &ptep, NULL, &ptl))
+		goto out_unmap;
+
+	if (!pte_same(pte, *ptep)) {
+		pte_unmap_unlock(ptep, ptl);
+		iounmap(maddr);
+
+		goto retry;
+	}
+
 	if (write)
 		memcpy_toio(maddr + offset, buf, len);
 	else
 		memcpy_fromio(buf, maddr + offset, len);
+	ret = len;
+	pte_unmap_unlock(ptep, ptl);
+out_unmap:
 	iounmap(maddr);
 
-	return len;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(generic_access_phys);
 #endif
-- 
cgit v1.2.3


From 71a1d8ed900f8cf53151beff17e3e2ff8e9283a1 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 27 Nov 2020 17:41:24 +0100
Subject: resource: Move devmem revoke code to resource framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We want all iomem mmaps to consistently revoke ptes when the kernel
takes over and CONFIG_IO_STRICT_DEVMEM is enabled. This includes the
pci bar mmaps available through procfs and sysfs, which currently do
not revoke mappings.

To prepare for this, move the code from the /dev/kmem driver to
kernel/resource.c.

During review Jason spotted that barriers are used somewhat
inconsistently. Fix that up while we shuffle this code, since it
doesn't have an actual impact at runtime. Otherwise no semantic and
behavioural changes intended, just code extraction and adjusting
comments and names.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: linux-mm@kvack.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-samsung-soc@vger.kernel.org
Cc: linux-media@vger.kernel.org
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20201127164131.2244124-11-daniel.vetter@ffwll.ch
---
 drivers/char/mem.c     | 85 +------------------------------------------
 include/linux/ioport.h |  6 +---
 kernel/resource.c      | 98 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 99 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 7dcf9e4ea79d..43c871dc7477 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -31,9 +31,6 @@
 #include <linux/uio.h>
 #include <linux/uaccess.h>
 #include <linux/security.h>
-#include <linux/pseudo_fs.h>
-#include <uapi/linux/magic.h>
-#include <linux/mount.h>
 
 #ifdef CONFIG_IA64
 # include <linux/efi.h>
@@ -836,42 +833,6 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig)
 	return ret;
 }
 
-static struct inode *devmem_inode;
-
-#ifdef CONFIG_IO_STRICT_DEVMEM
-void revoke_devmem(struct resource *res)
-{
-	/* pairs with smp_store_release() in devmem_init_inode() */
-	struct inode *inode = smp_load_acquire(&devmem_inode);
-
-	/*
-	 * Check that the initialization has completed. Losing the race
-	 * is ok because it means drivers are claiming resources before
-	 * the fs_initcall level of init and prevent /dev/mem from
-	 * establishing mappings.
-	 */
-	if (!inode)
-		return;
-
-	/*
-	 * The expectation is that the driver has successfully marked
-	 * the resource busy by this point, so devmem_is_allowed()
-	 * should start returning false, however for performance this
-	 * does not iterate the entire resource range.
-	 */
-	if (devmem_is_allowed(PHYS_PFN(res->start)) &&
-	    devmem_is_allowed(PHYS_PFN(res->end))) {
-		/*
-		 * *cringe* iomem=relaxed says "go ahead, what's the
-		 * worst that can happen?"
-		 */
-		return;
-	}
-
-	unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1);
-}
-#endif
-
 static int open_port(struct inode *inode, struct file *filp)
 {
 	int rc;
@@ -891,7 +852,7 @@ static int open_port(struct inode *inode, struct file *filp)
 	 * revocations when drivers want to take over a /dev/mem mapped
 	 * range.
 	 */
-	filp->f_mapping = inode->i_mapping;
+	filp->f_mapping = iomem_get_mapping();
 
 	return 0;
 }
@@ -1023,48 +984,6 @@ static char *mem_devnode(struct device *dev, umode_t *mode)
 
 static struct class *mem_class;
 
-static int devmem_fs_init_fs_context(struct fs_context *fc)
-{
-	return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type devmem_fs_type = {
-	.name		= "devmem",
-	.owner		= THIS_MODULE,
-	.init_fs_context = devmem_fs_init_fs_context,
-	.kill_sb	= kill_anon_super,
-};
-
-static int devmem_init_inode(void)
-{
-	static struct vfsmount *devmem_vfs_mount;
-	static int devmem_fs_cnt;
-	struct inode *inode;
-	int rc;
-
-	rc = simple_pin_fs(&devmem_fs_type, &devmem_vfs_mount, &devmem_fs_cnt);
-	if (rc < 0) {
-		pr_err("Cannot mount /dev/mem pseudo filesystem: %d\n", rc);
-		return rc;
-	}
-
-	inode = alloc_anon_inode(devmem_vfs_mount->mnt_sb);
-	if (IS_ERR(inode)) {
-		rc = PTR_ERR(inode);
-		pr_err("Cannot allocate inode for /dev/mem: %d\n", rc);
-		simple_release_fs(&devmem_vfs_mount, &devmem_fs_cnt);
-		return rc;
-	}
-
-	/*
-	 * Publish /dev/mem initialized.
-	 * Pairs with smp_load_acquire() in revoke_devmem().
-	 */
-	smp_store_release(&devmem_inode, inode);
-
-	return 0;
-}
-
 static int __init chr_dev_init(void)
 {
 	int minor;
@@ -1086,8 +1005,6 @@ static int __init chr_dev_init(void)
 		 */
 		if ((minor == DEVPORT_MINOR) && !arch_has_dev_port())
 			continue;
-		if ((minor == DEVMEM_MINOR) && devmem_init_inode() != 0)
-			continue;
 
 		device_create(mem_class, NULL, MKDEV(MEM_MAJOR, minor),
 			      NULL, devlist[minor].name);
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index fe48b7840665..55de385c839c 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -334,11 +334,7 @@ static inline void irqresource_disabled(struct resource *res, u32 irq)
 	res->flags = IORESOURCE_IRQ | IORESOURCE_DISABLED | IORESOURCE_UNSET;
 }
 
-#ifdef CONFIG_IO_STRICT_DEVMEM
-void revoke_devmem(struct resource *res);
-#else
-static inline void revoke_devmem(struct resource *res) { };
-#endif
+extern struct address_space *iomem_get_mapping(void);
 
 #endif /* __ASSEMBLY__ */
 #endif	/* _LINUX_IOPORT_H */
diff --git a/kernel/resource.c b/kernel/resource.c
index 833394f9c608..627e61b0c124 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -18,12 +18,15 @@
 #include <linux/spinlock.h>
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
+#include <linux/pseudo_fs.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/device.h>
 #include <linux/pfn.h>
 #include <linux/mm.h>
+#include <linux/mount.h>
 #include <linux/resource_ext.h>
+#include <uapi/linux/magic.h>
 #include <asm/io.h>
 
 
@@ -1119,6 +1122,55 @@ resource_size_t resource_alignment(struct resource *res)
 
 static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
 
+static struct inode *iomem_inode;
+
+#ifdef CONFIG_IO_STRICT_DEVMEM
+static void revoke_iomem(struct resource *res)
+{
+	/* pairs with smp_store_release() in iomem_init_inode() */
+	struct inode *inode = smp_load_acquire(&iomem_inode);
+
+	/*
+	 * Check that the initialization has completed. Losing the race
+	 * is ok because it means drivers are claiming resources before
+	 * the fs_initcall level of init and prevent iomem_get_mapping users
+	 * from establishing mappings.
+	 */
+	if (!inode)
+		return;
+
+	/*
+	 * The expectation is that the driver has successfully marked
+	 * the resource busy by this point, so devmem_is_allowed()
+	 * should start returning false, however for performance this
+	 * does not iterate the entire resource range.
+	 */
+	if (devmem_is_allowed(PHYS_PFN(res->start)) &&
+	    devmem_is_allowed(PHYS_PFN(res->end))) {
+		/*
+		 * *cringe* iomem=relaxed says "go ahead, what's the
+		 * worst that can happen?"
+		 */
+		return;
+	}
+
+	unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1);
+}
+#else
+static void revoke_iomem(struct resource *res) {}
+#endif
+
+struct address_space *iomem_get_mapping(void)
+{
+	/*
+	 * This function is only called from file open paths, hence guaranteed
+	 * that fs_initcalls have completed and no need to check for NULL. But
+	 * since revoke_iomem can be called before the initcall we still need
+	 * the barrier to appease checkers.
+	 */
+	return smp_load_acquire(&iomem_inode)->i_mapping;
+}
+
 /**
  * __request_region - create a new busy resource region
  * @parent: parent resource descriptor
@@ -1186,7 +1238,7 @@ struct resource * __request_region(struct resource *parent,
 	write_unlock(&resource_lock);
 
 	if (res && orig_parent == &iomem_resource)
-		revoke_devmem(res);
+		revoke_iomem(res);
 
 	return res;
 }
@@ -1786,4 +1838,48 @@ static int __init strict_iomem(char *str)
 	return 1;
 }
 
+static int iomem_fs_init_fs_context(struct fs_context *fc)
+{
+	return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type iomem_fs_type = {
+	.name		= "iomem",
+	.owner		= THIS_MODULE,
+	.init_fs_context = iomem_fs_init_fs_context,
+	.kill_sb	= kill_anon_super,
+};
+
+static int __init iomem_init_inode(void)
+{
+	static struct vfsmount *iomem_vfs_mount;
+	static int iomem_fs_cnt;
+	struct inode *inode;
+	int rc;
+
+	rc = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt);
+	if (rc < 0) {
+		pr_err("Cannot mount iomem pseudo filesystem: %d\n", rc);
+		return rc;
+	}
+
+	inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb);
+	if (IS_ERR(inode)) {
+		rc = PTR_ERR(inode);
+		pr_err("Cannot allocate inode for iomem: %d\n", rc);
+		simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt);
+		return rc;
+	}
+
+	/*
+	 * Publish iomem revocation inode initialized.
+	 * Pairs with smp_load_acquire() in revoke_iomem().
+	 */
+	smp_store_release(&iomem_inode, inode);
+
+	return 0;
+}
+
+fs_initcall(iomem_init_inode);
+
 __setup("iomem=", strict_iomem);
-- 
cgit v1.2.3


From 74b30195395c406c787280a77ae55aed82dbbfc7 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 27 Nov 2020 17:41:25 +0100
Subject: sysfs: Support zapping of binary attr mmaps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We want to be able to revoke pci mmaps so that the same access rules
applies as for /dev/kmem. Revoke support for devmem was added in
3234ac664a87 ("/dev/mem: Revoke mappings when a driver claims the
region").

The simplest way to achieve this is by having the same filp->f_mapping
for all mappings, so that unmap_mapping_range can find them all, no
matter through which file they've been created. Since this must be set
at open time we need sysfs support for this.

Add an optional mapping parameter bin_attr, which is only consulted
when there's also an mmap callback, since without mmap support
allowing to adjust the ->f_mapping makes no sense.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: linux-mm@kvack.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-samsung-soc@vger.kernel.org
Cc: linux-media@vger.kernel.org
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: linux-pci@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Cc: Nayna Jain <nayna@linux.ibm.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20201127164131.2244124-12-daniel.vetter@ffwll.ch
---
 fs/sysfs/file.c       | 11 +++++++++++
 include/linux/sysfs.h |  2 ++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 96d0da65e088..9aefa7779b29 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -170,6 +170,16 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
 	return battr->mmap(of->file, kobj, battr, vma);
 }
 
+static int sysfs_kf_bin_open(struct kernfs_open_file *of)
+{
+	struct bin_attribute *battr = of->kn->priv;
+
+	if (battr->mapping)
+		of->file->f_mapping = battr->mapping;
+
+	return 0;
+}
+
 void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
 {
 	struct kernfs_node *kn = kobj->sd, *tmp;
@@ -241,6 +251,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = {
 	.read		= sysfs_kf_bin_read,
 	.write		= sysfs_kf_bin_write,
 	.mmap		= sysfs_kf_bin_mmap,
+	.open		= sysfs_kf_bin_open,
 };
 
 int sysfs_add_file_mode_ns(struct kernfs_node *parent,
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 2caa34c1ca1a..d76a1ddf83a3 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -164,11 +164,13 @@ __ATTRIBUTE_GROUPS(_name)
 
 struct file;
 struct vm_area_struct;
+struct address_space;
 
 struct bin_attribute {
 	struct attribute	attr;
 	size_t			size;
 	void			*private;
+	struct address_space	*mapping;
 	ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *,
 			char *, loff_t, size_t);
 	ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *,
-- 
cgit v1.2.3


From b90f3726ea3811421f56757e866f01f6e2bb37d6 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 6 Jan 2021 18:33:04 -0800
Subject: linux/clk.h: use correct kernel-doc notation for 2 functions

Fix kernel-doc notation for 2 functions so that the generated
html is correct. Currently it skips all text between the
':' and the '-', so "[un]register a clock rate" is missing.

Fixes: 86bcfa2e87c4 ("clk: add pr_debug & kerneldoc around clk notifiers")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Mike Turquette <mturquette@linaro.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-clk@vger.kernel.org
Link: https://lore.kernel.org/r/20210107023304.24442-1-rdunlap@infradead.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk.h b/include/linux/clk.h
index 31ff1bf1b79f..45e698b74178 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -92,7 +92,7 @@ struct clk_bulk_data {
 #ifdef CONFIG_COMMON_CLK
 
 /**
- * clk_notifier_register: register a clock rate-change notifier callback
+ * clk_notifier_register - register a clock rate-change notifier callback
  * @clk: clock whose rate we are interested in
  * @nb: notifier block with callback function pointer
  *
@@ -103,7 +103,7 @@ struct clk_bulk_data {
 int clk_notifier_register(struct clk *clk, struct notifier_block *nb);
 
 /**
- * clk_notifier_unregister: unregister a clock rate-change notifier callback
+ * clk_notifier_unregister - unregister a clock rate-change notifier callback
  * @clk: clock whose rate we are no longer interested in
  * @nb: notifier block which will be unregistered
  */
-- 
cgit v1.2.3


From 3188677d4901fbfb7a363ab2558e0a4bc76deecb Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sun, 10 Jan 2021 15:01:59 +0100
Subject: power: supply: max8903: Absorb pdata header

The platform data header is not included by any other file in
the kernel but the driver itself. Decomission the stand-alone
header and absorb it into the driver itself.

Cc: Chris Lapa <chris@lapa.com.au>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/max8903_charger.c | 32 ++++++++++++++++++++++++-
 include/linux/power/max8903_charger.h  | 43 ----------------------------------
 2 files changed, 31 insertions(+), 44 deletions(-)
 delete mode 100644 include/linux/power/max8903_charger.h

(limited to 'include/linux')

diff --git a/drivers/power/supply/max8903_charger.c b/drivers/power/supply/max8903_charger.c
index 0bd39b0cc257..ab1aa7df985e 100644
--- a/drivers/power/supply/max8903_charger.c
+++ b/drivers/power/supply/max8903_charger.c
@@ -15,7 +15,37 @@
 #include <linux/slab.h>
 #include <linux/power_supply.h>
 #include <linux/platform_device.h>
-#include <linux/power/max8903_charger.h>
+
+struct max8903_pdata {
+	/*
+	 * GPIOs
+	 * cen, chg, flt, dcm and usus are optional.
+	 * dok and uok are not optional depending on the status of
+	 * dc_valid and usb_valid.
+	 */
+	int cen;	/* Charger Enable input */
+	int dok;	/* DC(Adapter) Power OK output */
+	int uok;	/* USB Power OK output */
+	int chg;	/* Charger status output */
+	int flt;	/* Fault output */
+	int dcm;	/* Current-Limit Mode input (1: DC, 2: USB) */
+	int usus;	/* USB Suspend Input (1: suspended) */
+
+	/*
+	 * DC(Adapter/TA) is wired
+	 * When dc_valid is true,
+	 *	dok should be valid.
+	 *
+	 * At least one of dc_valid or usb_valid should be true.
+	 */
+	bool dc_valid;
+	/*
+	 * USB is wired
+	 * When usb_valid is true,
+	 *	uok should be valid.
+	 */
+	bool usb_valid;
+};
 
 struct max8903_data {
 	struct max8903_pdata *pdata;
diff --git a/include/linux/power/max8903_charger.h b/include/linux/power/max8903_charger.h
deleted file mode 100644
index 02f94a1b323b..000000000000
--- a/include/linux/power/max8903_charger.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * max8903_charger.h - Maxim 8903 USB/Adapter Charger Driver
- *
- * Copyright (C) 2011 Samsung Electronics
- * MyungJoo Ham <myungjoo.ham@samsung.com>
- */
-
-#ifndef __MAX8903_CHARGER_H__
-#define __MAX8903_CHARGER_H__
-
-struct max8903_pdata {
-	/*
-	 * GPIOs
-	 * cen, chg, flt, dcm and usus are optional.
-	 * dok and uok are not optional depending on the status of
-	 * dc_valid and usb_valid.
-	 */
-	int cen;	/* Charger Enable input */
-	int dok;	/* DC(Adapter) Power OK output */
-	int uok;	/* USB Power OK output */
-	int chg;	/* Charger status output */
-	int flt;	/* Fault output */
-	int dcm;	/* Current-Limit Mode input (1: DC, 2: USB) */
-	int usus;	/* USB Suspend Input (1: suspended) */
-
-	/*
-	 * DC(Adapter/TA) is wired
-	 * When dc_valid is true,
-	 *	dok should be valid.
-	 *
-	 * At least one of dc_valid or usb_valid should be true.
-	 */
-	bool dc_valid;
-	/*
-	 * USB is wired
-	 * When usb_valid is true,
-	 *	uok should be valid.
-	 */
-	bool usb_valid;
-};
-
-#endif /* __MAX8903_CHARGER_H__ */
-- 
cgit v1.2.3


From a643bff752dcf72a07e1b2ab2f8587e4f51118be Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 11 Jan 2021 23:55:14 -0800
Subject: bpf: Add bpf_patch_call_args prototype to include/linux/bpf.h

Add bpf_patch_call_args() prototype. This function is called from BPF verifier
and only if CONFIG_BPF_JIT_ALWAYS_ON is not defined. This fixes compiler
warning about missing prototype in some kernel configurations.

Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210112075520.4103414-2-andrii@kernel.org
---
 include/linux/bpf.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 07cb5d15e743..ef9309604b3e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1403,7 +1403,10 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog **fp, union bpf_attr *attr,
 	      union bpf_attr __user *uattr);
+
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
+#endif
 
 struct btf *bpf_get_btf_vmlinux(void);
 
-- 
cgit v1.2.3


From 6943c2b05bf09fd5c5729f7d7d803bf3f126cb9a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 11 Jan 2021 23:55:15 -0800
Subject: bpf: Avoid warning when re-casting __bpf_call_base into
 __bpf_call_base_args

BPF interpreter uses extra input argument, so re-casts __bpf_call_base into
__bpf_call_base_args. Avoid compiler warning about incompatible function
prototypes by casting to void * first.

Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210112075520.4103414-3-andrii@kernel.org
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 29c27656165b..5edf2b660881 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -886,7 +886,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 #define __bpf_call_base_args \
 	((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
-	 __bpf_call_base)
+	 (void *)__bpf_call_base)
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
-- 
cgit v1.2.3


From 936f8946bdb48239f4292812d4d2e26c6d328c95 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 11 Jan 2021 23:55:16 -0800
Subject: bpf: Declare __bpf_free_used_maps() unconditionally

__bpf_free_used_maps() is always defined in kernel/bpf/core.c, while
include/linux/bpf.h is guarding it behind CONFIG_BPF_SYSCALL. Move it out of
that guard region and fix compiler warning.

Fixes: a2ea07465c8d ("bpf: Fix missing prog untrack in release_maps")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210112075520.4103414-4-andrii@kernel.org
---
 include/linux/bpf.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ef9309604b3e..6e585dbc10df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1206,8 +1206,6 @@ void bpf_prog_sub(struct bpf_prog *prog, int i);
 void bpf_prog_inc(struct bpf_prog *prog);
 struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
-void __bpf_free_used_maps(struct bpf_prog_aux *aux,
-			  struct bpf_map **used_maps, u32 len);
 
 void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
 void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
@@ -1676,6 +1674,9 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 	return bpf_prog_get_type_dev(ufd, type, false);
 }
 
+void __bpf_free_used_maps(struct bpf_prog_aux *aux,
+			  struct bpf_map **used_maps, u32 len);
+
 bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool);
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
-- 
cgit v1.2.3


From 541c3bad8dc51b253ba8686d0cd7628e6b9b5f4c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 11 Jan 2021 23:55:18 -0800
Subject: bpf: Support BPF ksym variables in kernel modules

Add support for directly accessing kernel module variables from BPF programs
using special ldimm64 instructions. This functionality builds upon vmlinux
ksym support, but extends ldimm64 with src_reg=BPF_PSEUDO_BTF_ID to allow
specifying kernel module BTF's FD in insn[1].imm field.

During BPF program load time, verifier will resolve FD to BTF object and will
take reference on BTF object itself and, for module BTFs, corresponding module
as well, to make sure it won't be unloaded from under running BPF program. The
mechanism used is similar to how bpf_prog keeps track of used bpf_maps.

One interesting change is also in how per-CPU variable is determined. The
logic is to find .data..percpu data section in provided BTF, but both vmlinux
and module each have their own .data..percpu entries in BTF. So for module's
case, the search for DATASEC record needs to look at only module's added BTF
types. This is implemented with custom search function.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Hao Luo <haoluo@google.com>
Link: https://lore.kernel.org/bpf/20210112075520.4103414-6-andrii@kernel.org
---
 include/linux/bpf.h          |  10 +++
 include/linux/bpf_verifier.h |   3 +
 include/linux/btf.h          |   3 +
 kernel/bpf/btf.c             |  31 ++++++++-
 kernel/bpf/core.c            |  23 +++++++
 kernel/bpf/verifier.c        | 154 +++++++++++++++++++++++++++++++++++--------
 6 files changed, 194 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6e585dbc10df..1aac2af12fed 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -761,9 +761,15 @@ struct bpf_ctx_arg_aux {
 	u32 btf_id;
 };
 
+struct btf_mod_pair {
+	struct btf *btf;
+	struct module *module;
+};
+
 struct bpf_prog_aux {
 	atomic64_t refcnt;
 	u32 used_map_cnt;
+	u32 used_btf_cnt;
 	u32 max_ctx_offset;
 	u32 max_pkt_offset;
 	u32 max_tp_access;
@@ -802,6 +808,7 @@ struct bpf_prog_aux {
 	const struct bpf_prog_ops *ops;
 	struct bpf_map **used_maps;
 	struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */
+	struct btf_mod_pair *used_btfs;
 	struct bpf_prog *prog;
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
@@ -1668,6 +1675,9 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
+			  struct btf_mod_pair *used_btfs, u32 len);
+
 static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 						 enum bpf_prog_type type)
 {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e941fe1484e5..dfe6f85d97dd 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -340,6 +340,7 @@ struct bpf_insn_aux_data {
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+#define MAX_USED_BTFS 64 /* max number of BTFs accessed by one BPF program */
 
 #define BPF_VERIFIER_TMP_LOG_SIZE	1024
 
@@ -398,7 +399,9 @@ struct bpf_verifier_env {
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_verifier_state_list *free_list;
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
+	struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */
 	u32 used_map_cnt;		/* number of used maps */
+	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
 	bool allow_ptr_to_map_access;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 4c200f5d242b..7fabf1428093 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -91,6 +91,9 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
 int btf_get_fd_by_id(u32 id);
 u32 btf_obj_id(const struct btf *btf);
 bool btf_is_kernel(const struct btf *btf);
+bool btf_is_module(const struct btf *btf);
+struct module *btf_try_get_module(const struct btf *btf);
+u32 btf_nr_types(const struct btf *btf);
 bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 8d6bdb4f4d61..7ccc0133723a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -458,7 +458,7 @@ static bool btf_type_is_datasec(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
 }
 
-static u32 btf_nr_types_total(const struct btf *btf)
+u32 btf_nr_types(const struct btf *btf)
 {
 	u32 total = 0;
 
@@ -476,7 +476,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
 	const char *tname;
 	u32 i, total;
 
-	total = btf_nr_types_total(btf);
+	total = btf_nr_types(btf);
 	for (i = 1; i < total; i++) {
 		t = btf_type_by_id(btf, i);
 		if (BTF_INFO_KIND(t->info) != kind)
@@ -5743,6 +5743,11 @@ bool btf_is_kernel(const struct btf *btf)
 	return btf->kernel_btf;
 }
 
+bool btf_is_module(const struct btf *btf)
+{
+	return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
+}
+
 static int btf_id_cmp_func(const void *a, const void *b)
 {
 	const int *pa = a, *pb = b;
@@ -5877,3 +5882,25 @@ static int __init btf_module_init(void)
 
 fs_initcall(btf_module_init);
 #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
+
+struct module *btf_try_get_module(const struct btf *btf)
+{
+	struct module *res = NULL;
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+	struct btf_module *btf_mod, *tmp;
+
+	mutex_lock(&btf_module_mutex);
+	list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+		if (btf_mod->btf != btf)
+			continue;
+
+		if (try_module_get(btf_mod->module))
+			res = btf_mod->module;
+
+		break;
+	}
+	mutex_unlock(&btf_module_mutex);
+#endif
+
+	return res;
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 261f8692d0d2..69c3c308de5e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2119,6 +2119,28 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux)
 	kfree(aux->used_maps);
 }
 
+void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
+			  struct btf_mod_pair *used_btfs, u32 len)
+{
+#ifdef CONFIG_BPF_SYSCALL
+	struct btf_mod_pair *btf_mod;
+	u32 i;
+
+	for (i = 0; i < len; i++) {
+		btf_mod = &used_btfs[i];
+		if (btf_mod->module)
+			module_put(btf_mod->module);
+		btf_put(btf_mod->btf);
+	}
+#endif
+}
+
+static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
+{
+	__bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
+	kfree(aux->used_btfs);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
@@ -2126,6 +2148,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 
 	aux = container_of(work, struct bpf_prog_aux, work);
 	bpf_free_used_maps(aux);
+	bpf_free_used_btfs(aux);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5534e667bdb1..ae2aee48cf82 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9703,6 +9703,36 @@ process_bpf_exit:
 	return 0;
 }
 
+static int find_btf_percpu_datasec(struct btf *btf)
+{
+	const struct btf_type *t;
+	const char *tname;
+	int i, n;
+
+	/*
+	 * Both vmlinux and module each have their own ".data..percpu"
+	 * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF
+	 * types to look at only module's own BTF types.
+	 */
+	n = btf_nr_types(btf);
+	if (btf_is_module(btf))
+		i = btf_nr_types(btf_vmlinux);
+	else
+		i = 1;
+
+	for(; i < n; i++) {
+		t = btf_type_by_id(btf, i);
+		if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC)
+			continue;
+
+		tname = btf_name_by_offset(btf, t->name_off);
+		if (!strcmp(tname, ".data..percpu"))
+			return i;
+	}
+
+	return -ENOENT;
+}
+
 /* replace pseudo btf_id with kernel symbol address */
 static int check_pseudo_btf_id(struct bpf_verifier_env *env,
 			       struct bpf_insn *insn,
@@ -9710,48 +9740,57 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
 {
 	const struct btf_var_secinfo *vsi;
 	const struct btf_type *datasec;
+	struct btf_mod_pair *btf_mod;
 	const struct btf_type *t;
 	const char *sym_name;
 	bool percpu = false;
 	u32 type, id = insn->imm;
+	struct btf *btf;
 	s32 datasec_id;
 	u64 addr;
-	int i;
-
-	if (!btf_vmlinux) {
-		verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
-		return -EINVAL;
-	}
+	int i, btf_fd, err;
 
-	if (insn[1].imm != 0) {
-		verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n");
-		return -EINVAL;
+	btf_fd = insn[1].imm;
+	if (btf_fd) {
+		btf = btf_get_by_fd(btf_fd);
+		if (IS_ERR(btf)) {
+			verbose(env, "invalid module BTF object FD specified.\n");
+			return -EINVAL;
+		}
+	} else {
+		if (!btf_vmlinux) {
+			verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
+			return -EINVAL;
+		}
+		btf = btf_vmlinux;
+		btf_get(btf);
 	}
 
-	t = btf_type_by_id(btf_vmlinux, id);
+	t = btf_type_by_id(btf, id);
 	if (!t) {
 		verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
-		return -ENOENT;
+		err = -ENOENT;
+		goto err_put;
 	}
 
 	if (!btf_type_is_var(t)) {
-		verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n",
-			id);
-		return -EINVAL;
+		verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id);
+		err = -EINVAL;
+		goto err_put;
 	}
 
-	sym_name = btf_name_by_offset(btf_vmlinux, t->name_off);
+	sym_name = btf_name_by_offset(btf, t->name_off);
 	addr = kallsyms_lookup_name(sym_name);
 	if (!addr) {
 		verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
 			sym_name);
-		return -ENOENT;
+		err = -ENOENT;
+		goto err_put;
 	}
 
-	datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu",
-					   BTF_KIND_DATASEC);
+	datasec_id = find_btf_percpu_datasec(btf);
 	if (datasec_id > 0) {
-		datasec = btf_type_by_id(btf_vmlinux, datasec_id);
+		datasec = btf_type_by_id(btf, datasec_id);
 		for_each_vsi(i, datasec, vsi) {
 			if (vsi->type == id) {
 				percpu = true;
@@ -9764,10 +9803,10 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
 	insn[1].imm = addr >> 32;
 
 	type = t->type;
-	t = btf_type_skip_modifiers(btf_vmlinux, type, NULL);
+	t = btf_type_skip_modifiers(btf, type, NULL);
 	if (percpu) {
 		aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
-		aux->btf_var.btf = btf_vmlinux;
+		aux->btf_var.btf = btf;
 		aux->btf_var.btf_id = type;
 	} else if (!btf_type_is_struct(t)) {
 		const struct btf_type *ret;
@@ -9775,21 +9814,54 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
 		u32 tsize;
 
 		/* resolve the type size of ksym. */
-		ret = btf_resolve_size(btf_vmlinux, t, &tsize);
+		ret = btf_resolve_size(btf, t, &tsize);
 		if (IS_ERR(ret)) {
-			tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+			tname = btf_name_by_offset(btf, t->name_off);
 			verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
 				tname, PTR_ERR(ret));
-			return -EINVAL;
+			err = -EINVAL;
+			goto err_put;
 		}
 		aux->btf_var.reg_type = PTR_TO_MEM;
 		aux->btf_var.mem_size = tsize;
 	} else {
 		aux->btf_var.reg_type = PTR_TO_BTF_ID;
-		aux->btf_var.btf = btf_vmlinux;
+		aux->btf_var.btf = btf;
 		aux->btf_var.btf_id = type;
 	}
+
+	/* check whether we recorded this BTF (and maybe module) already */
+	for (i = 0; i < env->used_btf_cnt; i++) {
+		if (env->used_btfs[i].btf == btf) {
+			btf_put(btf);
+			return 0;
+		}
+	}
+
+	if (env->used_btf_cnt >= MAX_USED_BTFS) {
+		err = -E2BIG;
+		goto err_put;
+	}
+
+	btf_mod = &env->used_btfs[env->used_btf_cnt];
+	btf_mod->btf = btf;
+	btf_mod->module = NULL;
+
+	/* if we reference variables from kernel module, bump its refcount */
+	if (btf_is_module(btf)) {
+		btf_mod->module = btf_try_get_module(btf);
+		if (!btf_mod->module) {
+			err = -ENXIO;
+			goto err_put;
+		}
+	}
+
+	env->used_btf_cnt++;
+
 	return 0;
+err_put:
+	btf_put(btf);
+	return err;
 }
 
 static int check_map_prealloc(struct bpf_map *map)
@@ -10086,6 +10158,13 @@ static void release_maps(struct bpf_verifier_env *env)
 			     env->used_map_cnt);
 }
 
+/* drop refcnt of maps used by the rejected program */
+static void release_btfs(struct bpf_verifier_env *env)
+{
+	__bpf_free_used_btfs(env->prog->aux, env->used_btfs,
+			     env->used_btf_cnt);
+}
+
 /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
 static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
 {
@@ -12098,7 +12177,10 @@ skip_full_check:
 		goto err_release_maps;
 	}
 
-	if (ret == 0 && env->used_map_cnt) {
+	if (ret)
+		goto err_release_maps;
+
+	if (env->used_map_cnt) {
 		/* if program passed verifier, update used_maps in bpf_prog_info */
 		env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
 							  sizeof(env->used_maps[0]),
@@ -12112,15 +12194,29 @@ skip_full_check:
 		memcpy(env->prog->aux->used_maps, env->used_maps,
 		       sizeof(env->used_maps[0]) * env->used_map_cnt);
 		env->prog->aux->used_map_cnt = env->used_map_cnt;
+	}
+	if (env->used_btf_cnt) {
+		/* if program passed verifier, update used_btfs in bpf_prog_aux */
+		env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
+							  sizeof(env->used_btfs[0]),
+							  GFP_KERNEL);
+		if (!env->prog->aux->used_btfs) {
+			ret = -ENOMEM;
+			goto err_release_maps;
+		}
 
+		memcpy(env->prog->aux->used_btfs, env->used_btfs,
+		       sizeof(env->used_btfs[0]) * env->used_btf_cnt);
+		env->prog->aux->used_btf_cnt = env->used_btf_cnt;
+	}
+	if (env->used_map_cnt || env->used_btf_cnt) {
 		/* program is valid. Convert pseudo bpf_ld_imm64 into generic
 		 * bpf_ld_imm64 instructions
 		 */
 		convert_pseudo_ld_imm64(env);
 	}
 
-	if (ret == 0)
-		adjust_btf_func(env);
+	adjust_btf_func(env);
 
 err_release_maps:
 	if (!env->prog->aux->used_maps)
@@ -12128,6 +12224,8 @@ err_release_maps:
 		 * them now. Otherwise free_used_maps() will release them.
 		 */
 		release_maps(env);
+	if (!env->prog->aux->used_btfs)
+		release_btfs(env);
 
 	/* extension progs temporarily inherit the attach_type of their targets
 	   for verification purposes, so set it back to zero before returning
-- 
cgit v1.2.3


From 5a9d5ecd69ed65fc2d49cdecfc1c1ab1e4d7af34 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 11 Jan 2021 15:19:18 +0100
Subject: can: dev: move bittiming related code into seperate file

This patch moves the bittiming related code of the CAN device infrastructure
into a separate file.

Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/r/20210111141930.693847-4-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 MAINTAINERS                     |   1 +
 drivers/net/can/dev/Makefile    |   1 +
 drivers/net/can/dev/bittiming.c | 261 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/can/dev/dev.c       | 261 ----------------------------------------
 include/linux/can/bittiming.h   |  44 +++++++
 include/linux/can/dev.h         |  16 +--
 6 files changed, 308 insertions(+), 276 deletions(-)
 create mode 100644 drivers/net/can/dev/bittiming.c
 create mode 100644 include/linux/can/bittiming.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index c3091a91ebbf..d17662df1cd7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3943,6 +3943,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next.git
 F:	Documentation/devicetree/bindings/net/can/
 F:	drivers/net/can/
+F:	include/linux/can/bittiming.h
 F:	include/linux/can/dev.h
 F:	include/linux/can/led.h
 F:	include/linux/can/platform/
diff --git a/drivers/net/can/dev/Makefile b/drivers/net/can/dev/Makefile
index cba92e6bcf6f..b5c6bb848d9d 100644
--- a/drivers/net/can/dev/Makefile
+++ b/drivers/net/can/dev/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_CAN_DEV)		+= can-dev.o
+can-dev-y			+= bittiming.o
 can-dev-y			+= dev.o
 can-dev-y			+= rx-offload.o
 
diff --git a/drivers/net/can/dev/bittiming.c b/drivers/net/can/dev/bittiming.c
new file mode 100644
index 000000000000..f7fe226bb395
--- /dev/null
+++ b/drivers/net/can/dev/bittiming.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix
+ * Copyright (C) 2006 Andrey Volkov, Varma Electronics
+ * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com>
+ */
+
+#include <linux/can/dev.h>
+
+#ifdef CONFIG_CAN_CALC_BITTIMING
+#define CAN_CALC_MAX_ERROR 50 /* in one-tenth of a percent */
+
+/* Bit-timing calculation derived from:
+ *
+ * Code based on LinCAN sources and H8S2638 project
+ * Copyright 2004-2006 Pavel Pisa - DCE FELK CVUT cz
+ * Copyright 2005      Stanislav Marek
+ * email: pisa@cmp.felk.cvut.cz
+ *
+ * Calculates proper bit-timing parameters for a specified bit-rate
+ * and sample-point, which can then be used to set the bit-timing
+ * registers of the CAN controller. You can find more information
+ * in the header file linux/can/netlink.h.
+ */
+static int
+can_update_sample_point(const struct can_bittiming_const *btc,
+			unsigned int sample_point_nominal, unsigned int tseg,
+			unsigned int *tseg1_ptr, unsigned int *tseg2_ptr,
+			unsigned int *sample_point_error_ptr)
+{
+	unsigned int sample_point_error, best_sample_point_error = UINT_MAX;
+	unsigned int sample_point, best_sample_point = 0;
+	unsigned int tseg1, tseg2;
+	int i;
+
+	for (i = 0; i <= 1; i++) {
+		tseg2 = tseg + CAN_SYNC_SEG -
+			(sample_point_nominal * (tseg + CAN_SYNC_SEG)) /
+			1000 - i;
+		tseg2 = clamp(tseg2, btc->tseg2_min, btc->tseg2_max);
+		tseg1 = tseg - tseg2;
+		if (tseg1 > btc->tseg1_max) {
+			tseg1 = btc->tseg1_max;
+			tseg2 = tseg - tseg1;
+		}
+
+		sample_point = 1000 * (tseg + CAN_SYNC_SEG - tseg2) /
+			(tseg + CAN_SYNC_SEG);
+		sample_point_error = abs(sample_point_nominal - sample_point);
+
+		if (sample_point <= sample_point_nominal &&
+		    sample_point_error < best_sample_point_error) {
+			best_sample_point = sample_point;
+			best_sample_point_error = sample_point_error;
+			*tseg1_ptr = tseg1;
+			*tseg2_ptr = tseg2;
+		}
+	}
+
+	if (sample_point_error_ptr)
+		*sample_point_error_ptr = best_sample_point_error;
+
+	return best_sample_point;
+}
+
+int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
+		       const struct can_bittiming_const *btc)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	unsigned int bitrate;			/* current bitrate */
+	unsigned int bitrate_error;		/* difference between current and nominal value */
+	unsigned int best_bitrate_error = UINT_MAX;
+	unsigned int sample_point_error;	/* difference between current and nominal value */
+	unsigned int best_sample_point_error = UINT_MAX;
+	unsigned int sample_point_nominal;	/* nominal sample point */
+	unsigned int best_tseg = 0;		/* current best value for tseg */
+	unsigned int best_brp = 0;		/* current best value for brp */
+	unsigned int brp, tsegall, tseg, tseg1 = 0, tseg2 = 0;
+	u64 v64;
+
+	/* Use CiA recommended sample points */
+	if (bt->sample_point) {
+		sample_point_nominal = bt->sample_point;
+	} else {
+		if (bt->bitrate > 800000)
+			sample_point_nominal = 750;
+		else if (bt->bitrate > 500000)
+			sample_point_nominal = 800;
+		else
+			sample_point_nominal = 875;
+	}
+
+	/* tseg even = round down, odd = round up */
+	for (tseg = (btc->tseg1_max + btc->tseg2_max) * 2 + 1;
+	     tseg >= (btc->tseg1_min + btc->tseg2_min) * 2; tseg--) {
+		tsegall = CAN_SYNC_SEG + tseg / 2;
+
+		/* Compute all possible tseg choices (tseg=tseg1+tseg2) */
+		brp = priv->clock.freq / (tsegall * bt->bitrate) + tseg % 2;
+
+		/* choose brp step which is possible in system */
+		brp = (brp / btc->brp_inc) * btc->brp_inc;
+		if (brp < btc->brp_min || brp > btc->brp_max)
+			continue;
+
+		bitrate = priv->clock.freq / (brp * tsegall);
+		bitrate_error = abs(bt->bitrate - bitrate);
+
+		/* tseg brp biterror */
+		if (bitrate_error > best_bitrate_error)
+			continue;
+
+		/* reset sample point error if we have a better bitrate */
+		if (bitrate_error < best_bitrate_error)
+			best_sample_point_error = UINT_MAX;
+
+		can_update_sample_point(btc, sample_point_nominal, tseg / 2,
+					&tseg1, &tseg2, &sample_point_error);
+		if (sample_point_error > best_sample_point_error)
+			continue;
+
+		best_sample_point_error = sample_point_error;
+		best_bitrate_error = bitrate_error;
+		best_tseg = tseg / 2;
+		best_brp = brp;
+
+		if (bitrate_error == 0 && sample_point_error == 0)
+			break;
+	}
+
+	if (best_bitrate_error) {
+		/* Error in one-tenth of a percent */
+		v64 = (u64)best_bitrate_error * 1000;
+		do_div(v64, bt->bitrate);
+		bitrate_error = (u32)v64;
+		if (bitrate_error > CAN_CALC_MAX_ERROR) {
+			netdev_err(dev,
+				   "bitrate error %d.%d%% too high\n",
+				   bitrate_error / 10, bitrate_error % 10);
+			return -EDOM;
+		}
+		netdev_warn(dev, "bitrate error %d.%d%%\n",
+			    bitrate_error / 10, bitrate_error % 10);
+	}
+
+	/* real sample point */
+	bt->sample_point = can_update_sample_point(btc, sample_point_nominal,
+						   best_tseg, &tseg1, &tseg2,
+						   NULL);
+
+	v64 = (u64)best_brp * 1000 * 1000 * 1000;
+	do_div(v64, priv->clock.freq);
+	bt->tq = (u32)v64;
+	bt->prop_seg = tseg1 / 2;
+	bt->phase_seg1 = tseg1 - bt->prop_seg;
+	bt->phase_seg2 = tseg2;
+
+	/* check for sjw user settings */
+	if (!bt->sjw || !btc->sjw_max) {
+		bt->sjw = 1;
+	} else {
+		/* bt->sjw is at least 1 -> sanitize upper bound to sjw_max */
+		if (bt->sjw > btc->sjw_max)
+			bt->sjw = btc->sjw_max;
+		/* bt->sjw must not be higher than tseg2 */
+		if (tseg2 < bt->sjw)
+			bt->sjw = tseg2;
+	}
+
+	bt->brp = best_brp;
+
+	/* real bitrate */
+	bt->bitrate = priv->clock.freq /
+		(bt->brp * (CAN_SYNC_SEG + tseg1 + tseg2));
+
+	return 0;
+}
+#endif /* CONFIG_CAN_CALC_BITTIMING */
+
+/* Checks the validity of the specified bit-timing parameters prop_seg,
+ * phase_seg1, phase_seg2 and sjw and tries to determine the bitrate
+ * prescaler value brp. You can find more information in the header
+ * file linux/can/netlink.h.
+ */
+static int can_fixup_bittiming(struct net_device *dev, struct can_bittiming *bt,
+			       const struct can_bittiming_const *btc)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	int tseg1, alltseg;
+	u64 brp64;
+
+	tseg1 = bt->prop_seg + bt->phase_seg1;
+	if (!bt->sjw)
+		bt->sjw = 1;
+	if (bt->sjw > btc->sjw_max ||
+	    tseg1 < btc->tseg1_min || tseg1 > btc->tseg1_max ||
+	    bt->phase_seg2 < btc->tseg2_min || bt->phase_seg2 > btc->tseg2_max)
+		return -ERANGE;
+
+	brp64 = (u64)priv->clock.freq * (u64)bt->tq;
+	if (btc->brp_inc > 1)
+		do_div(brp64, btc->brp_inc);
+	brp64 += 500000000UL - 1;
+	do_div(brp64, 1000000000UL); /* the practicable BRP */
+	if (btc->brp_inc > 1)
+		brp64 *= btc->brp_inc;
+	bt->brp = (u32)brp64;
+
+	if (bt->brp < btc->brp_min || bt->brp > btc->brp_max)
+		return -EINVAL;
+
+	alltseg = bt->prop_seg + bt->phase_seg1 + bt->phase_seg2 + 1;
+	bt->bitrate = priv->clock.freq / (bt->brp * alltseg);
+	bt->sample_point = ((tseg1 + 1) * 1000) / alltseg;
+
+	return 0;
+}
+
+/* Checks the validity of predefined bitrate settings */
+static int
+can_validate_bitrate(struct net_device *dev, struct can_bittiming *bt,
+		     const u32 *bitrate_const,
+		     const unsigned int bitrate_const_cnt)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	unsigned int i;
+
+	for (i = 0; i < bitrate_const_cnt; i++) {
+		if (bt->bitrate == bitrate_const[i])
+			break;
+	}
+
+	if (i >= priv->bitrate_const_cnt)
+		return -EINVAL;
+
+	return 0;
+}
+
+int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt,
+		      const struct can_bittiming_const *btc,
+		      const u32 *bitrate_const,
+		      const unsigned int bitrate_const_cnt)
+{
+	int err;
+
+	/* Depending on the given can_bittiming parameter structure the CAN
+	 * timing parameters are calculated based on the provided bitrate OR
+	 * alternatively the CAN timing parameters (tq, prop_seg, etc.) are
+	 * provided directly which are then checked and fixed up.
+	 */
+	if (!bt->tq && bt->bitrate && btc)
+		err = can_calc_bittiming(dev, bt, btc);
+	else if (bt->tq && !bt->bitrate && btc)
+		err = can_fixup_bittiming(dev, bt, btc);
+	else if (!bt->tq && bt->bitrate && bitrate_const)
+		err = can_validate_bitrate(dev, bt, bitrate_const,
+					   bitrate_const_cnt);
+	else
+		err = -EINVAL;
+
+	return err;
+}
diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 3486704c8a95..1b3ab95b3fd1 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -58,267 +58,6 @@ u8 can_fd_len2dlc(u8 len)
 }
 EXPORT_SYMBOL_GPL(can_fd_len2dlc);
 
-#ifdef CONFIG_CAN_CALC_BITTIMING
-#define CAN_CALC_MAX_ERROR 50 /* in one-tenth of a percent */
-
-/* Bit-timing calculation derived from:
- *
- * Code based on LinCAN sources and H8S2638 project
- * Copyright 2004-2006 Pavel Pisa - DCE FELK CVUT cz
- * Copyright 2005      Stanislav Marek
- * email: pisa@cmp.felk.cvut.cz
- *
- * Calculates proper bit-timing parameters for a specified bit-rate
- * and sample-point, which can then be used to set the bit-timing
- * registers of the CAN controller. You can find more information
- * in the header file linux/can/netlink.h.
- */
-static int
-can_update_sample_point(const struct can_bittiming_const *btc,
-			unsigned int sample_point_nominal, unsigned int tseg,
-			unsigned int *tseg1_ptr, unsigned int *tseg2_ptr,
-			unsigned int *sample_point_error_ptr)
-{
-	unsigned int sample_point_error, best_sample_point_error = UINT_MAX;
-	unsigned int sample_point, best_sample_point = 0;
-	unsigned int tseg1, tseg2;
-	int i;
-
-	for (i = 0; i <= 1; i++) {
-		tseg2 = tseg + CAN_SYNC_SEG -
-			(sample_point_nominal * (tseg + CAN_SYNC_SEG)) /
-			1000 - i;
-		tseg2 = clamp(tseg2, btc->tseg2_min, btc->tseg2_max);
-		tseg1 = tseg - tseg2;
-		if (tseg1 > btc->tseg1_max) {
-			tseg1 = btc->tseg1_max;
-			tseg2 = tseg - tseg1;
-		}
-
-		sample_point = 1000 * (tseg + CAN_SYNC_SEG - tseg2) /
-			(tseg + CAN_SYNC_SEG);
-		sample_point_error = abs(sample_point_nominal - sample_point);
-
-		if (sample_point <= sample_point_nominal &&
-		    sample_point_error < best_sample_point_error) {
-			best_sample_point = sample_point;
-			best_sample_point_error = sample_point_error;
-			*tseg1_ptr = tseg1;
-			*tseg2_ptr = tseg2;
-		}
-	}
-
-	if (sample_point_error_ptr)
-		*sample_point_error_ptr = best_sample_point_error;
-
-	return best_sample_point;
-}
-
-static int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
-			      const struct can_bittiming_const *btc)
-{
-	struct can_priv *priv = netdev_priv(dev);
-	unsigned int bitrate;			/* current bitrate */
-	unsigned int bitrate_error;		/* difference between current and nominal value */
-	unsigned int best_bitrate_error = UINT_MAX;
-	unsigned int sample_point_error;	/* difference between current and nominal value */
-	unsigned int best_sample_point_error = UINT_MAX;
-	unsigned int sample_point_nominal;	/* nominal sample point */
-	unsigned int best_tseg = 0;		/* current best value for tseg */
-	unsigned int best_brp = 0;		/* current best value for brp */
-	unsigned int brp, tsegall, tseg, tseg1 = 0, tseg2 = 0;
-	u64 v64;
-
-	/* Use CiA recommended sample points */
-	if (bt->sample_point) {
-		sample_point_nominal = bt->sample_point;
-	} else {
-		if (bt->bitrate > 800000)
-			sample_point_nominal = 750;
-		else if (bt->bitrate > 500000)
-			sample_point_nominal = 800;
-		else
-			sample_point_nominal = 875;
-	}
-
-	/* tseg even = round down, odd = round up */
-	for (tseg = (btc->tseg1_max + btc->tseg2_max) * 2 + 1;
-	     tseg >= (btc->tseg1_min + btc->tseg2_min) * 2; tseg--) {
-		tsegall = CAN_SYNC_SEG + tseg / 2;
-
-		/* Compute all possible tseg choices (tseg=tseg1+tseg2) */
-		brp = priv->clock.freq / (tsegall * bt->bitrate) + tseg % 2;
-
-		/* choose brp step which is possible in system */
-		brp = (brp / btc->brp_inc) * btc->brp_inc;
-		if (brp < btc->brp_min || brp > btc->brp_max)
-			continue;
-
-		bitrate = priv->clock.freq / (brp * tsegall);
-		bitrate_error = abs(bt->bitrate - bitrate);
-
-		/* tseg brp biterror */
-		if (bitrate_error > best_bitrate_error)
-			continue;
-
-		/* reset sample point error if we have a better bitrate */
-		if (bitrate_error < best_bitrate_error)
-			best_sample_point_error = UINT_MAX;
-
-		can_update_sample_point(btc, sample_point_nominal, tseg / 2,
-					&tseg1, &tseg2, &sample_point_error);
-		if (sample_point_error > best_sample_point_error)
-			continue;
-
-		best_sample_point_error = sample_point_error;
-		best_bitrate_error = bitrate_error;
-		best_tseg = tseg / 2;
-		best_brp = brp;
-
-		if (bitrate_error == 0 && sample_point_error == 0)
-			break;
-	}
-
-	if (best_bitrate_error) {
-		/* Error in one-tenth of a percent */
-		v64 = (u64)best_bitrate_error * 1000;
-		do_div(v64, bt->bitrate);
-		bitrate_error = (u32)v64;
-		if (bitrate_error > CAN_CALC_MAX_ERROR) {
-			netdev_err(dev,
-				   "bitrate error %d.%d%% too high\n",
-				   bitrate_error / 10, bitrate_error % 10);
-			return -EDOM;
-		}
-		netdev_warn(dev, "bitrate error %d.%d%%\n",
-			    bitrate_error / 10, bitrate_error % 10);
-	}
-
-	/* real sample point */
-	bt->sample_point = can_update_sample_point(btc, sample_point_nominal,
-						   best_tseg, &tseg1, &tseg2,
-						   NULL);
-
-	v64 = (u64)best_brp * 1000 * 1000 * 1000;
-	do_div(v64, priv->clock.freq);
-	bt->tq = (u32)v64;
-	bt->prop_seg = tseg1 / 2;
-	bt->phase_seg1 = tseg1 - bt->prop_seg;
-	bt->phase_seg2 = tseg2;
-
-	/* check for sjw user settings */
-	if (!bt->sjw || !btc->sjw_max) {
-		bt->sjw = 1;
-	} else {
-		/* bt->sjw is at least 1 -> sanitize upper bound to sjw_max */
-		if (bt->sjw > btc->sjw_max)
-			bt->sjw = btc->sjw_max;
-		/* bt->sjw must not be higher than tseg2 */
-		if (tseg2 < bt->sjw)
-			bt->sjw = tseg2;
-	}
-
-	bt->brp = best_brp;
-
-	/* real bitrate */
-	bt->bitrate = priv->clock.freq /
-		(bt->brp * (CAN_SYNC_SEG + tseg1 + tseg2));
-
-	return 0;
-}
-#else /* !CONFIG_CAN_CALC_BITTIMING */
-static int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
-			      const struct can_bittiming_const *btc)
-{
-	netdev_err(dev, "bit-timing calculation not available\n");
-	return -EINVAL;
-}
-#endif /* CONFIG_CAN_CALC_BITTIMING */
-
-/* Checks the validity of the specified bit-timing parameters prop_seg,
- * phase_seg1, phase_seg2 and sjw and tries to determine the bitrate
- * prescaler value brp. You can find more information in the header
- * file linux/can/netlink.h.
- */
-static int can_fixup_bittiming(struct net_device *dev, struct can_bittiming *bt,
-			       const struct can_bittiming_const *btc)
-{
-	struct can_priv *priv = netdev_priv(dev);
-	int tseg1, alltseg;
-	u64 brp64;
-
-	tseg1 = bt->prop_seg + bt->phase_seg1;
-	if (!bt->sjw)
-		bt->sjw = 1;
-	if (bt->sjw > btc->sjw_max ||
-	    tseg1 < btc->tseg1_min || tseg1 > btc->tseg1_max ||
-	    bt->phase_seg2 < btc->tseg2_min || bt->phase_seg2 > btc->tseg2_max)
-		return -ERANGE;
-
-	brp64 = (u64)priv->clock.freq * (u64)bt->tq;
-	if (btc->brp_inc > 1)
-		do_div(brp64, btc->brp_inc);
-	brp64 += 500000000UL - 1;
-	do_div(brp64, 1000000000UL); /* the practicable BRP */
-	if (btc->brp_inc > 1)
-		brp64 *= btc->brp_inc;
-	bt->brp = (u32)brp64;
-
-	if (bt->brp < btc->brp_min || bt->brp > btc->brp_max)
-		return -EINVAL;
-
-	alltseg = bt->prop_seg + bt->phase_seg1 + bt->phase_seg2 + 1;
-	bt->bitrate = priv->clock.freq / (bt->brp * alltseg);
-	bt->sample_point = ((tseg1 + 1) * 1000) / alltseg;
-
-	return 0;
-}
-
-/* Checks the validity of predefined bitrate settings */
-static int
-can_validate_bitrate(struct net_device *dev, struct can_bittiming *bt,
-		     const u32 *bitrate_const,
-		     const unsigned int bitrate_const_cnt)
-{
-	struct can_priv *priv = netdev_priv(dev);
-	unsigned int i;
-
-	for (i = 0; i < bitrate_const_cnt; i++) {
-		if (bt->bitrate == bitrate_const[i])
-			break;
-	}
-
-	if (i >= priv->bitrate_const_cnt)
-		return -EINVAL;
-
-	return 0;
-}
-
-static int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt,
-			     const struct can_bittiming_const *btc,
-			     const u32 *bitrate_const,
-			     const unsigned int bitrate_const_cnt)
-{
-	int err;
-
-	/* Depending on the given can_bittiming parameter structure the CAN
-	 * timing parameters are calculated based on the provided bitrate OR
-	 * alternatively the CAN timing parameters (tq, prop_seg, etc.) are
-	 * provided directly which are then checked and fixed up.
-	 */
-	if (!bt->tq && bt->bitrate && btc)
-		err = can_calc_bittiming(dev, bt, btc);
-	else if (bt->tq && !bt->bitrate && btc)
-		err = can_fixup_bittiming(dev, bt, btc);
-	else if (!bt->tq && bt->bitrate && bitrate_const)
-		err = can_validate_bitrate(dev, bt, bitrate_const,
-					   bitrate_const_cnt);
-	else
-		err = -EINVAL;
-
-	return err;
-}
-
 static void can_update_state_error_stats(struct net_device *dev,
 					 enum can_state new_state)
 {
diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
new file mode 100644
index 000000000000..707575c668f4
--- /dev/null
+++ b/include/linux/can/bittiming.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2020 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de>
+ */
+
+#ifndef _CAN_BITTIMING_H
+#define _CAN_BITTIMING_H
+
+#include <linux/netdevice.h>
+#include <linux/can/netlink.h>
+
+#define CAN_SYNC_SEG 1
+
+#ifdef CONFIG_CAN_CALC_BITTIMING
+int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
+		       const struct can_bittiming_const *btc);
+#else /* !CONFIG_CAN_CALC_BITTIMING */
+static inline int
+can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
+		   const struct can_bittiming_const *btc)
+{
+	netdev_err(dev, "bit-timing calculation not available\n");
+	return -EINVAL;
+}
+#endif /* CONFIG_CAN_CALC_BITTIMING */
+
+int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt,
+		      const struct can_bittiming_const *btc,
+		      const u32 *bitrate_const,
+		      const unsigned int bitrate_const_cnt);
+
+/*
+ * can_bit_time() - Duration of one bit
+ *
+ * Please refer to ISO 11898-1:2015, section 11.3.1.1 "Bit time" for
+ * additional information.
+ *
+ * Return: the number of time quanta in one bit.
+ */
+static inline unsigned int can_bit_time(const struct can_bittiming *bt)
+{
+	return CAN_SYNC_SEG + bt->prop_seg + bt->phase_seg1 + bt->phase_seg2;
+}
+
+#endif /* !_CAN_BITTIMING_H */
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 197a79535cc2..054c3bed190b 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -15,6 +15,7 @@
 #define _CAN_DEV_H
 
 #include <linux/can.h>
+#include <linux/can/bittiming.h>
 #include <linux/can/error.h>
 #include <linux/can/led.h>
 #include <linux/can/netlink.h>
@@ -82,21 +83,6 @@ struct can_priv {
 #endif
 };
 
-#define CAN_SYNC_SEG 1
-
-/*
- * can_bit_time() - Duration of one bit
- *
- * Please refer to ISO 11898-1:2015, section 11.3.1.1 "Bit time" for
- * additional information.
- *
- * Return: the number of time quanta in one bit.
- */
-static inline unsigned int can_bit_time(const struct can_bittiming *bt)
-{
-	return CAN_SYNC_SEG + bt->prop_seg + bt->phase_seg1 + bt->phase_seg2;
-}
-
 /*
  * can_cc_dlc2len(value) - convert a given data length code (dlc) of a
  * Classical CAN frame into a valid data length of max. 8 bytes.
-- 
cgit v1.2.3


From bdd2e413192dd5f2153d166cd907b048cce872e8 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 11 Jan 2021 15:19:19 +0100
Subject: can: dev: move length related code into seperate file

This patch moves all CAN frame length related code of the CAN device
infrastructure into a separate file.

Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/r/20210111141930.693847-5-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 MAINTAINERS                  |  1 +
 drivers/net/can/dev/Makefile |  1 +
 drivers/net/can/dev/dev.c    | 33 ------------------------------
 drivers/net/can/dev/length.c | 38 +++++++++++++++++++++++++++++++++++
 include/linux/can/dev.h      | 41 +------------------------------------
 include/linux/can/length.h   | 48 ++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 89 insertions(+), 73 deletions(-)
 create mode 100644 drivers/net/can/dev/length.c
 create mode 100644 include/linux/can/length.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index d17662df1cd7..21780c4e2e71 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3946,6 +3946,7 @@ F:	drivers/net/can/
 F:	include/linux/can/bittiming.h
 F:	include/linux/can/dev.h
 F:	include/linux/can/led.h
+F:	include/linux/can/length.h
 F:	include/linux/can/platform/
 F:	include/linux/can/rx-offload.h
 F:	include/uapi/linux/can/error.h
diff --git a/drivers/net/can/dev/Makefile b/drivers/net/can/dev/Makefile
index b5c6bb848d9d..5c647951e06d 100644
--- a/drivers/net/can/dev/Makefile
+++ b/drivers/net/can/dev/Makefile
@@ -3,6 +3,7 @@
 obj-$(CONFIG_CAN_DEV)		+= can-dev.o
 can-dev-y			+= bittiming.o
 can-dev-y			+= dev.o
+can-dev-y			+= length.o
 can-dev-y			+= rx-offload.o
 
 can-dev-$(CONFIG_CAN_LEDS)	+= led.o
diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 1b3ab95b3fd1..3372e99d5db7 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -25,39 +25,6 @@ MODULE_DESCRIPTION(MOD_DESC);
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Wolfgang Grandegger <wg@grandegger.com>");
 
-/* CAN DLC to real data length conversion helpers */
-
-static const u8 dlc2len[] = {0, 1, 2, 3, 4, 5, 6, 7,
-			     8, 12, 16, 20, 24, 32, 48, 64};
-
-/* get data length from raw data length code (DLC) */
-u8 can_fd_dlc2len(u8 dlc)
-{
-	return dlc2len[dlc & 0x0F];
-}
-EXPORT_SYMBOL_GPL(can_fd_dlc2len);
-
-static const u8 len2dlc[] = {0, 1, 2, 3, 4, 5, 6, 7, 8,		/* 0 - 8 */
-			     9, 9, 9, 9,			/* 9 - 12 */
-			     10, 10, 10, 10,			/* 13 - 16 */
-			     11, 11, 11, 11,			/* 17 - 20 */
-			     12, 12, 12, 12,			/* 21 - 24 */
-			     13, 13, 13, 13, 13, 13, 13, 13,	/* 25 - 32 */
-			     14, 14, 14, 14, 14, 14, 14, 14,	/* 33 - 40 */
-			     14, 14, 14, 14, 14, 14, 14, 14,	/* 41 - 48 */
-			     15, 15, 15, 15, 15, 15, 15, 15,	/* 49 - 56 */
-			     15, 15, 15, 15, 15, 15, 15, 15};	/* 57 - 64 */
-
-/* map the sanitized data length to an appropriate data length code */
-u8 can_fd_len2dlc(u8 len)
-{
-	if (unlikely(len > 64))
-		return 0xF;
-
-	return len2dlc[len];
-}
-EXPORT_SYMBOL_GPL(can_fd_len2dlc);
-
 static void can_update_state_error_stats(struct net_device *dev,
 					 enum can_state new_state)
 {
diff --git a/drivers/net/can/dev/length.c b/drivers/net/can/dev/length.c
new file mode 100644
index 000000000000..6fe18aa23ec9
--- /dev/null
+++ b/drivers/net/can/dev/length.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2012, 2020 Oliver Hartkopp <socketcan@hartkopp.net>
+ */
+
+#include <linux/can/dev.h>
+
+/* CAN DLC to real data length conversion helpers */
+
+static const u8 dlc2len[] = {0, 1, 2, 3, 4, 5, 6, 7,
+			     8, 12, 16, 20, 24, 32, 48, 64};
+
+/* get data length from raw data length code (DLC) */
+u8 can_fd_dlc2len(u8 dlc)
+{
+	return dlc2len[dlc & 0x0F];
+}
+EXPORT_SYMBOL_GPL(can_fd_dlc2len);
+
+static const u8 len2dlc[] = {0, 1, 2, 3, 4, 5, 6, 7, 8,		/* 0 - 8 */
+			     9, 9, 9, 9,			/* 9 - 12 */
+			     10, 10, 10, 10,			/* 13 - 16 */
+			     11, 11, 11, 11,			/* 17 - 20 */
+			     12, 12, 12, 12,			/* 21 - 24 */
+			     13, 13, 13, 13, 13, 13, 13, 13,	/* 25 - 32 */
+			     14, 14, 14, 14, 14, 14, 14, 14,	/* 33 - 40 */
+			     14, 14, 14, 14, 14, 14, 14, 14,	/* 41 - 48 */
+			     15, 15, 15, 15, 15, 15, 15, 15,	/* 49 - 56 */
+			     15, 15, 15, 15, 15, 15, 15, 15};	/* 57 - 64 */
+
+/* map the sanitized data length to an appropriate data length code */
+u8 can_fd_len2dlc(u8 len)
+{
+	if (unlikely(len > 64))
+		return 0xF;
+
+	return len2dlc[len];
+}
+EXPORT_SYMBOL_GPL(can_fd_len2dlc);
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 054c3bed190b..d75fba1d030a 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -18,6 +18,7 @@
 #include <linux/can/bittiming.h>
 #include <linux/can/error.h>
 #include <linux/can/led.h>
+#include <linux/can/length.h>
 #include <linux/can/netlink.h>
 #include <linux/can/skb.h>
 #include <linux/netdevice.h>
@@ -83,15 +84,6 @@ struct can_priv {
 #endif
 };
 
-/*
- * can_cc_dlc2len(value) - convert a given data length code (dlc) of a
- * Classical CAN frame into a valid data length of max. 8 bytes.
- *
- * To be used in the CAN netdriver receive path to ensure conformance with
- * ISO 11898-1 Chapter 8.4.2.3 (DLC field)
- */
-#define can_cc_dlc2len(dlc)	(min_t(u8, (dlc), CAN_MAX_DLEN))
-
 /* Check for outgoing skbs that have not been created by the CAN subsystem */
 static inline bool can_skb_headroom_valid(struct net_device *dev,
 					  struct sk_buff *skb)
@@ -156,31 +148,6 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb)
 	return skb->len == CANFD_MTU;
 }
 
-/* helper to get the data length code (DLC) for Classical CAN raw DLC access */
-static inline u8 can_get_cc_dlc(const struct can_frame *cf, const u32 ctrlmode)
-{
-	/* return len8_dlc as dlc value only if all conditions apply */
-	if ((ctrlmode & CAN_CTRLMODE_CC_LEN8_DLC) &&
-	    (cf->len == CAN_MAX_DLEN) &&
-	    (cf->len8_dlc > CAN_MAX_DLEN && cf->len8_dlc <= CAN_MAX_RAW_DLC))
-		return cf->len8_dlc;
-
-	/* return the payload length as dlc value */
-	return cf->len;
-}
-
-/* helper to set len and len8_dlc value for Classical CAN raw DLC access */
-static inline void can_frame_set_cc_len(struct can_frame *cf, const u8 dlc,
-					const u32 ctrlmode)
-{
-	/* the caller already ensured that dlc is a value from 0 .. 15 */
-	if (ctrlmode & CAN_CTRLMODE_CC_LEN8_DLC && dlc > CAN_MAX_DLEN)
-		cf->len8_dlc = dlc;
-
-	/* limit the payload length 'len' to CAN_MAX_DLEN */
-	cf->len = can_cc_dlc2len(dlc);
-}
-
 /* helper to define static CAN controller features at device creation time */
 static inline void can_set_static_ctrlmode(struct net_device *dev,
 					   u32 static_mode)
@@ -196,12 +163,6 @@ static inline void can_set_static_ctrlmode(struct net_device *dev,
 		dev->mtu = CANFD_MTU;
 }
 
-/* get data length from raw data length code (DLC) */
-u8 can_fd_dlc2len(u8 dlc);
-
-/* map the sanitized data length to an appropriate data length code */
-u8 can_fd_len2dlc(u8 len);
-
 struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
 				    unsigned int txqs, unsigned int rxqs);
 #define alloc_candev(sizeof_priv, echo_skb_max) \
diff --git a/include/linux/can/length.h b/include/linux/can/length.h
new file mode 100644
index 000000000000..156b9d17969a
--- /dev/null
+++ b/include/linux/can/length.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2020 Oliver Hartkopp <socketcan@hartkopp.net>
+ */
+
+#ifndef _CAN_LENGTH_H
+#define _CAN_LENGTH_H
+
+/*
+ * can_cc_dlc2len(value) - convert a given data length code (dlc) of a
+ * Classical CAN frame into a valid data length of max. 8 bytes.
+ *
+ * To be used in the CAN netdriver receive path to ensure conformance with
+ * ISO 11898-1 Chapter 8.4.2.3 (DLC field)
+ */
+#define can_cc_dlc2len(dlc)	(min_t(u8, (dlc), CAN_MAX_DLEN))
+
+/* helper to get the data length code (DLC) for Classical CAN raw DLC access */
+static inline u8 can_get_cc_dlc(const struct can_frame *cf, const u32 ctrlmode)
+{
+	/* return len8_dlc as dlc value only if all conditions apply */
+	if ((ctrlmode & CAN_CTRLMODE_CC_LEN8_DLC) &&
+	    (cf->len == CAN_MAX_DLEN) &&
+	    (cf->len8_dlc > CAN_MAX_DLEN && cf->len8_dlc <= CAN_MAX_RAW_DLC))
+		return cf->len8_dlc;
+
+	/* return the payload length as dlc value */
+	return cf->len;
+}
+
+/* helper to set len and len8_dlc value for Classical CAN raw DLC access */
+static inline void can_frame_set_cc_len(struct can_frame *cf, const u8 dlc,
+					const u32 ctrlmode)
+{
+	/* the caller already ensured that dlc is a value from 0 .. 15 */
+	if (ctrlmode & CAN_CTRLMODE_CC_LEN8_DLC && dlc > CAN_MAX_DLEN)
+		cf->len8_dlc = dlc;
+
+	/* limit the payload length 'len' to CAN_MAX_DLEN */
+	cf->len = can_cc_dlc2len(dlc);
+}
+
+/* get data length from raw data length code (DLC) */
+u8 can_fd_dlc2len(u8 dlc);
+
+/* map the sanitized data length to an appropriate data length code */
+u8 can_fd_len2dlc(u8 len);
+
+#endif /* !_CAN_LENGTH_H */
-- 
cgit v1.2.3


From 18f2dbfd2232212f53af9d249682f13a8335d54f Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 11 Jan 2021 15:19:20 +0100
Subject: can: dev: move skb related into seperate file

This patch moves the skb related code of the CAN device infrastructure into a
separate file.

Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/r/20210111141930.693847-6-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/Makefile |   1 +
 drivers/net/can/dev/dev.c    | 213 -----------------------------------------
 drivers/net/can/dev/skb.c    | 220 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/can/dev.h      |  76 ---------------
 include/linux/can/skb.h      |  77 +++++++++++++++
 5 files changed, 298 insertions(+), 289 deletions(-)
 create mode 100644 drivers/net/can/dev/skb.c

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/Makefile b/drivers/net/can/dev/Makefile
index 5c647951e06d..188bd53b113c 100644
--- a/drivers/net/can/dev/Makefile
+++ b/drivers/net/can/dev/Makefile
@@ -5,5 +5,6 @@ can-dev-y			+= bittiming.o
 can-dev-y			+= dev.o
 can-dev-y			+= length.o
 can-dev-y			+= rx-offload.o
+can-dev-y                       += skb.o
 
 can-dev-$(CONFIG_CAN_LEDS)	+= led.o
diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 3372e99d5db7..fe71ff9ef8c9 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -132,149 +132,6 @@ void can_change_state(struct net_device *dev, struct can_frame *cf,
 }
 EXPORT_SYMBOL_GPL(can_change_state);
 
-/* Local echo of CAN messages
- *
- * CAN network devices *should* support a local echo functionality
- * (see Documentation/networking/can.rst). To test the handling of CAN
- * interfaces that do not support the local echo both driver types are
- * implemented. In the case that the driver does not support the echo
- * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core
- * to perform the echo as a fallback solution.
- */
-static void can_flush_echo_skb(struct net_device *dev)
-{
-	struct can_priv *priv = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
-	int i;
-
-	for (i = 0; i < priv->echo_skb_max; i++) {
-		if (priv->echo_skb[i]) {
-			kfree_skb(priv->echo_skb[i]);
-			priv->echo_skb[i] = NULL;
-			stats->tx_dropped++;
-			stats->tx_aborted_errors++;
-		}
-	}
-}
-
-/* Put the skb on the stack to be looped backed locally lateron
- *
- * The function is typically called in the start_xmit function
- * of the device driver. The driver must protect access to
- * priv->echo_skb, if necessary.
- */
-int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
-		     unsigned int idx)
-{
-	struct can_priv *priv = netdev_priv(dev);
-
-	BUG_ON(idx >= priv->echo_skb_max);
-
-	/* check flag whether this packet has to be looped back */
-	if (!(dev->flags & IFF_ECHO) || skb->pkt_type != PACKET_LOOPBACK ||
-	    (skb->protocol != htons(ETH_P_CAN) &&
-	     skb->protocol != htons(ETH_P_CANFD))) {
-		kfree_skb(skb);
-		return 0;
-	}
-
-	if (!priv->echo_skb[idx]) {
-		skb = can_create_echo_skb(skb);
-		if (!skb)
-			return -ENOMEM;
-
-		/* make settings for echo to reduce code in irq context */
-		skb->pkt_type = PACKET_BROADCAST;
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-		skb->dev = dev;
-
-		/* save this skb for tx interrupt echo handling */
-		priv->echo_skb[idx] = skb;
-	} else {
-		/* locking problem with netif_stop_queue() ?? */
-		netdev_err(dev, "%s: BUG! echo_skb %d is occupied!\n", __func__, idx);
-		kfree_skb(skb);
-		return -EBUSY;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(can_put_echo_skb);
-
-struct sk_buff *
-__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr)
-{
-	struct can_priv *priv = netdev_priv(dev);
-
-	if (idx >= priv->echo_skb_max) {
-		netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n",
-			   __func__, idx, priv->echo_skb_max);
-		return NULL;
-	}
-
-	if (priv->echo_skb[idx]) {
-		/* Using "struct canfd_frame::len" for the frame
-		 * length is supported on both CAN and CANFD frames.
-		 */
-		struct sk_buff *skb = priv->echo_skb[idx];
-		struct canfd_frame *cf = (struct canfd_frame *)skb->data;
-
-		/* get the real payload length for netdev statistics */
-		if (cf->can_id & CAN_RTR_FLAG)
-			*len_ptr = 0;
-		else
-			*len_ptr = cf->len;
-
-		priv->echo_skb[idx] = NULL;
-
-		return skb;
-	}
-
-	return NULL;
-}
-
-/* Get the skb from the stack and loop it back locally
- *
- * The function is typically called when the TX done interrupt
- * is handled in the device driver. The driver must protect
- * access to priv->echo_skb, if necessary.
- */
-unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx)
-{
-	struct sk_buff *skb;
-	u8 len;
-
-	skb = __can_get_echo_skb(dev, idx, &len);
-	if (!skb)
-		return 0;
-
-	skb_get(skb);
-	if (netif_rx(skb) == NET_RX_SUCCESS)
-		dev_consume_skb_any(skb);
-	else
-		dev_kfree_skb_any(skb);
-
-	return len;
-}
-EXPORT_SYMBOL_GPL(can_get_echo_skb);
-
-/* Remove the skb from the stack and free it.
- *
- * The function is typically called when TX failed.
- */
-void can_free_echo_skb(struct net_device *dev, unsigned int idx)
-{
-	struct can_priv *priv = netdev_priv(dev);
-
-	BUG_ON(idx >= priv->echo_skb_max);
-
-	if (priv->echo_skb[idx]) {
-		dev_kfree_skb_any(priv->echo_skb[idx]);
-		priv->echo_skb[idx] = NULL;
-	}
-}
-EXPORT_SYMBOL_GPL(can_free_echo_skb);
-
 /* CAN device restart for bus-off recovery */
 static void can_restart(struct net_device *dev)
 {
@@ -379,76 +236,6 @@ static void can_setup(struct net_device *dev)
 	dev->features = NETIF_F_HW_CSUM;
 }
 
-struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf)
-{
-	struct sk_buff *skb;
-
-	skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) +
-			       sizeof(struct can_frame));
-	if (unlikely(!skb))
-		return NULL;
-
-	skb->protocol = htons(ETH_P_CAN);
-	skb->pkt_type = PACKET_BROADCAST;
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-	skb_reset_mac_header(skb);
-	skb_reset_network_header(skb);
-	skb_reset_transport_header(skb);
-
-	can_skb_reserve(skb);
-	can_skb_prv(skb)->ifindex = dev->ifindex;
-	can_skb_prv(skb)->skbcnt = 0;
-
-	*cf = skb_put_zero(skb, sizeof(struct can_frame));
-
-	return skb;
-}
-EXPORT_SYMBOL_GPL(alloc_can_skb);
-
-struct sk_buff *alloc_canfd_skb(struct net_device *dev,
-				struct canfd_frame **cfd)
-{
-	struct sk_buff *skb;
-
-	skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) +
-			       sizeof(struct canfd_frame));
-	if (unlikely(!skb))
-		return NULL;
-
-	skb->protocol = htons(ETH_P_CANFD);
-	skb->pkt_type = PACKET_BROADCAST;
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-	skb_reset_mac_header(skb);
-	skb_reset_network_header(skb);
-	skb_reset_transport_header(skb);
-
-	can_skb_reserve(skb);
-	can_skb_prv(skb)->ifindex = dev->ifindex;
-	can_skb_prv(skb)->skbcnt = 0;
-
-	*cfd = skb_put_zero(skb, sizeof(struct canfd_frame));
-
-	return skb;
-}
-EXPORT_SYMBOL_GPL(alloc_canfd_skb);
-
-struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf)
-{
-	struct sk_buff *skb;
-
-	skb = alloc_can_skb(dev, cf);
-	if (unlikely(!skb))
-		return NULL;
-
-	(*cf)->can_id = CAN_ERR_FLAG;
-	(*cf)->len = CAN_ERR_DLC;
-
-	return skb;
-}
-EXPORT_SYMBOL_GPL(alloc_can_err_skb);
-
 /* Allocate and setup space for the CAN network device */
 struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
 				    unsigned int txqs, unsigned int rxqs)
diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c
new file mode 100644
index 000000000000..26cd597ff771
--- /dev/null
+++ b/drivers/net/can/dev/skb.c
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix
+ * Copyright (C) 2006 Andrey Volkov, Varma Electronics
+ * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com>
+ */
+
+#include <linux/can/dev.h>
+
+/* Local echo of CAN messages
+ *
+ * CAN network devices *should* support a local echo functionality
+ * (see Documentation/networking/can.rst). To test the handling of CAN
+ * interfaces that do not support the local echo both driver types are
+ * implemented. In the case that the driver does not support the echo
+ * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core
+ * to perform the echo as a fallback solution.
+ */
+void can_flush_echo_skb(struct net_device *dev)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	struct net_device_stats *stats = &dev->stats;
+	int i;
+
+	for (i = 0; i < priv->echo_skb_max; i++) {
+		if (priv->echo_skb[i]) {
+			kfree_skb(priv->echo_skb[i]);
+			priv->echo_skb[i] = NULL;
+			stats->tx_dropped++;
+			stats->tx_aborted_errors++;
+		}
+	}
+}
+
+/* Put the skb on the stack to be looped backed locally lateron
+ *
+ * The function is typically called in the start_xmit function
+ * of the device driver. The driver must protect access to
+ * priv->echo_skb, if necessary.
+ */
+int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
+		     unsigned int idx)
+{
+	struct can_priv *priv = netdev_priv(dev);
+
+	BUG_ON(idx >= priv->echo_skb_max);
+
+	/* check flag whether this packet has to be looped back */
+	if (!(dev->flags & IFF_ECHO) || skb->pkt_type != PACKET_LOOPBACK ||
+	    (skb->protocol != htons(ETH_P_CAN) &&
+	     skb->protocol != htons(ETH_P_CANFD))) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	if (!priv->echo_skb[idx]) {
+		skb = can_create_echo_skb(skb);
+		if (!skb)
+			return -ENOMEM;
+
+		/* make settings for echo to reduce code in irq context */
+		skb->pkt_type = PACKET_BROADCAST;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		skb->dev = dev;
+
+		/* save this skb for tx interrupt echo handling */
+		priv->echo_skb[idx] = skb;
+	} else {
+		/* locking problem with netif_stop_queue() ?? */
+		netdev_err(dev, "%s: BUG! echo_skb %d is occupied!\n", __func__, idx);
+		kfree_skb(skb);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(can_put_echo_skb);
+
+struct sk_buff *
+__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr)
+{
+	struct can_priv *priv = netdev_priv(dev);
+
+	if (idx >= priv->echo_skb_max) {
+		netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n",
+			   __func__, idx, priv->echo_skb_max);
+		return NULL;
+	}
+
+	if (priv->echo_skb[idx]) {
+		/* Using "struct canfd_frame::len" for the frame
+		 * length is supported on both CAN and CANFD frames.
+		 */
+		struct sk_buff *skb = priv->echo_skb[idx];
+		struct canfd_frame *cf = (struct canfd_frame *)skb->data;
+
+		/* get the real payload length for netdev statistics */
+		if (cf->can_id & CAN_RTR_FLAG)
+			*len_ptr = 0;
+		else
+			*len_ptr = cf->len;
+
+		priv->echo_skb[idx] = NULL;
+
+		return skb;
+	}
+
+	return NULL;
+}
+
+/* Get the skb from the stack and loop it back locally
+ *
+ * The function is typically called when the TX done interrupt
+ * is handled in the device driver. The driver must protect
+ * access to priv->echo_skb, if necessary.
+ */
+unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx)
+{
+	struct sk_buff *skb;
+	u8 len;
+
+	skb = __can_get_echo_skb(dev, idx, &len);
+	if (!skb)
+		return 0;
+
+	skb_get(skb);
+	if (netif_rx(skb) == NET_RX_SUCCESS)
+		dev_consume_skb_any(skb);
+	else
+		dev_kfree_skb_any(skb);
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(can_get_echo_skb);
+
+/* Remove the skb from the stack and free it.
+ *
+ * The function is typically called when TX failed.
+ */
+void can_free_echo_skb(struct net_device *dev, unsigned int idx)
+{
+	struct can_priv *priv = netdev_priv(dev);
+
+	BUG_ON(idx >= priv->echo_skb_max);
+
+	if (priv->echo_skb[idx]) {
+		dev_kfree_skb_any(priv->echo_skb[idx]);
+		priv->echo_skb[idx] = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(can_free_echo_skb);
+
+struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf)
+{
+	struct sk_buff *skb;
+
+	skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) +
+			       sizeof(struct can_frame));
+	if (unlikely(!skb))
+		return NULL;
+
+	skb->protocol = htons(ETH_P_CAN);
+	skb->pkt_type = PACKET_BROADCAST;
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+
+	can_skb_reserve(skb);
+	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
+
+	*cf = skb_put_zero(skb, sizeof(struct can_frame));
+
+	return skb;
+}
+EXPORT_SYMBOL_GPL(alloc_can_skb);
+
+struct sk_buff *alloc_canfd_skb(struct net_device *dev,
+				struct canfd_frame **cfd)
+{
+	struct sk_buff *skb;
+
+	skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) +
+			       sizeof(struct canfd_frame));
+	if (unlikely(!skb))
+		return NULL;
+
+	skb->protocol = htons(ETH_P_CANFD);
+	skb->pkt_type = PACKET_BROADCAST;
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+
+	can_skb_reserve(skb);
+	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
+
+	*cfd = skb_put_zero(skb, sizeof(struct canfd_frame));
+
+	return skb;
+}
+EXPORT_SYMBOL_GPL(alloc_canfd_skb);
+
+struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_can_skb(dev, cf);
+	if (unlikely(!skb))
+		return NULL;
+
+	(*cf)->can_id = CAN_ERR_FLAG;
+	(*cf)->len = CAN_ERR_DLC;
+
+	return skb;
+}
+EXPORT_SYMBOL_GPL(alloc_can_err_skb);
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index d75fba1d030a..4a26e128af7f 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -84,69 +84,6 @@ struct can_priv {
 #endif
 };
 
-/* Check for outgoing skbs that have not been created by the CAN subsystem */
-static inline bool can_skb_headroom_valid(struct net_device *dev,
-					  struct sk_buff *skb)
-{
-	/* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */
-	if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv)))
-		return false;
-
-	/* af_packet does not apply CAN skb specific settings */
-	if (skb->ip_summed == CHECKSUM_NONE) {
-		/* init headroom */
-		can_skb_prv(skb)->ifindex = dev->ifindex;
-		can_skb_prv(skb)->skbcnt = 0;
-
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-		/* perform proper loopback on capable devices */
-		if (dev->flags & IFF_ECHO)
-			skb->pkt_type = PACKET_LOOPBACK;
-		else
-			skb->pkt_type = PACKET_HOST;
-
-		skb_reset_mac_header(skb);
-		skb_reset_network_header(skb);
-		skb_reset_transport_header(skb);
-	}
-
-	return true;
-}
-
-/* Drop a given socketbuffer if it does not contain a valid CAN frame. */
-static inline bool can_dropped_invalid_skb(struct net_device *dev,
-					  struct sk_buff *skb)
-{
-	const struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
-
-	if (skb->protocol == htons(ETH_P_CAN)) {
-		if (unlikely(skb->len != CAN_MTU ||
-			     cfd->len > CAN_MAX_DLEN))
-			goto inval_skb;
-	} else if (skb->protocol == htons(ETH_P_CANFD)) {
-		if (unlikely(skb->len != CANFD_MTU ||
-			     cfd->len > CANFD_MAX_DLEN))
-			goto inval_skb;
-	} else
-		goto inval_skb;
-
-	if (!can_skb_headroom_valid(dev, skb))
-		goto inval_skb;
-
-	return false;
-
-inval_skb:
-	kfree_skb(skb);
-	dev->stats.tx_dropped++;
-	return true;
-}
-
-static inline bool can_is_canfd_skb(const struct sk_buff *skb)
-{
-	/* the CAN specific type of skb is identified by its data length */
-	return skb->len == CANFD_MTU;
-}
 
 /* helper to define static CAN controller features at device creation time */
 static inline void can_set_static_ctrlmode(struct net_device *dev,
@@ -187,23 +124,10 @@ void can_bus_off(struct net_device *dev);
 void can_change_state(struct net_device *dev, struct can_frame *cf,
 		      enum can_state tx_state, enum can_state rx_state);
 
-int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
-		     unsigned int idx);
-struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx,
-				   u8 *len_ptr);
-unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx);
-void can_free_echo_skb(struct net_device *dev, unsigned int idx);
-
 #ifdef CONFIG_OF
 void of_can_transceiver(struct net_device *dev);
 #else
 static inline void of_can_transceiver(struct net_device *dev) { }
 #endif
 
-struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf);
-struct sk_buff *alloc_canfd_skb(struct net_device *dev,
-				struct canfd_frame **cfd);
-struct sk_buff *alloc_can_err_skb(struct net_device *dev,
-				  struct can_frame **cf);
-
 #endif /* !_CAN_DEV_H */
diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index fc61cf4eff1c..c90ebbd3008c 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -16,6 +16,19 @@
 #include <linux/can.h>
 #include <net/sock.h>
 
+void can_flush_echo_skb(struct net_device *dev);
+int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
+		     unsigned int idx);
+struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx,
+				   u8 *len_ptr);
+unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx);
+void can_free_echo_skb(struct net_device *dev, unsigned int idx);
+struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf);
+struct sk_buff *alloc_canfd_skb(struct net_device *dev,
+				struct canfd_frame **cfd);
+struct sk_buff *alloc_can_err_skb(struct net_device *dev,
+				  struct can_frame **cf);
+
 /*
  * The struct can_skb_priv is used to transport additional information along
  * with the stored struct can(fd)_frame that can not be contained in existing
@@ -74,4 +87,68 @@ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb)
 	return nskb;
 }
 
+/* Check for outgoing skbs that have not been created by the CAN subsystem */
+static inline bool can_skb_headroom_valid(struct net_device *dev,
+					  struct sk_buff *skb)
+{
+	/* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */
+	if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv)))
+		return false;
+
+	/* af_packet does not apply CAN skb specific settings */
+	if (skb->ip_summed == CHECKSUM_NONE) {
+		/* init headroom */
+		can_skb_prv(skb)->ifindex = dev->ifindex;
+		can_skb_prv(skb)->skbcnt = 0;
+
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		/* perform proper loopback on capable devices */
+		if (dev->flags & IFF_ECHO)
+			skb->pkt_type = PACKET_LOOPBACK;
+		else
+			skb->pkt_type = PACKET_HOST;
+
+		skb_reset_mac_header(skb);
+		skb_reset_network_header(skb);
+		skb_reset_transport_header(skb);
+	}
+
+	return true;
+}
+
+/* Drop a given socketbuffer if it does not contain a valid CAN frame. */
+static inline bool can_dropped_invalid_skb(struct net_device *dev,
+					  struct sk_buff *skb)
+{
+	const struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
+
+	if (skb->protocol == htons(ETH_P_CAN)) {
+		if (unlikely(skb->len != CAN_MTU ||
+			     cfd->len > CAN_MAX_DLEN))
+			goto inval_skb;
+	} else if (skb->protocol == htons(ETH_P_CANFD)) {
+		if (unlikely(skb->len != CANFD_MTU ||
+			     cfd->len > CANFD_MAX_DLEN))
+			goto inval_skb;
+	} else
+		goto inval_skb;
+
+	if (!can_skb_headroom_valid(dev, skb))
+		goto inval_skb;
+
+	return false;
+
+inval_skb:
+	kfree_skb(skb);
+	dev->stats.tx_dropped++;
+	return true;
+}
+
+static inline bool can_is_canfd_skb(const struct sk_buff *skb)
+{
+	/* the CAN specific type of skb is identified by its data length */
+	return skb->len == CANFD_MTU;
+}
+
 #endif /* !_CAN_SKB_H */
-- 
cgit v1.2.3


From 0a042c6ec991e4d33062ee52e9e23f615bc659f9 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 11 Jan 2021 15:19:21 +0100
Subject: can: dev: move netlink related code into seperate file

This patch moves the netlink related code of the CAN device infrastructure into
a separate file.

Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/r/20210111141930.693847-7-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/Makefile  |   1 +
 drivers/net/can/dev/dev.c     | 370 +----------------------------------------
 drivers/net/can/dev/netlink.c | 379 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/can/dev.h       |   6 +
 4 files changed, 389 insertions(+), 367 deletions(-)
 create mode 100644 drivers/net/can/dev/netlink.c

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/Makefile b/drivers/net/can/dev/Makefile
index 188bd53b113c..3e2e207861fc 100644
--- a/drivers/net/can/dev/Makefile
+++ b/drivers/net/can/dev/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_CAN_DEV)		+= can-dev.o
 can-dev-y			+= bittiming.o
 can-dev-y			+= dev.o
 can-dev-y			+= length.o
+can-dev-y			+= netlink.o
 can-dev-y			+= rx-offload.o
 can-dev-y                       += skb.o
 
diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index fe71ff9ef8c9..f580f0ac6497 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -14,10 +14,8 @@
 #include <linux/can/can-ml.h>
 #include <linux/can/dev.h>
 #include <linux/can/skb.h>
-#include <linux/can/netlink.h>
 #include <linux/can/led.h>
 #include <linux/of.h>
-#include <net/rtnetlink.h>
 
 #define MOD_DESC "CAN device driver interface"
 
@@ -223,7 +221,7 @@ void can_bus_off(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(can_bus_off);
 
-static void can_setup(struct net_device *dev)
+void can_setup(struct net_device *dev)
 {
 	dev->type = ARPHRD_CAN;
 	dev->mtu = CAN_MTU;
@@ -399,368 +397,6 @@ void close_candev(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(close_candev);
 
-/* CAN netlink interface */
-static const struct nla_policy can_policy[IFLA_CAN_MAX + 1] = {
-	[IFLA_CAN_STATE]	= { .type = NLA_U32 },
-	[IFLA_CAN_CTRLMODE]	= { .len = sizeof(struct can_ctrlmode) },
-	[IFLA_CAN_RESTART_MS]	= { .type = NLA_U32 },
-	[IFLA_CAN_RESTART]	= { .type = NLA_U32 },
-	[IFLA_CAN_BITTIMING]	= { .len = sizeof(struct can_bittiming) },
-	[IFLA_CAN_BITTIMING_CONST]
-				= { .len = sizeof(struct can_bittiming_const) },
-	[IFLA_CAN_CLOCK]	= { .len = sizeof(struct can_clock) },
-	[IFLA_CAN_BERR_COUNTER]	= { .len = sizeof(struct can_berr_counter) },
-	[IFLA_CAN_DATA_BITTIMING]
-				= { .len = sizeof(struct can_bittiming) },
-	[IFLA_CAN_DATA_BITTIMING_CONST]
-				= { .len = sizeof(struct can_bittiming_const) },
-	[IFLA_CAN_TERMINATION]	= { .type = NLA_U16 },
-};
-
-static int can_validate(struct nlattr *tb[], struct nlattr *data[],
-			struct netlink_ext_ack *extack)
-{
-	bool is_can_fd = false;
-
-	/* Make sure that valid CAN FD configurations always consist of
-	 * - nominal/arbitration bittiming
-	 * - data bittiming
-	 * - control mode with CAN_CTRLMODE_FD set
-	 */
-
-	if (!data)
-		return 0;
-
-	if (data[IFLA_CAN_CTRLMODE]) {
-		struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]);
-
-		is_can_fd = cm->flags & cm->mask & CAN_CTRLMODE_FD;
-	}
-
-	if (is_can_fd) {
-		if (!data[IFLA_CAN_BITTIMING] || !data[IFLA_CAN_DATA_BITTIMING])
-			return -EOPNOTSUPP;
-	}
-
-	if (data[IFLA_CAN_DATA_BITTIMING]) {
-		if (!is_can_fd || !data[IFLA_CAN_BITTIMING])
-			return -EOPNOTSUPP;
-	}
-
-	return 0;
-}
-
-static int can_changelink(struct net_device *dev, struct nlattr *tb[],
-			  struct nlattr *data[],
-			  struct netlink_ext_ack *extack)
-{
-	struct can_priv *priv = netdev_priv(dev);
-	int err;
-
-	/* We need synchronization with dev->stop() */
-	ASSERT_RTNL();
-
-	if (data[IFLA_CAN_BITTIMING]) {
-		struct can_bittiming bt;
-
-		/* Do not allow changing bittiming while running */
-		if (dev->flags & IFF_UP)
-			return -EBUSY;
-
-		/* Calculate bittiming parameters based on
-		 * bittiming_const if set, otherwise pass bitrate
-		 * directly via do_set_bitrate(). Bail out if neither
-		 * is given.
-		 */
-		if (!priv->bittiming_const && !priv->do_set_bittiming)
-			return -EOPNOTSUPP;
-
-		memcpy(&bt, nla_data(data[IFLA_CAN_BITTIMING]), sizeof(bt));
-		err = can_get_bittiming(dev, &bt,
-					priv->bittiming_const,
-					priv->bitrate_const,
-					priv->bitrate_const_cnt);
-		if (err)
-			return err;
-
-		if (priv->bitrate_max && bt.bitrate > priv->bitrate_max) {
-			netdev_err(dev, "arbitration bitrate surpasses transceiver capabilities of %d bps\n",
-				   priv->bitrate_max);
-			return -EINVAL;
-		}
-
-		memcpy(&priv->bittiming, &bt, sizeof(bt));
-
-		if (priv->do_set_bittiming) {
-			/* Finally, set the bit-timing registers */
-			err = priv->do_set_bittiming(dev);
-			if (err)
-				return err;
-		}
-	}
-
-	if (data[IFLA_CAN_CTRLMODE]) {
-		struct can_ctrlmode *cm;
-		u32 ctrlstatic;
-		u32 maskedflags;
-
-		/* Do not allow changing controller mode while running */
-		if (dev->flags & IFF_UP)
-			return -EBUSY;
-		cm = nla_data(data[IFLA_CAN_CTRLMODE]);
-		ctrlstatic = priv->ctrlmode_static;
-		maskedflags = cm->flags & cm->mask;
-
-		/* check whether provided bits are allowed to be passed */
-		if (cm->mask & ~(priv->ctrlmode_supported | ctrlstatic))
-			return -EOPNOTSUPP;
-
-		/* do not check for static fd-non-iso if 'fd' is disabled */
-		if (!(maskedflags & CAN_CTRLMODE_FD))
-			ctrlstatic &= ~CAN_CTRLMODE_FD_NON_ISO;
-
-		/* make sure static options are provided by configuration */
-		if ((maskedflags & ctrlstatic) != ctrlstatic)
-			return -EOPNOTSUPP;
-
-		/* clear bits to be modified and copy the flag values */
-		priv->ctrlmode &= ~cm->mask;
-		priv->ctrlmode |= maskedflags;
-
-		/* CAN_CTRLMODE_FD can only be set when driver supports FD */
-		if (priv->ctrlmode & CAN_CTRLMODE_FD)
-			dev->mtu = CANFD_MTU;
-		else
-			dev->mtu = CAN_MTU;
-	}
-
-	if (data[IFLA_CAN_RESTART_MS]) {
-		/* Do not allow changing restart delay while running */
-		if (dev->flags & IFF_UP)
-			return -EBUSY;
-		priv->restart_ms = nla_get_u32(data[IFLA_CAN_RESTART_MS]);
-	}
-
-	if (data[IFLA_CAN_RESTART]) {
-		/* Do not allow a restart while not running */
-		if (!(dev->flags & IFF_UP))
-			return -EINVAL;
-		err = can_restart_now(dev);
-		if (err)
-			return err;
-	}
-
-	if (data[IFLA_CAN_DATA_BITTIMING]) {
-		struct can_bittiming dbt;
-
-		/* Do not allow changing bittiming while running */
-		if (dev->flags & IFF_UP)
-			return -EBUSY;
-
-		/* Calculate bittiming parameters based on
-		 * data_bittiming_const if set, otherwise pass bitrate
-		 * directly via do_set_bitrate(). Bail out if neither
-		 * is given.
-		 */
-		if (!priv->data_bittiming_const && !priv->do_set_data_bittiming)
-			return -EOPNOTSUPP;
-
-		memcpy(&dbt, nla_data(data[IFLA_CAN_DATA_BITTIMING]),
-		       sizeof(dbt));
-		err = can_get_bittiming(dev, &dbt,
-					priv->data_bittiming_const,
-					priv->data_bitrate_const,
-					priv->data_bitrate_const_cnt);
-		if (err)
-			return err;
-
-		if (priv->bitrate_max && dbt.bitrate > priv->bitrate_max) {
-			netdev_err(dev, "canfd data bitrate surpasses transceiver capabilities of %d bps\n",
-				   priv->bitrate_max);
-			return -EINVAL;
-		}
-
-		memcpy(&priv->data_bittiming, &dbt, sizeof(dbt));
-
-		if (priv->do_set_data_bittiming) {
-			/* Finally, set the bit-timing registers */
-			err = priv->do_set_data_bittiming(dev);
-			if (err)
-				return err;
-		}
-	}
-
-	if (data[IFLA_CAN_TERMINATION]) {
-		const u16 termval = nla_get_u16(data[IFLA_CAN_TERMINATION]);
-		const unsigned int num_term = priv->termination_const_cnt;
-		unsigned int i;
-
-		if (!priv->do_set_termination)
-			return -EOPNOTSUPP;
-
-		/* check whether given value is supported by the interface */
-		for (i = 0; i < num_term; i++) {
-			if (termval == priv->termination_const[i])
-				break;
-		}
-		if (i >= num_term)
-			return -EINVAL;
-
-		/* Finally, set the termination value */
-		err = priv->do_set_termination(dev, termval);
-		if (err)
-			return err;
-
-		priv->termination = termval;
-	}
-
-	return 0;
-}
-
-static size_t can_get_size(const struct net_device *dev)
-{
-	struct can_priv *priv = netdev_priv(dev);
-	size_t size = 0;
-
-	if (priv->bittiming.bitrate)				/* IFLA_CAN_BITTIMING */
-		size += nla_total_size(sizeof(struct can_bittiming));
-	if (priv->bittiming_const)				/* IFLA_CAN_BITTIMING_CONST */
-		size += nla_total_size(sizeof(struct can_bittiming_const));
-	size += nla_total_size(sizeof(struct can_clock));	/* IFLA_CAN_CLOCK */
-	size += nla_total_size(sizeof(u32));			/* IFLA_CAN_STATE */
-	size += nla_total_size(sizeof(struct can_ctrlmode));	/* IFLA_CAN_CTRLMODE */
-	size += nla_total_size(sizeof(u32));			/* IFLA_CAN_RESTART_MS */
-	if (priv->do_get_berr_counter)				/* IFLA_CAN_BERR_COUNTER */
-		size += nla_total_size(sizeof(struct can_berr_counter));
-	if (priv->data_bittiming.bitrate)			/* IFLA_CAN_DATA_BITTIMING */
-		size += nla_total_size(sizeof(struct can_bittiming));
-	if (priv->data_bittiming_const)				/* IFLA_CAN_DATA_BITTIMING_CONST */
-		size += nla_total_size(sizeof(struct can_bittiming_const));
-	if (priv->termination_const) {
-		size += nla_total_size(sizeof(priv->termination));		/* IFLA_CAN_TERMINATION */
-		size += nla_total_size(sizeof(*priv->termination_const) *	/* IFLA_CAN_TERMINATION_CONST */
-				       priv->termination_const_cnt);
-	}
-	if (priv->bitrate_const)				/* IFLA_CAN_BITRATE_CONST */
-		size += nla_total_size(sizeof(*priv->bitrate_const) *
-				       priv->bitrate_const_cnt);
-	if (priv->data_bitrate_const)				/* IFLA_CAN_DATA_BITRATE_CONST */
-		size += nla_total_size(sizeof(*priv->data_bitrate_const) *
-				       priv->data_bitrate_const_cnt);
-	size += sizeof(priv->bitrate_max);			/* IFLA_CAN_BITRATE_MAX */
-
-	return size;
-}
-
-static int can_fill_info(struct sk_buff *skb, const struct net_device *dev)
-{
-	struct can_priv *priv = netdev_priv(dev);
-	struct can_ctrlmode cm = {.flags = priv->ctrlmode};
-	struct can_berr_counter bec;
-	enum can_state state = priv->state;
-
-	if (priv->do_get_state)
-		priv->do_get_state(dev, &state);
-
-	if ((priv->bittiming.bitrate &&
-	     nla_put(skb, IFLA_CAN_BITTIMING,
-		     sizeof(priv->bittiming), &priv->bittiming)) ||
-
-	    (priv->bittiming_const &&
-	     nla_put(skb, IFLA_CAN_BITTIMING_CONST,
-		     sizeof(*priv->bittiming_const), priv->bittiming_const)) ||
-
-	    nla_put(skb, IFLA_CAN_CLOCK, sizeof(priv->clock), &priv->clock) ||
-	    nla_put_u32(skb, IFLA_CAN_STATE, state) ||
-	    nla_put(skb, IFLA_CAN_CTRLMODE, sizeof(cm), &cm) ||
-	    nla_put_u32(skb, IFLA_CAN_RESTART_MS, priv->restart_ms) ||
-
-	    (priv->do_get_berr_counter &&
-	     !priv->do_get_berr_counter(dev, &bec) &&
-	     nla_put(skb, IFLA_CAN_BERR_COUNTER, sizeof(bec), &bec)) ||
-
-	    (priv->data_bittiming.bitrate &&
-	     nla_put(skb, IFLA_CAN_DATA_BITTIMING,
-		     sizeof(priv->data_bittiming), &priv->data_bittiming)) ||
-
-	    (priv->data_bittiming_const &&
-	     nla_put(skb, IFLA_CAN_DATA_BITTIMING_CONST,
-		     sizeof(*priv->data_bittiming_const),
-		     priv->data_bittiming_const)) ||
-
-	    (priv->termination_const &&
-	     (nla_put_u16(skb, IFLA_CAN_TERMINATION, priv->termination) ||
-	      nla_put(skb, IFLA_CAN_TERMINATION_CONST,
-		      sizeof(*priv->termination_const) *
-		      priv->termination_const_cnt,
-		      priv->termination_const))) ||
-
-	    (priv->bitrate_const &&
-	     nla_put(skb, IFLA_CAN_BITRATE_CONST,
-		     sizeof(*priv->bitrate_const) *
-		     priv->bitrate_const_cnt,
-		     priv->bitrate_const)) ||
-
-	    (priv->data_bitrate_const &&
-	     nla_put(skb, IFLA_CAN_DATA_BITRATE_CONST,
-		     sizeof(*priv->data_bitrate_const) *
-		     priv->data_bitrate_const_cnt,
-		     priv->data_bitrate_const)) ||
-
-	    (nla_put(skb, IFLA_CAN_BITRATE_MAX,
-		     sizeof(priv->bitrate_max),
-		     &priv->bitrate_max))
-	    )
-
-		return -EMSGSIZE;
-
-	return 0;
-}
-
-static size_t can_get_xstats_size(const struct net_device *dev)
-{
-	return sizeof(struct can_device_stats);
-}
-
-static int can_fill_xstats(struct sk_buff *skb, const struct net_device *dev)
-{
-	struct can_priv *priv = netdev_priv(dev);
-
-	if (nla_put(skb, IFLA_INFO_XSTATS,
-		    sizeof(priv->can_stats), &priv->can_stats))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-
-static int can_newlink(struct net *src_net, struct net_device *dev,
-		       struct nlattr *tb[], struct nlattr *data[],
-		       struct netlink_ext_ack *extack)
-{
-	return -EOPNOTSUPP;
-}
-
-static void can_dellink(struct net_device *dev, struct list_head *head)
-{
-}
-
-static struct rtnl_link_ops can_link_ops __read_mostly = {
-	.kind		= "can",
-	.maxtype	= IFLA_CAN_MAX,
-	.policy		= can_policy,
-	.setup		= can_setup,
-	.validate	= can_validate,
-	.newlink	= can_newlink,
-	.changelink	= can_changelink,
-	.dellink	= can_dellink,
-	.get_size	= can_get_size,
-	.fill_info	= can_fill_info,
-	.get_xstats_size = can_get_xstats_size,
-	.fill_xstats	= can_fill_xstats,
-};
-
 /* Register the CAN network device */
 int register_candev(struct net_device *dev)
 {
@@ -812,7 +448,7 @@ static __init int can_dev_init(void)
 
 	can_led_notifier_init();
 
-	err = rtnl_link_register(&can_link_ops);
+	err = can_netlink_register();
 	if (!err)
 		pr_info(MOD_DESC "\n");
 
@@ -822,7 +458,7 @@ module_init(can_dev_init);
 
 static __exit void can_dev_exit(void)
 {
-	rtnl_link_unregister(&can_link_ops);
+	can_netlink_unregister();
 
 	can_led_notifier_exit();
 }
diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c
new file mode 100644
index 000000000000..3ae884cdf677
--- /dev/null
+++ b/drivers/net/can/dev/netlink.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix
+ * Copyright (C) 2006 Andrey Volkov, Varma Electronics
+ * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com>
+ */
+
+#include <linux/can/dev.h>
+#include <net/rtnetlink.h>
+
+static const struct nla_policy can_policy[IFLA_CAN_MAX + 1] = {
+	[IFLA_CAN_STATE]	= { .type = NLA_U32 },
+	[IFLA_CAN_CTRLMODE]	= { .len = sizeof(struct can_ctrlmode) },
+	[IFLA_CAN_RESTART_MS]	= { .type = NLA_U32 },
+	[IFLA_CAN_RESTART]	= { .type = NLA_U32 },
+	[IFLA_CAN_BITTIMING]	= { .len = sizeof(struct can_bittiming) },
+	[IFLA_CAN_BITTIMING_CONST]
+				= { .len = sizeof(struct can_bittiming_const) },
+	[IFLA_CAN_CLOCK]	= { .len = sizeof(struct can_clock) },
+	[IFLA_CAN_BERR_COUNTER]	= { .len = sizeof(struct can_berr_counter) },
+	[IFLA_CAN_DATA_BITTIMING]
+				= { .len = sizeof(struct can_bittiming) },
+	[IFLA_CAN_DATA_BITTIMING_CONST]
+				= { .len = sizeof(struct can_bittiming_const) },
+	[IFLA_CAN_TERMINATION]	= { .type = NLA_U16 },
+};
+
+static int can_validate(struct nlattr *tb[], struct nlattr *data[],
+			struct netlink_ext_ack *extack)
+{
+	bool is_can_fd = false;
+
+	/* Make sure that valid CAN FD configurations always consist of
+	 * - nominal/arbitration bittiming
+	 * - data bittiming
+	 * - control mode with CAN_CTRLMODE_FD set
+	 */
+
+	if (!data)
+		return 0;
+
+	if (data[IFLA_CAN_CTRLMODE]) {
+		struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]);
+
+		is_can_fd = cm->flags & cm->mask & CAN_CTRLMODE_FD;
+	}
+
+	if (is_can_fd) {
+		if (!data[IFLA_CAN_BITTIMING] || !data[IFLA_CAN_DATA_BITTIMING])
+			return -EOPNOTSUPP;
+	}
+
+	if (data[IFLA_CAN_DATA_BITTIMING]) {
+		if (!is_can_fd || !data[IFLA_CAN_BITTIMING])
+			return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int can_changelink(struct net_device *dev, struct nlattr *tb[],
+			  struct nlattr *data[],
+			  struct netlink_ext_ack *extack)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	int err;
+
+	/* We need synchronization with dev->stop() */
+	ASSERT_RTNL();
+
+	if (data[IFLA_CAN_BITTIMING]) {
+		struct can_bittiming bt;
+
+		/* Do not allow changing bittiming while running */
+		if (dev->flags & IFF_UP)
+			return -EBUSY;
+
+		/* Calculate bittiming parameters based on
+		 * bittiming_const if set, otherwise pass bitrate
+		 * directly via do_set_bitrate(). Bail out if neither
+		 * is given.
+		 */
+		if (!priv->bittiming_const && !priv->do_set_bittiming)
+			return -EOPNOTSUPP;
+
+		memcpy(&bt, nla_data(data[IFLA_CAN_BITTIMING]), sizeof(bt));
+		err = can_get_bittiming(dev, &bt,
+					priv->bittiming_const,
+					priv->bitrate_const,
+					priv->bitrate_const_cnt);
+		if (err)
+			return err;
+
+		if (priv->bitrate_max && bt.bitrate > priv->bitrate_max) {
+			netdev_err(dev, "arbitration bitrate surpasses transceiver capabilities of %d bps\n",
+				   priv->bitrate_max);
+			return -EINVAL;
+		}
+
+		memcpy(&priv->bittiming, &bt, sizeof(bt));
+
+		if (priv->do_set_bittiming) {
+			/* Finally, set the bit-timing registers */
+			err = priv->do_set_bittiming(dev);
+			if (err)
+				return err;
+		}
+	}
+
+	if (data[IFLA_CAN_CTRLMODE]) {
+		struct can_ctrlmode *cm;
+		u32 ctrlstatic;
+		u32 maskedflags;
+
+		/* Do not allow changing controller mode while running */
+		if (dev->flags & IFF_UP)
+			return -EBUSY;
+		cm = nla_data(data[IFLA_CAN_CTRLMODE]);
+		ctrlstatic = priv->ctrlmode_static;
+		maskedflags = cm->flags & cm->mask;
+
+		/* check whether provided bits are allowed to be passed */
+		if (cm->mask & ~(priv->ctrlmode_supported | ctrlstatic))
+			return -EOPNOTSUPP;
+
+		/* do not check for static fd-non-iso if 'fd' is disabled */
+		if (!(maskedflags & CAN_CTRLMODE_FD))
+			ctrlstatic &= ~CAN_CTRLMODE_FD_NON_ISO;
+
+		/* make sure static options are provided by configuration */
+		if ((maskedflags & ctrlstatic) != ctrlstatic)
+			return -EOPNOTSUPP;
+
+		/* clear bits to be modified and copy the flag values */
+		priv->ctrlmode &= ~cm->mask;
+		priv->ctrlmode |= maskedflags;
+
+		/* CAN_CTRLMODE_FD can only be set when driver supports FD */
+		if (priv->ctrlmode & CAN_CTRLMODE_FD)
+			dev->mtu = CANFD_MTU;
+		else
+			dev->mtu = CAN_MTU;
+	}
+
+	if (data[IFLA_CAN_RESTART_MS]) {
+		/* Do not allow changing restart delay while running */
+		if (dev->flags & IFF_UP)
+			return -EBUSY;
+		priv->restart_ms = nla_get_u32(data[IFLA_CAN_RESTART_MS]);
+	}
+
+	if (data[IFLA_CAN_RESTART]) {
+		/* Do not allow a restart while not running */
+		if (!(dev->flags & IFF_UP))
+			return -EINVAL;
+		err = can_restart_now(dev);
+		if (err)
+			return err;
+	}
+
+	if (data[IFLA_CAN_DATA_BITTIMING]) {
+		struct can_bittiming dbt;
+
+		/* Do not allow changing bittiming while running */
+		if (dev->flags & IFF_UP)
+			return -EBUSY;
+
+		/* Calculate bittiming parameters based on
+		 * data_bittiming_const if set, otherwise pass bitrate
+		 * directly via do_set_bitrate(). Bail out if neither
+		 * is given.
+		 */
+		if (!priv->data_bittiming_const && !priv->do_set_data_bittiming)
+			return -EOPNOTSUPP;
+
+		memcpy(&dbt, nla_data(data[IFLA_CAN_DATA_BITTIMING]),
+		       sizeof(dbt));
+		err = can_get_bittiming(dev, &dbt,
+					priv->data_bittiming_const,
+					priv->data_bitrate_const,
+					priv->data_bitrate_const_cnt);
+		if (err)
+			return err;
+
+		if (priv->bitrate_max && dbt.bitrate > priv->bitrate_max) {
+			netdev_err(dev, "canfd data bitrate surpasses transceiver capabilities of %d bps\n",
+				   priv->bitrate_max);
+			return -EINVAL;
+		}
+
+		memcpy(&priv->data_bittiming, &dbt, sizeof(dbt));
+
+		if (priv->do_set_data_bittiming) {
+			/* Finally, set the bit-timing registers */
+			err = priv->do_set_data_bittiming(dev);
+			if (err)
+				return err;
+		}
+	}
+
+	if (data[IFLA_CAN_TERMINATION]) {
+		const u16 termval = nla_get_u16(data[IFLA_CAN_TERMINATION]);
+		const unsigned int num_term = priv->termination_const_cnt;
+		unsigned int i;
+
+		if (!priv->do_set_termination)
+			return -EOPNOTSUPP;
+
+		/* check whether given value is supported by the interface */
+		for (i = 0; i < num_term; i++) {
+			if (termval == priv->termination_const[i])
+				break;
+		}
+		if (i >= num_term)
+			return -EINVAL;
+
+		/* Finally, set the termination value */
+		err = priv->do_set_termination(dev, termval);
+		if (err)
+			return err;
+
+		priv->termination = termval;
+	}
+
+	return 0;
+}
+
+static size_t can_get_size(const struct net_device *dev)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	size_t size = 0;
+
+	if (priv->bittiming.bitrate)				/* IFLA_CAN_BITTIMING */
+		size += nla_total_size(sizeof(struct can_bittiming));
+	if (priv->bittiming_const)				/* IFLA_CAN_BITTIMING_CONST */
+		size += nla_total_size(sizeof(struct can_bittiming_const));
+	size += nla_total_size(sizeof(struct can_clock));	/* IFLA_CAN_CLOCK */
+	size += nla_total_size(sizeof(u32));			/* IFLA_CAN_STATE */
+	size += nla_total_size(sizeof(struct can_ctrlmode));	/* IFLA_CAN_CTRLMODE */
+	size += nla_total_size(sizeof(u32));			/* IFLA_CAN_RESTART_MS */
+	if (priv->do_get_berr_counter)				/* IFLA_CAN_BERR_COUNTER */
+		size += nla_total_size(sizeof(struct can_berr_counter));
+	if (priv->data_bittiming.bitrate)			/* IFLA_CAN_DATA_BITTIMING */
+		size += nla_total_size(sizeof(struct can_bittiming));
+	if (priv->data_bittiming_const)				/* IFLA_CAN_DATA_BITTIMING_CONST */
+		size += nla_total_size(sizeof(struct can_bittiming_const));
+	if (priv->termination_const) {
+		size += nla_total_size(sizeof(priv->termination));		/* IFLA_CAN_TERMINATION */
+		size += nla_total_size(sizeof(*priv->termination_const) *	/* IFLA_CAN_TERMINATION_CONST */
+				       priv->termination_const_cnt);
+	}
+	if (priv->bitrate_const)				/* IFLA_CAN_BITRATE_CONST */
+		size += nla_total_size(sizeof(*priv->bitrate_const) *
+				       priv->bitrate_const_cnt);
+	if (priv->data_bitrate_const)				/* IFLA_CAN_DATA_BITRATE_CONST */
+		size += nla_total_size(sizeof(*priv->data_bitrate_const) *
+				       priv->data_bitrate_const_cnt);
+	size += sizeof(priv->bitrate_max);			/* IFLA_CAN_BITRATE_MAX */
+
+	return size;
+}
+
+static int can_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	struct can_ctrlmode cm = {.flags = priv->ctrlmode};
+	struct can_berr_counter bec;
+	enum can_state state = priv->state;
+
+	if (priv->do_get_state)
+		priv->do_get_state(dev, &state);
+
+	if ((priv->bittiming.bitrate &&
+	     nla_put(skb, IFLA_CAN_BITTIMING,
+		     sizeof(priv->bittiming), &priv->bittiming)) ||
+
+	    (priv->bittiming_const &&
+	     nla_put(skb, IFLA_CAN_BITTIMING_CONST,
+		     sizeof(*priv->bittiming_const), priv->bittiming_const)) ||
+
+	    nla_put(skb, IFLA_CAN_CLOCK, sizeof(priv->clock), &priv->clock) ||
+	    nla_put_u32(skb, IFLA_CAN_STATE, state) ||
+	    nla_put(skb, IFLA_CAN_CTRLMODE, sizeof(cm), &cm) ||
+	    nla_put_u32(skb, IFLA_CAN_RESTART_MS, priv->restart_ms) ||
+
+	    (priv->do_get_berr_counter &&
+	     !priv->do_get_berr_counter(dev, &bec) &&
+	     nla_put(skb, IFLA_CAN_BERR_COUNTER, sizeof(bec), &bec)) ||
+
+	    (priv->data_bittiming.bitrate &&
+	     nla_put(skb, IFLA_CAN_DATA_BITTIMING,
+		     sizeof(priv->data_bittiming), &priv->data_bittiming)) ||
+
+	    (priv->data_bittiming_const &&
+	     nla_put(skb, IFLA_CAN_DATA_BITTIMING_CONST,
+		     sizeof(*priv->data_bittiming_const),
+		     priv->data_bittiming_const)) ||
+
+	    (priv->termination_const &&
+	     (nla_put_u16(skb, IFLA_CAN_TERMINATION, priv->termination) ||
+	      nla_put(skb, IFLA_CAN_TERMINATION_CONST,
+		      sizeof(*priv->termination_const) *
+		      priv->termination_const_cnt,
+		      priv->termination_const))) ||
+
+	    (priv->bitrate_const &&
+	     nla_put(skb, IFLA_CAN_BITRATE_CONST,
+		     sizeof(*priv->bitrate_const) *
+		     priv->bitrate_const_cnt,
+		     priv->bitrate_const)) ||
+
+	    (priv->data_bitrate_const &&
+	     nla_put(skb, IFLA_CAN_DATA_BITRATE_CONST,
+		     sizeof(*priv->data_bitrate_const) *
+		     priv->data_bitrate_const_cnt,
+		     priv->data_bitrate_const)) ||
+
+	    (nla_put(skb, IFLA_CAN_BITRATE_MAX,
+		     sizeof(priv->bitrate_max),
+		     &priv->bitrate_max))
+	    )
+
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static size_t can_get_xstats_size(const struct net_device *dev)
+{
+	return sizeof(struct can_device_stats);
+}
+
+static int can_fill_xstats(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct can_priv *priv = netdev_priv(dev);
+
+	if (nla_put(skb, IFLA_INFO_XSTATS,
+		    sizeof(priv->can_stats), &priv->can_stats))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int can_newlink(struct net *src_net, struct net_device *dev,
+		       struct nlattr *tb[], struct nlattr *data[],
+		       struct netlink_ext_ack *extack)
+{
+	return -EOPNOTSUPP;
+}
+
+static void can_dellink(struct net_device *dev, struct list_head *head)
+{
+}
+
+struct rtnl_link_ops can_link_ops __read_mostly = {
+	.kind		= "can",
+	.maxtype	= IFLA_CAN_MAX,
+	.policy		= can_policy,
+	.setup		= can_setup,
+	.validate	= can_validate,
+	.newlink	= can_newlink,
+	.changelink	= can_changelink,
+	.dellink	= can_dellink,
+	.get_size	= can_get_size,
+	.fill_info	= can_fill_info,
+	.get_xstats_size = can_get_xstats_size,
+	.fill_xstats	= can_fill_xstats,
+};
+
+int can_netlink_register(void)
+{
+	return rtnl_link_register(&can_link_ops);
+}
+
+void can_netlink_unregister(void)
+{
+	rtnl_link_unregister(&can_link_ops);
+}
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 4a26e128af7f..7faf6a37d5b2 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -100,6 +100,8 @@ static inline void can_set_static_ctrlmode(struct net_device *dev,
 		dev->mtu = CANFD_MTU;
 }
 
+void can_setup(struct net_device *dev);
+
 struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
 				    unsigned int txqs, unsigned int rxqs);
 #define alloc_candev(sizeof_priv, echo_skb_max) \
@@ -130,4 +132,8 @@ void of_can_transceiver(struct net_device *dev);
 static inline void of_can_transceiver(struct net_device *dev) { }
 #endif
 
+extern struct rtnl_link_ops can_link_ops;
+int can_netlink_register(void);
+void can_netlink_unregister(void);
+
 #endif /* !_CAN_DEV_H */
-- 
cgit v1.2.3


From 35192007d28d528aae74ddf5b26eaebf8e9a720c Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Fri, 18 Dec 2020 15:02:39 +0300
Subject: usb: phy: tegra: Support waking up from a low power mode

Support programming of waking up from a low power mode by implementing the
generic set_wakeup() callback of the USB PHY API.

Tested-by: Matt Merhar <mattmerhar@protonmail.com>
Tested-by: Nicolas Chauvet <kwizart@gmail.com>
Tested-by: Peter Geis <pgwipeout@gmail.com>
Tested-by: Ion Agorria <ion@agorria.com>
Acked-by: Thierry Reding <treding@nvidia.com>
Acked-by: Peter Chen <peter.chen@kernel.org>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Link: https://lore.kernel.org/r/20201218120246.7759-3-digetx@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/phy/phy-tegra-usb.c   | 100 +++++++++++++++++++++++++++++++++-----
 include/linux/usb/tegra_usb_phy.h |   2 +
 2 files changed, 91 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/phy/phy-tegra-usb.c b/drivers/usb/phy/phy-tegra-usb.c
index 1296524e1bee..a48452a6172b 100644
--- a/drivers/usb/phy/phy-tegra-usb.c
+++ b/drivers/usb/phy/phy-tegra-usb.c
@@ -45,6 +45,7 @@
 #define TEGRA_PORTSC1_RWC_BITS	(PORT_CSC | PORT_PEC | PORT_OCC)
 
 #define USB_SUSP_CTRL				0x400
+#define   USB_WAKE_ON_RESUME_EN			BIT(2)
 #define   USB_WAKE_ON_CNNT_EN_DEV		BIT(3)
 #define   USB_WAKE_ON_DISCON_EN_DEV		BIT(4)
 #define   USB_SUSP_CLR				BIT(5)
@@ -56,6 +57,15 @@
 #define   USB_SUSP_SET				BIT(14)
 #define   USB_WAKEUP_DEBOUNCE_COUNT(x)		(((x) & 0x7) << 16)
 
+#define USB_PHY_VBUS_SENSORS			0x404
+#define   B_SESS_VLD_WAKEUP_EN			BIT(6)
+#define   B_VBUS_VLD_WAKEUP_EN			BIT(14)
+#define   A_SESS_VLD_WAKEUP_EN			BIT(22)
+#define   A_VBUS_VLD_WAKEUP_EN			BIT(30)
+
+#define USB_PHY_VBUS_WAKEUP_ID			0x408
+#define   VBUS_WAKEUP_WAKEUP_EN			BIT(30)
+
 #define USB1_LEGACY_CTRL			0x410
 #define   USB1_NO_LEGACY_MODE			BIT(0)
 #define   USB1_VBUS_SENSE_CTL_MASK		(3 << 1)
@@ -334,6 +344,11 @@ static int utmip_pad_power_on(struct tegra_usb_phy *phy)
 		writel_relaxed(val, base + UTMIP_BIAS_CFG0);
 	}
 
+	if (phy->pad_wakeup) {
+		phy->pad_wakeup = false;
+		utmip_pad_count--;
+	}
+
 	spin_unlock(&utmip_pad_lock);
 
 	clk_disable_unprepare(phy->pad_clk);
@@ -359,6 +374,17 @@ static int utmip_pad_power_off(struct tegra_usb_phy *phy)
 		goto ulock;
 	}
 
+	/*
+	 * In accordance to TRM, OTG and Bias pad circuits could be turned off
+	 * to save power if wake is enabled, but the VBUS-change detection
+	 * method is board-specific and these circuits may need to be enabled
+	 * to generate wakeup event, hence we will just keep them both enabled.
+	 */
+	if (phy->wakeup_enabled) {
+		phy->pad_wakeup = true;
+		utmip_pad_count++;
+	}
+
 	if (--utmip_pad_count == 0) {
 		val = readl_relaxed(base + UTMIP_BIAS_CFG0);
 		val |= UTMIP_OTGPD | UTMIP_BIASPD;
@@ -503,11 +529,24 @@ static int utmi_phy_power_on(struct tegra_usb_phy *phy)
 		writel_relaxed(val, base + UTMIP_PLL_CFG1);
 	}
 
+	val = readl_relaxed(base + USB_SUSP_CTRL);
+	val &= ~USB_WAKE_ON_RESUME_EN;
+	writel_relaxed(val, base + USB_SUSP_CTRL);
+
 	if (phy->mode == USB_DR_MODE_PERIPHERAL) {
 		val = readl_relaxed(base + USB_SUSP_CTRL);
 		val &= ~(USB_WAKE_ON_CNNT_EN_DEV | USB_WAKE_ON_DISCON_EN_DEV);
 		writel_relaxed(val, base + USB_SUSP_CTRL);
 
+		val = readl_relaxed(base + USB_PHY_VBUS_WAKEUP_ID);
+		val &= ~VBUS_WAKEUP_WAKEUP_EN;
+		writel_relaxed(val, base + USB_PHY_VBUS_WAKEUP_ID);
+
+		val = readl_relaxed(base + USB_PHY_VBUS_SENSORS);
+		val &= ~(A_VBUS_VLD_WAKEUP_EN | A_SESS_VLD_WAKEUP_EN);
+		val &= ~(B_VBUS_VLD_WAKEUP_EN | B_SESS_VLD_WAKEUP_EN);
+		writel_relaxed(val, base + USB_PHY_VBUS_SENSORS);
+
 		val = readl_relaxed(base + UTMIP_BAT_CHRG_CFG0);
 		val &= ~UTMIP_PD_CHRG;
 		writel_relaxed(val, base + UTMIP_BAT_CHRG_CFG0);
@@ -605,31 +644,51 @@ static int utmi_phy_power_off(struct tegra_usb_phy *phy)
 
 	utmi_phy_clk_disable(phy);
 
-	if (phy->mode == USB_DR_MODE_PERIPHERAL) {
+	/* PHY won't resume if reset is asserted */
+	if (!phy->wakeup_enabled) {
 		val = readl_relaxed(base + USB_SUSP_CTRL);
-		val &= ~USB_WAKEUP_DEBOUNCE_COUNT(~0);
-		val |= USB_WAKE_ON_CNNT_EN_DEV | USB_WAKEUP_DEBOUNCE_COUNT(5);
+		val |= UTMIP_RESET;
 		writel_relaxed(val, base + USB_SUSP_CTRL);
 	}
 
-	val = readl_relaxed(base + USB_SUSP_CTRL);
-	val |= UTMIP_RESET;
-	writel_relaxed(val, base + USB_SUSP_CTRL);
-
 	val = readl_relaxed(base + UTMIP_BAT_CHRG_CFG0);
 	val |= UTMIP_PD_CHRG;
 	writel_relaxed(val, base + UTMIP_BAT_CHRG_CFG0);
 
-	val = readl_relaxed(base + UTMIP_XCVR_CFG0);
-	val |= UTMIP_FORCE_PD_POWERDOWN | UTMIP_FORCE_PD2_POWERDOWN |
-	       UTMIP_FORCE_PDZI_POWERDOWN;
-	writel_relaxed(val, base + UTMIP_XCVR_CFG0);
+	if (!phy->wakeup_enabled) {
+		val = readl_relaxed(base + UTMIP_XCVR_CFG0);
+		val |= UTMIP_FORCE_PD_POWERDOWN | UTMIP_FORCE_PD2_POWERDOWN |
+		       UTMIP_FORCE_PDZI_POWERDOWN;
+		writel_relaxed(val, base + UTMIP_XCVR_CFG0);
+	}
 
 	val = readl_relaxed(base + UTMIP_XCVR_CFG1);
 	val |= UTMIP_FORCE_PDDISC_POWERDOWN | UTMIP_FORCE_PDCHRP_POWERDOWN |
 	       UTMIP_FORCE_PDDR_POWERDOWN;
 	writel_relaxed(val, base + UTMIP_XCVR_CFG1);
 
+	if (phy->wakeup_enabled) {
+		val = readl_relaxed(base + USB_SUSP_CTRL);
+		val &= ~USB_WAKEUP_DEBOUNCE_COUNT(~0);
+		val |= USB_WAKEUP_DEBOUNCE_COUNT(5);
+		val |= USB_WAKE_ON_RESUME_EN;
+		writel_relaxed(val, base + USB_SUSP_CTRL);
+
+		/*
+		 * Ask VBUS sensor to generate wake event once cable is
+		 * connected.
+		 */
+		if (phy->mode == USB_DR_MODE_PERIPHERAL) {
+			val = readl_relaxed(base + USB_PHY_VBUS_WAKEUP_ID);
+			val |= VBUS_WAKEUP_WAKEUP_EN;
+			writel_relaxed(val, base + USB_PHY_VBUS_WAKEUP_ID);
+
+			val = readl_relaxed(base + USB_PHY_VBUS_SENSORS);
+			val |= A_VBUS_VLD_WAKEUP_EN;
+			writel_relaxed(val, base + USB_PHY_VBUS_SENSORS);
+		}
+	}
+
 	return utmip_pad_power_off(phy);
 }
 
@@ -765,6 +824,15 @@ static int ulpi_phy_power_off(struct tegra_usb_phy *phy)
 	usleep_range(5000, 6000);
 	clk_disable_unprepare(phy->clk);
 
+	/*
+	 * Wakeup currently unimplemented for ULPI, thus PHY needs to be
+	 * force-resumed.
+	 */
+	if (WARN_ON_ONCE(phy->wakeup_enabled)) {
+		ulpi_phy_power_on(phy);
+		return -EOPNOTSUPP;
+	}
+
 	return 0;
 }
 
@@ -827,6 +895,15 @@ static void tegra_usb_phy_shutdown(struct usb_phy *u_phy)
 	phy->freq = NULL;
 }
 
+static int tegra_usb_phy_set_wakeup(struct usb_phy *u_phy, bool enable)
+{
+	struct tegra_usb_phy *phy = to_tegra_usb_phy(u_phy);
+
+	phy->wakeup_enabled = enable;
+
+	return 0;
+}
+
 static int tegra_usb_phy_set_suspend(struct usb_phy *u_phy, int suspend)
 {
 	struct tegra_usb_phy *phy = to_tegra_usb_phy(u_phy);
@@ -1198,6 +1275,7 @@ static int tegra_usb_phy_probe(struct platform_device *pdev)
 	tegra_phy->u_phy.dev = &pdev->dev;
 	tegra_phy->u_phy.init = tegra_usb_phy_init;
 	tegra_phy->u_phy.shutdown = tegra_usb_phy_shutdown;
+	tegra_phy->u_phy.set_wakeup = tegra_usb_phy_set_wakeup;
 	tegra_phy->u_phy.set_suspend = tegra_usb_phy_set_suspend;
 
 	platform_set_drvdata(pdev, tegra_phy);
diff --git a/include/linux/usb/tegra_usb_phy.h b/include/linux/usb/tegra_usb_phy.h
index c29d1b4c9381..fd1c9f6a4e37 100644
--- a/include/linux/usb/tegra_usb_phy.h
+++ b/include/linux/usb/tegra_usb_phy.h
@@ -79,6 +79,8 @@ struct tegra_usb_phy {
 	bool is_ulpi_phy;
 	struct gpio_desc *reset_gpio;
 	struct reset_control *pad_rst;
+	bool wakeup_enabled;
+	bool pad_wakeup;
 	bool powered_on;
 };
 
-- 
cgit v1.2.3


From fc53d5279094e38e6363506339772a7021da2df8 Mon Sep 17 00:00:00 2001
From: Peter Geis <pgwipeout@gmail.com>
Date: Fri, 18 Dec 2020 15:02:42 +0300
Subject: usb: chipidea: tegra: Support host mode

Add USB host mode to the Tegra HDRC driver. This allows us to benefit from
support provided by the generic ChipIdea driver instead of duplicating the
effort in a separate ehci-tegra driver.

Tested-by: Matt Merhar <mattmerhar@protonmail.com>
Tested-by: Nicolas Chauvet <kwizart@gmail.com>
Tested-by: Ion Agorria <ion@agorria.com>
Acked-by: Thierry Reding <treding@nvidia.com>
Acked-by: Peter Chen <peter.chen@kernel.org>
Signed-off-by: Peter Geis <pgwipeout@gmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Link: https://lore.kernel.org/r/20201218120246.7759-6-digetx@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/chipidea/Kconfig         |   1 -
 drivers/usb/chipidea/ci_hdrc_tegra.c | 243 ++++++++++++++++++++++++++++++++++-
 drivers/usb/chipidea/core.c          |  10 +-
 drivers/usb/chipidea/host.c          | 104 ++++++++++++++-
 include/linux/usb/chipidea.h         |   6 +
 5 files changed, 356 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/chipidea/Kconfig b/drivers/usb/chipidea/Kconfig
index 8685ead6ccc7..661818e8fed6 100644
--- a/drivers/usb/chipidea/Kconfig
+++ b/drivers/usb/chipidea/Kconfig
@@ -55,7 +55,6 @@ config USB_CHIPIDEA_GENERIC
 config USB_CHIPIDEA_TEGRA
 	tristate "Enable Tegra USB glue driver" if EMBEDDED
 	depends on OF
-	depends on USB_CHIPIDEA_UDC
 	default USB_CHIPIDEA
 
 endif
diff --git a/drivers/usb/chipidea/ci_hdrc_tegra.c b/drivers/usb/chipidea/ci_hdrc_tegra.c
index d8efa80aa1c2..5fccdeeefc64 100644
--- a/drivers/usb/chipidea/ci_hdrc_tegra.c
+++ b/drivers/usb/chipidea/ci_hdrc_tegra.c
@@ -4,11 +4,18 @@
  */
 
 #include <linux/clk.h>
+#include <linux/io.h>
 #include <linux/module.h>
 #include <linux/of_device.h>
 #include <linux/reset.h>
 
+#include <linux/usb.h>
 #include <linux/usb/chipidea.h>
+#include <linux/usb/hcd.h>
+#include <linux/usb/of.h>
+#include <linux/usb/phy.h>
+
+#include "../host/ehci.h"
 
 #include "ci.h"
 
@@ -16,20 +23,47 @@ struct tegra_usb {
 	struct ci_hdrc_platform_data data;
 	struct platform_device *dev;
 
+	const struct tegra_usb_soc_info *soc;
 	struct usb_phy *phy;
 	struct clk *clk;
+
+	bool needs_double_reset;
 };
 
 struct tegra_usb_soc_info {
 	unsigned long flags;
+	unsigned int txfifothresh;
+	enum usb_dr_mode dr_mode;
+};
+
+static const struct tegra_usb_soc_info tegra20_ehci_soc_info = {
+	.flags = CI_HDRC_REQUIRES_ALIGNED_DMA |
+		 CI_HDRC_OVERRIDE_PHY_CONTROL,
+	.dr_mode = USB_DR_MODE_HOST,
+	.txfifothresh = 10,
+};
+
+static const struct tegra_usb_soc_info tegra30_ehci_soc_info = {
+	.flags = CI_HDRC_REQUIRES_ALIGNED_DMA |
+		 CI_HDRC_OVERRIDE_PHY_CONTROL,
+	.dr_mode = USB_DR_MODE_HOST,
+	.txfifothresh = 16,
 };
 
 static const struct tegra_usb_soc_info tegra_udc_soc_info = {
-	.flags = CI_HDRC_REQUIRES_ALIGNED_DMA,
+	.flags = CI_HDRC_REQUIRES_ALIGNED_DMA |
+		 CI_HDRC_OVERRIDE_PHY_CONTROL,
+	.dr_mode = USB_DR_MODE_UNKNOWN,
 };
 
 static const struct of_device_id tegra_usb_of_match[] = {
 	{
+		.compatible = "nvidia,tegra20-ehci",
+		.data = &tegra20_ehci_soc_info,
+	}, {
+		.compatible = "nvidia,tegra30-ehci",
+		.data = &tegra30_ehci_soc_info,
+	}, {
 		.compatible = "nvidia,tegra20-udc",
 		.data = &tegra_udc_soc_info,
 	}, {
@@ -47,6 +81,181 @@ static const struct of_device_id tegra_usb_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, tegra_usb_of_match);
 
+static int tegra_usb_reset_controller(struct device *dev)
+{
+	struct reset_control *rst, *rst_utmi;
+	struct device_node *phy_np;
+	int err;
+
+	rst = devm_reset_control_get_shared(dev, "usb");
+	if (IS_ERR(rst)) {
+		dev_err(dev, "can't get ehci reset: %pe\n", rst);
+		return PTR_ERR(rst);
+	}
+
+	phy_np = of_parse_phandle(dev->of_node, "nvidia,phy", 0);
+	if (!phy_np)
+		return -ENOENT;
+
+	/*
+	 * The 1st USB controller contains some UTMI pad registers that are
+	 * global for all the controllers on the chip. Those registers are
+	 * also cleared when reset is asserted to the 1st controller.
+	 */
+	rst_utmi = of_reset_control_get_shared(phy_np, "utmi-pads");
+	if (IS_ERR(rst_utmi)) {
+		dev_warn(dev, "can't get utmi-pads reset from the PHY\n");
+		dev_warn(dev, "continuing, but please update your DT\n");
+	} else {
+		/*
+		 * PHY driver performs UTMI-pads reset in a case of a
+		 * non-legacy DT.
+		 */
+		reset_control_put(rst_utmi);
+	}
+
+	of_node_put(phy_np);
+
+	/* reset control is shared, hence initialize it first */
+	err = reset_control_deassert(rst);
+	if (err)
+		return err;
+
+	err = reset_control_assert(rst);
+	if (err)
+		return err;
+
+	udelay(1);
+
+	err = reset_control_deassert(rst);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int tegra_usb_notify_event(struct ci_hdrc *ci, unsigned int event)
+{
+	struct tegra_usb *usb = dev_get_drvdata(ci->dev->parent);
+	struct ehci_hcd *ehci;
+
+	switch (event) {
+	case CI_HDRC_CONTROLLER_RESET_EVENT:
+		if (ci->hcd) {
+			ehci = hcd_to_ehci(ci->hcd);
+			ehci->has_tdi_phy_lpm = false;
+			ehci_writel(ehci, usb->soc->txfifothresh << 16,
+				    &ehci->regs->txfill_tuning);
+		}
+		break;
+	}
+
+	return 0;
+}
+
+static int tegra_usb_internal_port_reset(struct ehci_hcd *ehci,
+					 u32 __iomem *portsc_reg,
+					 unsigned long *flags)
+{
+	u32 saved_usbintr, temp;
+	unsigned int i, tries;
+	int retval = 0;
+
+	saved_usbintr = ehci_readl(ehci, &ehci->regs->intr_enable);
+	/* disable USB interrupt */
+	ehci_writel(ehci, 0, &ehci->regs->intr_enable);
+	spin_unlock_irqrestore(&ehci->lock, *flags);
+
+	/*
+	 * Here we have to do Port Reset at most twice for
+	 * Port Enable bit to be set.
+	 */
+	for (i = 0; i < 2; i++) {
+		temp = ehci_readl(ehci, portsc_reg);
+		temp |= PORT_RESET;
+		ehci_writel(ehci, temp, portsc_reg);
+		fsleep(10000);
+		temp &= ~PORT_RESET;
+		ehci_writel(ehci, temp, portsc_reg);
+		fsleep(1000);
+		tries = 100;
+		do {
+			fsleep(1000);
+			/*
+			 * Up to this point, Port Enable bit is
+			 * expected to be set after 2 ms waiting.
+			 * USB1 usually takes extra 45 ms, for safety,
+			 * we take 100 ms as timeout.
+			 */
+			temp = ehci_readl(ehci, portsc_reg);
+		} while (!(temp & PORT_PE) && tries--);
+		if (temp & PORT_PE)
+			break;
+	}
+	if (i == 2)
+		retval = -ETIMEDOUT;
+
+	/*
+	 * Clear Connect Status Change bit if it's set.
+	 * We can't clear PORT_PEC. It will also cause PORT_PE to be cleared.
+	 */
+	if (temp & PORT_CSC)
+		ehci_writel(ehci, PORT_CSC, portsc_reg);
+
+	/*
+	 * Write to clear any interrupt status bits that might be set
+	 * during port reset.
+	 */
+	temp = ehci_readl(ehci, &ehci->regs->status);
+	ehci_writel(ehci, temp, &ehci->regs->status);
+
+	/* restore original interrupt-enable bits */
+	spin_lock_irqsave(&ehci->lock, *flags);
+	ehci_writel(ehci, saved_usbintr, &ehci->regs->intr_enable);
+
+	return retval;
+}
+
+static int tegra_ehci_hub_control(struct ci_hdrc *ci, u16 typeReq, u16 wValue,
+				  u16 wIndex, char *buf, u16 wLength,
+				  bool *done, unsigned long *flags)
+{
+	struct tegra_usb *usb = dev_get_drvdata(ci->dev->parent);
+	struct ehci_hcd *ehci = hcd_to_ehci(ci->hcd);
+	u32 __iomem *status_reg;
+	int retval = 0;
+
+	status_reg = &ehci->regs->port_status[(wIndex & 0xff) - 1];
+
+	switch (typeReq) {
+	case SetPortFeature:
+		if (wValue != USB_PORT_FEAT_RESET || !usb->needs_double_reset)
+			break;
+
+		/* for USB1 port we need to issue Port Reset twice internally */
+		retval = tegra_usb_internal_port_reset(ehci, status_reg, flags);
+		*done  = true;
+		break;
+	}
+
+	return retval;
+}
+
+static void tegra_usb_enter_lpm(struct ci_hdrc *ci, bool enable)
+{
+	/*
+	 * Touching any register which belongs to AHB clock domain will
+	 * hang CPU if USB controller is put into low power mode because
+	 * AHB USB clock is gated on Tegra in the LPM.
+	 *
+	 * Tegra PHY has a separate register for checking the clock status
+	 * and usb_phy_set_suspend() takes care of gating/ungating the clocks
+	 * and restoring the PHY state on Tegra. Hence DEVLC/PORTSC registers
+	 * shouldn't be touched directly by the CI driver.
+	 */
+	usb_phy_set_suspend(ci->usb_phy, enable);
+}
+
 static int tegra_usb_probe(struct platform_device *pdev)
 {
 	const struct tegra_usb_soc_info *soc;
@@ -83,24 +292,49 @@ static int tegra_usb_probe(struct platform_device *pdev)
 		return err;
 	}
 
+	if (device_property_present(&pdev->dev, "nvidia,needs-double-reset"))
+		usb->needs_double_reset = true;
+
+	err = tegra_usb_reset_controller(&pdev->dev);
+	if (err) {
+		dev_err(&pdev->dev, "failed to reset controller: %d\n", err);
+		goto fail_power_off;
+	}
+
+	/*
+	 * USB controller registers shouldn't be touched before PHY is
+	 * initialized, otherwise CPU will hang because clocks are gated.
+	 * PHY driver controls gating of internal USB clocks on Tegra.
+	 */
+	err = usb_phy_init(usb->phy);
+	if (err)
+		goto fail_power_off;
+
+	platform_set_drvdata(pdev, usb);
+
 	/* setup and register ChipIdea HDRC device */
+	usb->soc = soc;
 	usb->data.name = "tegra-usb";
 	usb->data.flags = soc->flags;
 	usb->data.usb_phy = usb->phy;
+	usb->data.dr_mode = soc->dr_mode;
 	usb->data.capoffset = DEF_CAPOFFSET;
+	usb->data.enter_lpm = tegra_usb_enter_lpm;
+	usb->data.hub_control = tegra_ehci_hub_control;
+	usb->data.notify_event = tegra_usb_notify_event;
 
 	usb->dev = ci_hdrc_add_device(&pdev->dev, pdev->resource,
 				      pdev->num_resources, &usb->data);
 	if (IS_ERR(usb->dev)) {
 		err = PTR_ERR(usb->dev);
 		dev_err(&pdev->dev, "failed to add HDRC device: %d\n", err);
-		goto fail_power_off;
+		goto phy_shutdown;
 	}
 
-	platform_set_drvdata(pdev, usb);
-
 	return 0;
 
+phy_shutdown:
+	usb_phy_shutdown(usb->phy);
 fail_power_off:
 	clk_disable_unprepare(usb->clk);
 	return err;
@@ -111,6 +345,7 @@ static int tegra_usb_remove(struct platform_device *pdev)
 	struct tegra_usb *usb = platform_get_drvdata(pdev);
 
 	ci_hdrc_remove_device(usb->dev);
+	usb_phy_shutdown(usb->phy);
 	clk_disable_unprepare(usb->clk);
 
 	return 0;
diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c
index aa40e510b806..3f6c21406dbd 100644
--- a/drivers/usb/chipidea/core.c
+++ b/drivers/usb/chipidea/core.c
@@ -195,7 +195,7 @@ static void hw_wait_phy_stable(void)
 }
 
 /* The PHY enters/leaves low power mode */
-static void ci_hdrc_enter_lpm(struct ci_hdrc *ci, bool enable)
+static void ci_hdrc_enter_lpm_common(struct ci_hdrc *ci, bool enable)
 {
 	enum ci_hw_regs reg = ci->hw_bank.lpm ? OP_DEVLC : OP_PORTSC;
 	bool lpm = !!(hw_read(ci, reg, PORTSC_PHCD(ci->hw_bank.lpm)));
@@ -208,6 +208,11 @@ static void ci_hdrc_enter_lpm(struct ci_hdrc *ci, bool enable)
 				0);
 }
 
+static void ci_hdrc_enter_lpm(struct ci_hdrc *ci, bool enable)
+{
+	return ci->platdata->enter_lpm(ci, enable);
+}
+
 static int hw_device_init(struct ci_hdrc *ci, void __iomem *base)
 {
 	u32 reg;
@@ -790,6 +795,9 @@ static int ci_get_platdata(struct device *dev,
 			platdata->pins_device = p;
 	}
 
+	if (!platdata->enter_lpm)
+		platdata->enter_lpm = ci_hdrc_enter_lpm_common;
+
 	return 0;
 }
 
diff --git a/drivers/usb/chipidea/host.c b/drivers/usb/chipidea/host.c
index 48e4a5ca1835..67247d2ac07a 100644
--- a/drivers/usb/chipidea/host.c
+++ b/drivers/usb/chipidea/host.c
@@ -29,6 +29,12 @@ struct ehci_ci_priv {
 	bool enabled;
 };
 
+struct ci_hdrc_dma_aligned_buffer {
+	void *kmalloc_ptr;
+	void *old_xfer_buffer;
+	u8 data[0];
+};
+
 static int ehci_ci_portpower(struct usb_hcd *hcd, int portnum, bool enable)
 {
 	struct ehci_hcd *ehci = hcd_to_ehci(hcd);
@@ -160,14 +166,15 @@ static int host_start(struct ci_hdrc *ci)
 		pinctrl_select_state(ci->platdata->pctl,
 				     ci->platdata->pins_host);
 
+	ci->hcd = hcd;
+
 	ret = usb_add_hcd(hcd, 0, 0);
 	if (ret) {
+		ci->hcd = NULL;
 		goto disable_reg;
 	} else {
 		struct usb_otg *otg = &ci->otg;
 
-		ci->hcd = hcd;
-
 		if (ci_otg_is_fsm_mode(ci)) {
 			otg->host = &hcd->self;
 			hcd->self.otg_port = 1;
@@ -237,6 +244,7 @@ static int ci_ehci_hub_control(
 	u32		temp;
 	unsigned long	flags;
 	int		retval = 0;
+	bool		done = false;
 	struct device *dev = hcd->self.controller;
 	struct ci_hdrc *ci = dev_get_drvdata(dev);
 
@@ -244,6 +252,13 @@ static int ci_ehci_hub_control(
 
 	spin_lock_irqsave(&ehci->lock, flags);
 
+	if (ci->platdata->hub_control) {
+		retval = ci->platdata->hub_control(ci, typeReq, wValue, wIndex,
+						   buf, wLength, &done, &flags);
+		if (done)
+			goto done;
+	}
+
 	if (typeReq == SetPortFeature && wValue == USB_PORT_FEAT_SUSPEND) {
 		temp = ehci_readl(ehci, status_reg);
 		if ((temp & PORT_PE) == 0 || (temp & PORT_RESET) != 0) {
@@ -349,6 +364,86 @@ static int ci_ehci_bus_suspend(struct usb_hcd *hcd)
 	return 0;
 }
 
+static void ci_hdrc_free_dma_aligned_buffer(struct urb *urb)
+{
+	struct ci_hdrc_dma_aligned_buffer *temp;
+	size_t length;
+
+	if (!(urb->transfer_flags & URB_ALIGNED_TEMP_BUFFER))
+		return;
+
+	temp = container_of(urb->transfer_buffer,
+			    struct ci_hdrc_dma_aligned_buffer, data);
+
+	if (usb_urb_dir_in(urb)) {
+		if (usb_pipeisoc(urb->pipe))
+			length = urb->transfer_buffer_length;
+		else
+			length = urb->actual_length;
+
+		memcpy(temp->old_xfer_buffer, temp->data, length);
+	}
+	urb->transfer_buffer = temp->old_xfer_buffer;
+	kfree(temp->kmalloc_ptr);
+
+	urb->transfer_flags &= ~URB_ALIGNED_TEMP_BUFFER;
+}
+
+static int ci_hdrc_alloc_dma_aligned_buffer(struct urb *urb, gfp_t mem_flags)
+{
+	struct ci_hdrc_dma_aligned_buffer *temp, *kmalloc_ptr;
+	const unsigned int ci_hdrc_usb_dma_align = 32;
+	size_t kmalloc_size;
+
+	if (urb->num_sgs || urb->sg || urb->transfer_buffer_length == 0 ||
+	    !((uintptr_t)urb->transfer_buffer & (ci_hdrc_usb_dma_align - 1)))
+		return 0;
+
+	/* Allocate a buffer with enough padding for alignment */
+	kmalloc_size = urb->transfer_buffer_length +
+		       sizeof(struct ci_hdrc_dma_aligned_buffer) +
+		       ci_hdrc_usb_dma_align - 1;
+
+	kmalloc_ptr = kmalloc(kmalloc_size, mem_flags);
+	if (!kmalloc_ptr)
+		return -ENOMEM;
+
+	/* Position our struct dma_aligned_buffer such that data is aligned */
+	temp = PTR_ALIGN(kmalloc_ptr + 1, ci_hdrc_usb_dma_align) - 1;
+	temp->kmalloc_ptr = kmalloc_ptr;
+	temp->old_xfer_buffer = urb->transfer_buffer;
+	if (usb_urb_dir_out(urb))
+		memcpy(temp->data, urb->transfer_buffer,
+		       urb->transfer_buffer_length);
+	urb->transfer_buffer = temp->data;
+
+	urb->transfer_flags |= URB_ALIGNED_TEMP_BUFFER;
+
+	return 0;
+}
+
+static int ci_hdrc_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb,
+				   gfp_t mem_flags)
+{
+	int ret;
+
+	ret = ci_hdrc_alloc_dma_aligned_buffer(urb, mem_flags);
+	if (ret)
+		return ret;
+
+	ret = usb_hcd_map_urb_for_dma(hcd, urb, mem_flags);
+	if (ret)
+		ci_hdrc_free_dma_aligned_buffer(urb);
+
+	return ret;
+}
+
+static void ci_hdrc_unmap_urb_for_dma(struct usb_hcd *hcd, struct urb *urb)
+{
+	usb_hcd_unmap_urb_for_dma(hcd, urb);
+	ci_hdrc_free_dma_aligned_buffer(urb);
+}
+
 int ci_hdrc_host_init(struct ci_hdrc *ci)
 {
 	struct ci_role_driver *rdrv;
@@ -366,6 +461,11 @@ int ci_hdrc_host_init(struct ci_hdrc *ci)
 	rdrv->name	= "host";
 	ci->roles[CI_ROLE_HOST] = rdrv;
 
+	if (ci->platdata->flags & CI_HDRC_REQUIRES_ALIGNED_DMA) {
+		ci_ehci_hc_driver.map_urb_for_dma = ci_hdrc_map_urb_for_dma;
+		ci_ehci_hc_driver.unmap_urb_for_dma = ci_hdrc_unmap_urb_for_dma;
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index 025b41687ce9..edf3342507f1 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -88,6 +88,12 @@ struct ci_hdrc_platform_data {
 	struct pinctrl_state *pins_default;
 	struct pinctrl_state *pins_host;
 	struct pinctrl_state *pins_device;
+
+	/* platform-specific hooks */
+	int (*hub_control)(struct ci_hdrc *ci, u16 typeReq, u16 wValue,
+			   u16 wIndex, char *buf, u16 wLength,
+			   bool *done, unsigned long *flags);
+	void (*enter_lpm)(struct ci_hdrc *ci, bool enable);
 };
 
 /* Default offset of capability registers */
-- 
cgit v1.2.3


From c512150b266b5d173c5ba841e9c09e4830ea4eca Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sat, 5 Dec 2020 01:40:56 +0100
Subject: regulator: ab8500: Remove unused platform data

The struct ab8500_regulator_platform_data was a leftover
since the days before we probed all regulators from the
device tree. The ab8500-ext regulator was the only used,
defining platform data and register intialization that
was never used for anything, a copy of a boardfile no
longer in use.

Delete the ab8500_regulator_platform_data and make the
ab8500-ext regulator reference the regulator init data
in the local file directly. We are 100% device tree
these days.

Cc: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20201205004057.1712753-1-linus.walleij@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/ab8500-ext.c    | 417 +-------------------------------------
 include/linux/mfd/abx500/ab8500.h |   3 -
 include/linux/regulator/ab8500.h  |  10 -
 3 files changed, 2 insertions(+), 428 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/ab8500-ext.c b/drivers/regulator/ab8500-ext.c
index 8bb43a671ded..05f9531bd108 100644
--- a/drivers/regulator/ab8500-ext.c
+++ b/drivers/regulator/ab8500-ext.c
@@ -24,403 +24,6 @@
 #include <linux/mfd/abx500/ab8500.h>
 #include <linux/regulator/ab8500.h>
 
-static struct regulator_consumer_supply ab8500_vaux1_consumers[] = {
-	/* Main display, u8500 R3 uib */
-	REGULATOR_SUPPLY("vddi", "mcde_disp_sony_acx424akp.0"),
-	/* Main display, u8500 uib and ST uib */
-	REGULATOR_SUPPLY("vdd1", "samsung_s6d16d0.0"),
-	/* Secondary display, ST uib */
-	REGULATOR_SUPPLY("vdd1", "samsung_s6d16d0.1"),
-	/* SFH7741 proximity sensor */
-	REGULATOR_SUPPLY("vcc", "gpio-keys.0"),
-	/* BH1780GLS ambient light sensor */
-	REGULATOR_SUPPLY("vcc", "2-0029"),
-	/* lsm303dlh accelerometer */
-	REGULATOR_SUPPLY("vdd", "2-0018"),
-	/* lsm303dlhc accelerometer */
-	REGULATOR_SUPPLY("vdd", "2-0019"),
-	/* lsm303dlh magnetometer */
-	REGULATOR_SUPPLY("vdd", "2-001e"),
-	/* Rohm BU21013 Touchscreen devices */
-	REGULATOR_SUPPLY("avdd", "3-005c"),
-	REGULATOR_SUPPLY("avdd", "3-005d"),
-	/* Synaptics RMI4 Touchscreen device */
-	REGULATOR_SUPPLY("vdd", "3-004b"),
-	/* L3G4200D Gyroscope device */
-	REGULATOR_SUPPLY("vdd", "2-0068"),
-	/* Ambient light sensor device */
-	REGULATOR_SUPPLY("vdd", "3-0029"),
-	/* Pressure sensor device */
-	REGULATOR_SUPPLY("vdd", "2-005c"),
-	/* Cypress TrueTouch Touchscreen device */
-	REGULATOR_SUPPLY("vcpin", "spi8.0"),
-	/* Camera device */
-	REGULATOR_SUPPLY("vaux12v5", "mmio_camera"),
-};
-
-static struct regulator_consumer_supply ab8500_vaux2_consumers[] = {
-	/* On-board eMMC power */
-	REGULATOR_SUPPLY("vmmc", "sdi4"),
-	/* AB8500 audio codec */
-	REGULATOR_SUPPLY("vcc-N2158", "ab8500-codec.0"),
-	/* AB8500 accessory detect 1 */
-	REGULATOR_SUPPLY("vcc-N2158", "ab8500-acc-det.0"),
-	/* AB8500 Tv-out device */
-	REGULATOR_SUPPLY("vcc-N2158", "mcde_tv_ab8500.4"),
-	/* AV8100 HDMI device */
-	REGULATOR_SUPPLY("vcc-N2158", "av8100_hdmi.3"),
-};
-
-static struct regulator_consumer_supply ab8500_vaux3_consumers[] = {
-	REGULATOR_SUPPLY("v-SD-STM", "stm"),
-	/* External MMC slot power */
-	REGULATOR_SUPPLY("vmmc", "sdi0"),
-};
-
-static struct regulator_consumer_supply ab8500_vtvout_consumers[] = {
-	/* TV-out DENC supply */
-	REGULATOR_SUPPLY("vtvout", "ab8500-denc.0"),
-	/* Internal general-purpose ADC */
-	REGULATOR_SUPPLY("vddadc", "ab8500-gpadc.0"),
-	/* ADC for charger */
-	REGULATOR_SUPPLY("vddadc", "ab8500-charger.0"),
-	/* AB8500 Tv-out device */
-	REGULATOR_SUPPLY("vtvout", "mcde_tv_ab8500.4"),
-};
-
-static struct regulator_consumer_supply ab8500_vaud_consumers[] = {
-	/* AB8500 audio-codec main supply */
-	REGULATOR_SUPPLY("vaud", "ab8500-codec.0"),
-};
-
-static struct regulator_consumer_supply ab8500_vamic1_consumers[] = {
-	/* AB8500 audio-codec Mic1 supply */
-	REGULATOR_SUPPLY("vamic1", "ab8500-codec.0"),
-};
-
-static struct regulator_consumer_supply ab8500_vamic2_consumers[] = {
-	/* AB8500 audio-codec Mic2 supply */
-	REGULATOR_SUPPLY("vamic2", "ab8500-codec.0"),
-};
-
-static struct regulator_consumer_supply ab8500_vdmic_consumers[] = {
-	/* AB8500 audio-codec DMic supply */
-	REGULATOR_SUPPLY("vdmic", "ab8500-codec.0"),
-};
-
-static struct regulator_consumer_supply ab8500_vintcore_consumers[] = {
-	/* SoC core supply, no device */
-	REGULATOR_SUPPLY("v-intcore", NULL),
-	/* USB Transceiver */
-	REGULATOR_SUPPLY("vddulpivio18", "ab8500-usb.0"),
-	/* Handled by abx500 clk driver */
-	REGULATOR_SUPPLY("v-intcore", "abx500-clk.0"),
-};
-
-static struct regulator_consumer_supply ab8500_vana_consumers[] = {
-	/* DB8500 DSI */
-	REGULATOR_SUPPLY("vdddsi1v2", "mcde"),
-	REGULATOR_SUPPLY("vdddsi1v2", "b2r2_core"),
-	REGULATOR_SUPPLY("vdddsi1v2", "b2r2_1_core"),
-	REGULATOR_SUPPLY("vdddsi1v2", "dsilink.0"),
-	REGULATOR_SUPPLY("vdddsi1v2", "dsilink.1"),
-	REGULATOR_SUPPLY("vdddsi1v2", "dsilink.2"),
-	/* DB8500 CSI */
-	REGULATOR_SUPPLY("vddcsi1v2", "mmio_camera"),
-};
-
-/* ab8500 regulator register initialization */
-static struct ab8500_regulator_reg_init ab8500_reg_init[] = {
-	/*
-	 * VanaRequestCtrl          = HP/LP depending on VxRequest
-	 * VextSupply1RequestCtrl   = HP/LP depending on VxRequest
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL2,       0xf0, 0x00),
-	/*
-	 * VextSupply2RequestCtrl   = HP/LP depending on VxRequest
-	 * VextSupply3RequestCtrl   = HP/LP depending on VxRequest
-	 * Vaux1RequestCtrl         = HP/LP depending on VxRequest
-	 * Vaux2RequestCtrl         = HP/LP depending on VxRequest
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL3,       0xff, 0x00),
-	/*
-	 * Vaux3RequestCtrl         = HP/LP depending on VxRequest
-	 * SwHPReq                  = Control through SWValid disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL4,       0x07, 0x00),
-	/*
-	 * VanaSysClkReq1HPValid    = disabled
-	 * Vaux1SysClkReq1HPValid   = disabled
-	 * Vaux2SysClkReq1HPValid   = disabled
-	 * Vaux3SysClkReq1HPValid   = disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQ1HPVALID1, 0xe8, 0x00),
-	/*
-	 * VextSupply1SysClkReq1HPValid = disabled
-	 * VextSupply2SysClkReq1HPValid = disabled
-	 * VextSupply3SysClkReq1HPValid = SysClkReq1 controlled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQ1HPVALID2, 0x70, 0x40),
-	/*
-	 * VanaHwHPReq1Valid        = disabled
-	 * Vaux1HwHPreq1Valid       = disabled
-	 * Vaux2HwHPReq1Valid       = disabled
-	 * Vaux3HwHPReqValid        = disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ1VALID1,     0xe8, 0x00),
-	/*
-	 * VextSupply1HwHPReq1Valid = disabled
-	 * VextSupply2HwHPReq1Valid = disabled
-	 * VextSupply3HwHPReq1Valid = disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ1VALID2,     0x07, 0x00),
-	/*
-	 * VanaHwHPReq2Valid        = disabled
-	 * Vaux1HwHPReq2Valid       = disabled
-	 * Vaux2HwHPReq2Valid       = disabled
-	 * Vaux3HwHPReq2Valid       = disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ2VALID1,     0xe8, 0x00),
-	/*
-	 * VextSupply1HwHPReq2Valid = disabled
-	 * VextSupply2HwHPReq2Valid = disabled
-	 * VextSupply3HwHPReq2Valid = HWReq2 controlled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ2VALID2,     0x07, 0x04),
-	/*
-	 * VanaSwHPReqValid         = disabled
-	 * Vaux1SwHPReqValid        = disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUSWHPREQVALID1,      0xa0, 0x00),
-	/*
-	 * Vaux2SwHPReqValid        = disabled
-	 * Vaux3SwHPReqValid        = disabled
-	 * VextSupply1SwHPReqValid  = disabled
-	 * VextSupply2SwHPReqValid  = disabled
-	 * VextSupply3SwHPReqValid  = disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUSWHPREQVALID2,      0x1f, 0x00),
-	/*
-	 * SysClkReq2Valid1         = SysClkReq2 controlled
-	 * SysClkReq3Valid1         = disabled
-	 * SysClkReq4Valid1         = SysClkReq4 controlled
-	 * SysClkReq5Valid1         = disabled
-	 * SysClkReq6Valid1         = SysClkReq6 controlled
-	 * SysClkReq7Valid1         = disabled
-	 * SysClkReq8Valid1         = disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQVALID1,    0xfe, 0x2a),
-	/*
-	 * SysClkReq2Valid2         = disabled
-	 * SysClkReq3Valid2         = disabled
-	 * SysClkReq4Valid2         = disabled
-	 * SysClkReq5Valid2         = disabled
-	 * SysClkReq6Valid2         = SysClkReq6 controlled
-	 * SysClkReq7Valid2         = disabled
-	 * SysClkReq8Valid2         = disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQVALID2,    0xfe, 0x20),
-	/*
-	 * VTVoutEna                = disabled
-	 * Vintcore12Ena            = disabled
-	 * Vintcore12Sel            = 1.25 V
-	 * Vintcore12LP             = inactive (HP)
-	 * VTVoutLP                 = inactive (HP)
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUMISC1,              0xfe, 0x10),
-	/*
-	 * VaudioEna                = disabled
-	 * VdmicEna                 = disabled
-	 * Vamic1Ena                = disabled
-	 * Vamic2Ena                = disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_VAUDIOSUPPLY,           0x1e, 0x00),
-	/*
-	 * Vamic1_dzout             = high-Z when Vamic1 is disabled
-	 * Vamic2_dzout             = high-Z when Vamic2 is disabled
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUCTRL1VAMIC,         0x03, 0x00),
-	/*
-	 * VPll                     = Hw controlled (NOTE! PRCMU bits)
-	 * VanaRegu                 = force off
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_VPLLVANAREGU,           0x0f, 0x02),
-	/*
-	 * VrefDDREna               = disabled
-	 * VrefDDRSleepMode         = inactive (no pulldown)
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_VREFDDR,                0x03, 0x00),
-	/*
-	 * VextSupply1Regu          = force LP
-	 * VextSupply2Regu          = force OFF
-	 * VextSupply3Regu          = force HP (-> STBB2=LP and TPS=LP)
-	 * ExtSupply2Bypass         = ExtSupply12LPn ball is 0 when Ena is 0
-	 * ExtSupply3Bypass         = ExtSupply3LPn ball is 0 when Ena is 0
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_EXTSUPPLYREGU,          0xff, 0x13),
-	/*
-	 * Vaux1Regu                = force HP
-	 * Vaux2Regu                = force off
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_VAUX12REGU,             0x0f, 0x01),
-	/*
-	 * Vaux3Regu                = force off
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_VRF1VAUX3REGU,          0x03, 0x00),
-	/*
-	 * Vaux1Sel                 = 2.8 V
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_VAUX1SEL,               0x0f, 0x0C),
-	/*
-	 * Vaux2Sel                 = 2.9 V
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_VAUX2SEL,               0x0f, 0x0d),
-	/*
-	 * Vaux3Sel                 = 2.91 V
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_VRF1VAUX3SEL,           0x07, 0x07),
-	/*
-	 * VextSupply12LP           = disabled (no LP)
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUCTRL2SPARE,         0x01, 0x00),
-	/*
-	 * Vaux1Disch               = short discharge time
-	 * Vaux2Disch               = short discharge time
-	 * Vaux3Disch               = short discharge time
-	 * Vintcore12Disch          = short discharge time
-	 * VTVoutDisch              = short discharge time
-	 * VaudioDisch              = short discharge time
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUCTRLDISCH,          0xfc, 0x00),
-	/*
-	 * VanaDisch                = short discharge time
-	 * VdmicPullDownEna         = pulldown disabled when Vdmic is disabled
-	 * VdmicDisch               = short discharge time
-	 */
-	INIT_REGULATOR_REGISTER(AB8500_REGUCTRLDISCH2,         0x16, 0x00),
-};
-
-/* AB8500 regulators */
-static struct regulator_init_data ab8500_regulators[AB8500_NUM_REGULATORS] = {
-	/* supplies to the display/camera */
-	[AB8500_LDO_AUX1] = {
-		.supply_regulator = "ab8500-ext-supply3",
-		.constraints = {
-			.name = "V-DISPLAY",
-			.min_uV = 2800000,
-			.max_uV = 3300000,
-			.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE |
-					  REGULATOR_CHANGE_STATUS,
-			.boot_on = 1, /* display is on at boot */
-		},
-		.num_consumer_supplies = ARRAY_SIZE(ab8500_vaux1_consumers),
-		.consumer_supplies = ab8500_vaux1_consumers,
-	},
-	/* supplies to the on-board eMMC */
-	[AB8500_LDO_AUX2] = {
-		.supply_regulator = "ab8500-ext-supply3",
-		.constraints = {
-			.name = "V-eMMC1",
-			.min_uV = 1100000,
-			.max_uV = 3300000,
-			.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE |
-					  REGULATOR_CHANGE_STATUS |
-					  REGULATOR_CHANGE_MODE,
-			.valid_modes_mask = REGULATOR_MODE_NORMAL |
-					    REGULATOR_MODE_IDLE,
-		},
-		.num_consumer_supplies = ARRAY_SIZE(ab8500_vaux2_consumers),
-		.consumer_supplies = ab8500_vaux2_consumers,
-	},
-	/* supply for VAUX3, supplies to SDcard slots */
-	[AB8500_LDO_AUX3] = {
-		.supply_regulator = "ab8500-ext-supply3",
-		.constraints = {
-			.name = "V-MMC-SD",
-			.min_uV = 1100000,
-			.max_uV = 3300000,
-			.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE |
-					  REGULATOR_CHANGE_STATUS |
-					  REGULATOR_CHANGE_MODE,
-			.valid_modes_mask = REGULATOR_MODE_NORMAL |
-					    REGULATOR_MODE_IDLE,
-		},
-		.num_consumer_supplies = ARRAY_SIZE(ab8500_vaux3_consumers),
-		.consumer_supplies = ab8500_vaux3_consumers,
-	},
-	/* supply for tvout, gpadc, TVOUT LDO */
-	[AB8500_LDO_TVOUT] = {
-		.constraints = {
-			.name = "V-TVOUT",
-			.valid_ops_mask = REGULATOR_CHANGE_STATUS,
-		},
-		.num_consumer_supplies = ARRAY_SIZE(ab8500_vtvout_consumers),
-		.consumer_supplies = ab8500_vtvout_consumers,
-	},
-	/* supply for ab8500-vaudio, VAUDIO LDO */
-	[AB8500_LDO_AUDIO] = {
-		.constraints = {
-			.name = "V-AUD",
-			.valid_ops_mask = REGULATOR_CHANGE_STATUS,
-		},
-		.num_consumer_supplies = ARRAY_SIZE(ab8500_vaud_consumers),
-		.consumer_supplies = ab8500_vaud_consumers,
-	},
-	/* supply for v-anamic1 VAMic1-LDO */
-	[AB8500_LDO_ANAMIC1] = {
-		.constraints = {
-			.name = "V-AMIC1",
-			.valid_ops_mask = REGULATOR_CHANGE_STATUS,
-		},
-		.num_consumer_supplies = ARRAY_SIZE(ab8500_vamic1_consumers),
-		.consumer_supplies = ab8500_vamic1_consumers,
-	},
-	/* supply for v-amic2, VAMIC2 LDO, reuse constants for AMIC1 */
-	[AB8500_LDO_ANAMIC2] = {
-		.constraints = {
-			.name = "V-AMIC2",
-			.valid_ops_mask = REGULATOR_CHANGE_STATUS,
-		},
-		.num_consumer_supplies = ARRAY_SIZE(ab8500_vamic2_consumers),
-		.consumer_supplies = ab8500_vamic2_consumers,
-	},
-	/* supply for v-dmic, VDMIC LDO */
-	[AB8500_LDO_DMIC] = {
-		.constraints = {
-			.name = "V-DMIC",
-			.valid_ops_mask = REGULATOR_CHANGE_STATUS,
-		},
-		.num_consumer_supplies = ARRAY_SIZE(ab8500_vdmic_consumers),
-		.consumer_supplies = ab8500_vdmic_consumers,
-	},
-	/* supply for v-intcore12, VINTCORE12 LDO */
-	[AB8500_LDO_INTCORE] = {
-		.constraints = {
-			.name = "V-INTCORE",
-			.min_uV = 1250000,
-			.max_uV = 1350000,
-			.input_uV = 1800000,
-			.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE |
-					  REGULATOR_CHANGE_STATUS |
-					  REGULATOR_CHANGE_MODE |
-					  REGULATOR_CHANGE_DRMS,
-			.valid_modes_mask = REGULATOR_MODE_NORMAL |
-					    REGULATOR_MODE_IDLE,
-		},
-		.num_consumer_supplies = ARRAY_SIZE(ab8500_vintcore_consumers),
-		.consumer_supplies = ab8500_vintcore_consumers,
-	},
-	/* supply for U8500 CSI-DSI, VANA LDO */
-	[AB8500_LDO_ANA] = {
-		.constraints = {
-			.name = "V-CSI-DSI",
-			.valid_ops_mask = REGULATOR_CHANGE_STATUS,
-		},
-		.num_consumer_supplies = ARRAY_SIZE(ab8500_vana_consumers),
-		.consumer_supplies = ab8500_vana_consumers,
-	},
-};
-
 /* supply for VextSupply3 */
 static struct regulator_consumer_supply ab8500_ext_supply3_consumers[] = {
 	/* SIM supply for 3 V SIM cards */
@@ -465,15 +68,6 @@ static struct regulator_init_data ab8500_ext_regulators[] = {
 	},
 };
 
-static struct ab8500_regulator_platform_data ab8500_regulator_plat_data = {
-	.reg_init               = ab8500_reg_init,
-	.num_reg_init           = ARRAY_SIZE(ab8500_reg_init),
-	.regulator              = ab8500_regulators,
-	.num_regulator          = ARRAY_SIZE(ab8500_regulators),
-	.ext_regulator          = ab8500_ext_regulators,
-	.num_ext_regulator      = ARRAY_SIZE(ab8500_ext_regulators),
-};
-
 /**
  * struct ab8500_ext_regulator_info - ab8500 regulator information
  * @dev: device pointer
@@ -788,7 +382,6 @@ static struct ab8500_ext_regulator_info
 static int ab8500_ext_regulator_probe(struct platform_device *pdev)
 {
 	struct ab8500 *ab8500 = dev_get_drvdata(pdev->dev.parent);
-	struct ab8500_regulator_platform_data *pdata = &ab8500_regulator_plat_data;
 	struct regulator_config config = { };
 	struct regulator_dev *rdev;
 	int i;
@@ -798,12 +391,6 @@ static int ab8500_ext_regulator_probe(struct platform_device *pdev)
 		return -EINVAL;
 	}
 
-	/* make sure the platform data has the correct size */
-	if (pdata->num_ext_regulator != ARRAY_SIZE(ab8500_ext_regulator_info)) {
-		dev_err(&pdev->dev, "Configuration error: size mismatch.\n");
-		return -EINVAL;
-	}
-
 	/* check for AB8500 2.x */
 	if (is_ab8500_2p0_or_earlier(ab8500)) {
 		struct ab8500_ext_regulator_info *info;
@@ -823,11 +410,11 @@ static int ab8500_ext_regulator_probe(struct platform_device *pdev)
 		info = &ab8500_ext_regulator_info[i];
 		info->dev = &pdev->dev;
 		info->cfg = (struct ab8500_ext_regulator_cfg *)
-			pdata->ext_regulator[i].driver_data;
+			ab8500_ext_regulators[i].driver_data;
 
 		config.dev = &pdev->dev;
 		config.driver_data = info;
-		config.init_data = &pdata->ext_regulator[i];
+		config.init_data = &ab8500_ext_regulators[i];
 
 		/* register regulator with framework */
 		rdev = devm_regulator_register(&pdev->dev, &info->desc,
diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h
index 524a7e4702c2..302a330c5c84 100644
--- a/include/linux/mfd/abx500/ab8500.h
+++ b/include/linux/mfd/abx500/ab8500.h
@@ -368,7 +368,6 @@ struct ab8500 {
 	int it_latchhier_num;
 };
 
-struct ab8500_regulator_platform_data;
 struct ab8500_codec_platform_data;
 struct ab8500_sysctrl_platform_data;
 
@@ -376,11 +375,9 @@ struct ab8500_sysctrl_platform_data;
  * struct ab8500_platform_data - AB8500 platform data
  * @irq_base: start of AB8500 IRQs, AB8500_NR_IRQS will be used
  * @init: board-specific initialization after detection of ab8500
- * @regulator: machine-specific constraints for regulators
  */
 struct ab8500_platform_data {
 	void (*init) (struct ab8500 *);
-	struct ab8500_regulator_platform_data *regulator;
 	struct ab8500_codec_platform_data *codec;
 	struct ab8500_sysctrl_platform_data *sysctrl;
 };
diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h
index 3ab1ddf151a2..ecf7b4713df9 100644
--- a/include/linux/regulator/ab8500.h
+++ b/include/linux/regulator/ab8500.h
@@ -153,14 +153,4 @@ enum ab8500_ext_regulator_id {
 	AB8500_NUM_EXT_REGULATORS,
 };
 
-/* AB8500 regulator platform data */
-struct ab8500_regulator_platform_data {
-	int num_reg_init;
-	struct ab8500_regulator_reg_init *reg_init;
-	int num_regulator;
-	struct regulator_init_data *regulator;
-	int num_ext_regulator;
-	struct regulator_init_data *ext_regulator;
-};
-
 #endif
-- 
cgit v1.2.3


From 3acb64c07e95a75dc0af0bc958f2d09a44a9fd0d Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sat, 5 Dec 2020 01:40:57 +0100
Subject: regulator: ab8500: Decomission platform data header

The platform data header was only used to pass platform
data from board files. We now populate the regulators
exclusively from device tree, so the header contents can
be moved into the regulator drivers.

Cc: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20201205004057.1712753-2-linus.walleij@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/mfd/ab8500-core.c        |   1 -
 drivers/regulator/ab8500-ext.c   |  13 +++-
 drivers/regulator/ab8500.c       | 116 ++++++++++++++++++++++++++++-
 include/linux/regulator/ab8500.h | 156 ---------------------------------------
 4 files changed, 127 insertions(+), 159 deletions(-)
 delete mode 100644 include/linux/regulator/ab8500.h

(limited to 'include/linux')

diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index a3bac9da8cbb..3b2276f04a98 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -21,7 +21,6 @@
 #include <linux/mfd/abx500/ab8500.h>
 #include <linux/mfd/abx500/ab8500-bm.h>
 #include <linux/mfd/dbx500-prcmu.h>
-#include <linux/regulator/ab8500.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
 
diff --git a/drivers/regulator/ab8500-ext.c b/drivers/regulator/ab8500-ext.c
index 05f9531bd108..4f26952caa56 100644
--- a/drivers/regulator/ab8500-ext.c
+++ b/drivers/regulator/ab8500-ext.c
@@ -22,7 +22,18 @@
 #include <linux/regulator/of_regulator.h>
 #include <linux/mfd/abx500.h>
 #include <linux/mfd/abx500/ab8500.h>
-#include <linux/regulator/ab8500.h>
+
+/* AB8500 external regulators */
+enum ab8500_ext_regulator_id {
+	AB8500_EXT_SUPPLY1,
+	AB8500_EXT_SUPPLY2,
+	AB8500_EXT_SUPPLY3,
+	AB8500_NUM_EXT_REGULATORS,
+};
+
+struct ab8500_ext_regulator_cfg {
+	bool hwreq; /* requires hw mode or high power mode */
+};
 
 /* supply for VextSupply3 */
 static struct regulator_consumer_supply ab8500_ext_supply3_consumers[] = {
diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c
index 47b8b6f7b571..23a401734a98 100644
--- a/drivers/regulator/ab8500.c
+++ b/drivers/regulator/ab8500.c
@@ -25,9 +25,123 @@
 #include <linux/regulator/of_regulator.h>
 #include <linux/regulator/driver.h>
 #include <linux/regulator/machine.h>
-#include <linux/regulator/ab8500.h>
 #include <linux/slab.h>
 
+/* AB8500 regulators */
+enum ab8500_regulator_id {
+	AB8500_LDO_AUX1,
+	AB8500_LDO_AUX2,
+	AB8500_LDO_AUX3,
+	AB8500_LDO_INTCORE,
+	AB8500_LDO_TVOUT,
+	AB8500_LDO_AUDIO,
+	AB8500_LDO_ANAMIC1,
+	AB8500_LDO_ANAMIC2,
+	AB8500_LDO_DMIC,
+	AB8500_LDO_ANA,
+	AB8500_NUM_REGULATORS,
+};
+
+/* AB8505 regulators */
+enum ab8505_regulator_id {
+	AB8505_LDO_AUX1,
+	AB8505_LDO_AUX2,
+	AB8505_LDO_AUX3,
+	AB8505_LDO_AUX4,
+	AB8505_LDO_AUX5,
+	AB8505_LDO_AUX6,
+	AB8505_LDO_INTCORE,
+	AB8505_LDO_ADC,
+	AB8505_LDO_AUDIO,
+	AB8505_LDO_ANAMIC1,
+	AB8505_LDO_ANAMIC2,
+	AB8505_LDO_AUX8,
+	AB8505_LDO_ANA,
+	AB8505_NUM_REGULATORS,
+};
+
+/* AB8500 registers */
+enum ab8500_regulator_reg {
+	AB8500_REGUREQUESTCTRL2,
+	AB8500_REGUREQUESTCTRL3,
+	AB8500_REGUREQUESTCTRL4,
+	AB8500_REGUSYSCLKREQ1HPVALID1,
+	AB8500_REGUSYSCLKREQ1HPVALID2,
+	AB8500_REGUHWHPREQ1VALID1,
+	AB8500_REGUHWHPREQ1VALID2,
+	AB8500_REGUHWHPREQ2VALID1,
+	AB8500_REGUHWHPREQ2VALID2,
+	AB8500_REGUSWHPREQVALID1,
+	AB8500_REGUSWHPREQVALID2,
+	AB8500_REGUSYSCLKREQVALID1,
+	AB8500_REGUSYSCLKREQVALID2,
+	AB8500_REGUMISC1,
+	AB8500_VAUDIOSUPPLY,
+	AB8500_REGUCTRL1VAMIC,
+	AB8500_VPLLVANAREGU,
+	AB8500_VREFDDR,
+	AB8500_EXTSUPPLYREGU,
+	AB8500_VAUX12REGU,
+	AB8500_VRF1VAUX3REGU,
+	AB8500_VAUX1SEL,
+	AB8500_VAUX2SEL,
+	AB8500_VRF1VAUX3SEL,
+	AB8500_REGUCTRL2SPARE,
+	AB8500_REGUCTRLDISCH,
+	AB8500_REGUCTRLDISCH2,
+	AB8500_NUM_REGULATOR_REGISTERS,
+};
+
+/* AB8505 registers */
+enum ab8505_regulator_reg {
+	AB8505_REGUREQUESTCTRL1,
+	AB8505_REGUREQUESTCTRL2,
+	AB8505_REGUREQUESTCTRL3,
+	AB8505_REGUREQUESTCTRL4,
+	AB8505_REGUSYSCLKREQ1HPVALID1,
+	AB8505_REGUSYSCLKREQ1HPVALID2,
+	AB8505_REGUHWHPREQ1VALID1,
+	AB8505_REGUHWHPREQ1VALID2,
+	AB8505_REGUHWHPREQ2VALID1,
+	AB8505_REGUHWHPREQ2VALID2,
+	AB8505_REGUSWHPREQVALID1,
+	AB8505_REGUSWHPREQVALID2,
+	AB8505_REGUSYSCLKREQVALID1,
+	AB8505_REGUSYSCLKREQVALID2,
+	AB8505_REGUVAUX4REQVALID,
+	AB8505_REGUMISC1,
+	AB8505_VAUDIOSUPPLY,
+	AB8505_REGUCTRL1VAMIC,
+	AB8505_VSMPSAREGU,
+	AB8505_VSMPSBREGU,
+	AB8505_VSAFEREGU, /* NOTE! PRCMU register */
+	AB8505_VPLLVANAREGU,
+	AB8505_EXTSUPPLYREGU,
+	AB8505_VAUX12REGU,
+	AB8505_VRF1VAUX3REGU,
+	AB8505_VSMPSASEL1,
+	AB8505_VSMPSASEL2,
+	AB8505_VSMPSASEL3,
+	AB8505_VSMPSBSEL1,
+	AB8505_VSMPSBSEL2,
+	AB8505_VSMPSBSEL3,
+	AB8505_VSAFESEL1, /* NOTE! PRCMU register */
+	AB8505_VSAFESEL2, /* NOTE! PRCMU register */
+	AB8505_VSAFESEL3, /* NOTE! PRCMU register */
+	AB8505_VAUX1SEL,
+	AB8505_VAUX2SEL,
+	AB8505_VRF1VAUX3SEL,
+	AB8505_VAUX4REQCTRL,
+	AB8505_VAUX4REGU,
+	AB8505_VAUX4SEL,
+	AB8505_REGUCTRLDISCH,
+	AB8505_REGUCTRLDISCH2,
+	AB8505_REGUCTRLDISCH3,
+	AB8505_CTRLVAUX5,
+	AB8505_CTRLVAUX6,
+	AB8505_NUM_REGULATOR_REGISTERS,
+};
+
 /**
  * struct ab8500_shared_mode - is used when mode is shared between
  * two regulators.
diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h
deleted file mode 100644
index ecf7b4713df9..000000000000
--- a/include/linux/regulator/ab8500.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson SA 2010
- *
- * Authors: Sundar Iyer <sundar.iyer@stericsson.com> for ST-Ericsson
- *          Bengt Jonsson <bengt.g.jonsson@stericsson.com> for ST-Ericsson
- *          Daniel Willerud <daniel.willerud@stericsson.com> for ST-Ericsson
- */
-
-#ifndef __LINUX_MFD_AB8500_REGULATOR_H
-#define __LINUX_MFD_AB8500_REGULATOR_H
-
-#include <linux/platform_device.h>
-
-/* AB8500 regulators */
-enum ab8500_regulator_id {
-	AB8500_LDO_AUX1,
-	AB8500_LDO_AUX2,
-	AB8500_LDO_AUX3,
-	AB8500_LDO_INTCORE,
-	AB8500_LDO_TVOUT,
-	AB8500_LDO_AUDIO,
-	AB8500_LDO_ANAMIC1,
-	AB8500_LDO_ANAMIC2,
-	AB8500_LDO_DMIC,
-	AB8500_LDO_ANA,
-	AB8500_NUM_REGULATORS,
-};
-
-/* AB8505 regulators */
-enum ab8505_regulator_id {
-	AB8505_LDO_AUX1,
-	AB8505_LDO_AUX2,
-	AB8505_LDO_AUX3,
-	AB8505_LDO_AUX4,
-	AB8505_LDO_AUX5,
-	AB8505_LDO_AUX6,
-	AB8505_LDO_INTCORE,
-	AB8505_LDO_ADC,
-	AB8505_LDO_AUDIO,
-	AB8505_LDO_ANAMIC1,
-	AB8505_LDO_ANAMIC2,
-	AB8505_LDO_AUX8,
-	AB8505_LDO_ANA,
-	AB8505_NUM_REGULATORS,
-};
-
-/* AB8500 and AB8505 register initialization */
-struct ab8500_regulator_reg_init {
-	int id;
-	u8 mask;
-	u8 value;
-};
-
-#define INIT_REGULATOR_REGISTER(_id, _mask, _value)	\
-	{						\
-		.id = _id,				\
-		.mask = _mask,				\
-		.value = _value,			\
-	}
-
-/* AB8500 registers */
-enum ab8500_regulator_reg {
-	AB8500_REGUREQUESTCTRL2,
-	AB8500_REGUREQUESTCTRL3,
-	AB8500_REGUREQUESTCTRL4,
-	AB8500_REGUSYSCLKREQ1HPVALID1,
-	AB8500_REGUSYSCLKREQ1HPVALID2,
-	AB8500_REGUHWHPREQ1VALID1,
-	AB8500_REGUHWHPREQ1VALID2,
-	AB8500_REGUHWHPREQ2VALID1,
-	AB8500_REGUHWHPREQ2VALID2,
-	AB8500_REGUSWHPREQVALID1,
-	AB8500_REGUSWHPREQVALID2,
-	AB8500_REGUSYSCLKREQVALID1,
-	AB8500_REGUSYSCLKREQVALID2,
-	AB8500_REGUMISC1,
-	AB8500_VAUDIOSUPPLY,
-	AB8500_REGUCTRL1VAMIC,
-	AB8500_VPLLVANAREGU,
-	AB8500_VREFDDR,
-	AB8500_EXTSUPPLYREGU,
-	AB8500_VAUX12REGU,
-	AB8500_VRF1VAUX3REGU,
-	AB8500_VAUX1SEL,
-	AB8500_VAUX2SEL,
-	AB8500_VRF1VAUX3SEL,
-	AB8500_REGUCTRL2SPARE,
-	AB8500_REGUCTRLDISCH,
-	AB8500_REGUCTRLDISCH2,
-	AB8500_NUM_REGULATOR_REGISTERS,
-};
-
-/* AB8505 registers */
-enum ab8505_regulator_reg {
-	AB8505_REGUREQUESTCTRL1,
-	AB8505_REGUREQUESTCTRL2,
-	AB8505_REGUREQUESTCTRL3,
-	AB8505_REGUREQUESTCTRL4,
-	AB8505_REGUSYSCLKREQ1HPVALID1,
-	AB8505_REGUSYSCLKREQ1HPVALID2,
-	AB8505_REGUHWHPREQ1VALID1,
-	AB8505_REGUHWHPREQ1VALID2,
-	AB8505_REGUHWHPREQ2VALID1,
-	AB8505_REGUHWHPREQ2VALID2,
-	AB8505_REGUSWHPREQVALID1,
-	AB8505_REGUSWHPREQVALID2,
-	AB8505_REGUSYSCLKREQVALID1,
-	AB8505_REGUSYSCLKREQVALID2,
-	AB8505_REGUVAUX4REQVALID,
-	AB8505_REGUMISC1,
-	AB8505_VAUDIOSUPPLY,
-	AB8505_REGUCTRL1VAMIC,
-	AB8505_VSMPSAREGU,
-	AB8505_VSMPSBREGU,
-	AB8505_VSAFEREGU, /* NOTE! PRCMU register */
-	AB8505_VPLLVANAREGU,
-	AB8505_EXTSUPPLYREGU,
-	AB8505_VAUX12REGU,
-	AB8505_VRF1VAUX3REGU,
-	AB8505_VSMPSASEL1,
-	AB8505_VSMPSASEL2,
-	AB8505_VSMPSASEL3,
-	AB8505_VSMPSBSEL1,
-	AB8505_VSMPSBSEL2,
-	AB8505_VSMPSBSEL3,
-	AB8505_VSAFESEL1, /* NOTE! PRCMU register */
-	AB8505_VSAFESEL2, /* NOTE! PRCMU register */
-	AB8505_VSAFESEL3, /* NOTE! PRCMU register */
-	AB8505_VAUX1SEL,
-	AB8505_VAUX2SEL,
-	AB8505_VRF1VAUX3SEL,
-	AB8505_VAUX4REQCTRL,
-	AB8505_VAUX4REGU,
-	AB8505_VAUX4SEL,
-	AB8505_REGUCTRLDISCH,
-	AB8505_REGUCTRLDISCH2,
-	AB8505_REGUCTRLDISCH3,
-	AB8505_CTRLVAUX5,
-	AB8505_CTRLVAUX6,
-	AB8505_NUM_REGULATOR_REGISTERS,
-};
-
-/* AB8500 external regulators */
-struct ab8500_ext_regulator_cfg {
-	bool hwreq; /* requires hw mode or high power mode */
-};
-
-enum ab8500_ext_regulator_id {
-	AB8500_EXT_SUPPLY1,
-	AB8500_EXT_SUPPLY2,
-	AB8500_EXT_SUPPLY3,
-	AB8500_NUM_EXT_REGULATORS,
-};
-
-#endif
-- 
cgit v1.2.3


From 1e9d63331f8fa556f31e1406ab12f2a1e5cdb495 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 12 Jan 2021 11:02:44 -0800
Subject: fs: correctly document the inode dirty flags

The documentation for I_DIRTY_SYNC and I_DIRTY_DATASYNC is a bit
misleading, and I_DIRTY_TIME isn't documented at all.  Fix this.

Link: https://lore.kernel.org/r/20210112190253.64307-3-ebiggers@kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fs.h | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..45a0303b2aeb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2084,8 +2084,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
 /*
  * Inode state bits.  Protected by inode->i_lock
  *
- * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
- * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
+ * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
+ * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
  *
  * Four bits define the lifetime of an inode.  Initially, inodes are I_NEW,
  * until that flag is cleared.  I_WILL_FREE, I_FREEING and I_CLEAR are set at
@@ -2094,12 +2094,20 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
  * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
  *
  * I_DIRTY_SYNC		Inode is dirty, but doesn't have to be written on
- *			fdatasync().  i_atime is the usual cause.
- * I_DIRTY_DATASYNC	Data-related inode changes pending. We keep track of
+ *			fdatasync() (unless I_DIRTY_DATASYNC is also set).
+ *			Timestamp updates are the usual cause.
+ * I_DIRTY_DATASYNC	Data-related inode changes pending.  We keep track of
  *			these changes separately from I_DIRTY_SYNC so that we
  *			don't have to write inode on fdatasync() when only
- *			mtime has changed in it.
+ *			e.g. the timestamps have changed.
  * I_DIRTY_PAGES	Inode has dirty pages.  Inode itself may be clean.
+ * I_DIRTY_TIME		The inode itself only has dirty timestamps, and the
+ *			lazytime mount option is enabled.  We keep track of this
+ *			separately from I_DIRTY_SYNC in order to implement
+ *			lazytime.  This gets cleared if I_DIRTY_INODE
+ *			(I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set.  I.e.
+ *			either I_DIRTY_TIME *or* I_DIRTY_INODE can be set in
+ *			i_state, but not both.  I_DIRTY_PAGES may still be set.
  * I_NEW		Serves as both a mutex and completion notification.
  *			New inodes set I_NEW.  If two processes both create
  *			the same inode, one of them will release its inode and
-- 
cgit v1.2.3


From ed296c6c05b0ac52d7c6bf13a90f02b8b8222169 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 12 Jan 2021 11:02:53 -0800
Subject: ext4: simplify i_state checks in __ext4_update_other_inode_time()

Since I_DIRTY_TIME and I_DIRTY_INODE are mutually exclusive in i_state,
there's no need to check for I_DIRTY_TIME && !I_DIRTY_INODE.  Just check
for I_DIRTY_TIME.

Also introduce a helper function in include/linux/fs.h to do this check.

Link: https://lore.kernel.org/r/20210112190253.64307-12-ebiggers@kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c    |  8 ++------
 include/linux/fs.h | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4cc6c7834312..d809a06b6ef7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4961,15 +4961,11 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
 	if (!inode)
 		return;
 
-	if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
-			       I_DIRTY_INODE)) ||
-	    ((inode->i_state & I_DIRTY_TIME) == 0))
+	if (!inode_is_dirtytime_only(inode))
 		return;
 
 	spin_lock(&inode->i_lock);
-	if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
-				I_DIRTY_INODE)) == 0) &&
-	    (inode->i_state & I_DIRTY_TIME)) {
+	if (inode_is_dirtytime_only(inode)) {
 		struct ext4_inode_info	*ei = EXT4_I(inode);
 
 		inode->i_state &= ~I_DIRTY_TIME;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 45a0303b2aeb..de0c789104c2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2194,6 +2194,21 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
+/*
+ * Returns true if the given inode itself only has dirty timestamps (its pages
+ * may still be dirty) and isn't currently being allocated or freed.
+ * Filesystems should call this if when writing an inode when lazytime is
+ * enabled, they want to opportunistically write the timestamps of other inodes
+ * located very nearby on-disk, e.g. in the same inode block.  This returns true
+ * if the given inode is in need of such an opportunistic update.  Requires
+ * i_lock, or at least later re-checking under i_lock.
+ */
+static inline bool inode_is_dirtytime_only(struct inode *inode)
+{
+	return (inode->i_state & (I_DIRTY_TIME | I_NEW |
+				  I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
+}
+
 extern void inc_nlink(struct inode *inode);
 extern void drop_nlink(struct inode *inode);
 extern void clear_nlink(struct inode *inode);
-- 
cgit v1.2.3


From 660343d063f7b7151a8c679d91ebe13cf40ad866 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Wed, 13 Jan 2021 13:49:21 +0200
Subject: dmaengine: Extend the dmaengine_alignment for 128 and 256 bytes

Some DMA device can benefit with higher order of alignment than the maximum
of 64 bytes currently defined.

Define 128 and 256 bytes alignment for these devices.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@gmail.com>
Tested-by: Kishon Vijay Abraham I <kishon@ti.com>
Link: https://lore.kernel.org/r/20210113114923.9231-2-peter.ujfalusi@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 68130f5f599e..004736b6a9c8 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -745,6 +745,8 @@ enum dmaengine_alignment {
 	DMAENGINE_ALIGN_16_BYTES = 4,
 	DMAENGINE_ALIGN_32_BYTES = 5,
 	DMAENGINE_ALIGN_64_BYTES = 6,
+	DMAENGINE_ALIGN_128_BYTES = 7,
+	DMAENGINE_ALIGN_256_BYTES = 8,
 };
 
 /**
-- 
cgit v1.2.3


From a3b4388ea19b055ec482114e227c58d0184edfa5 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sun, 10 Jan 2021 15:49:05 +0100
Subject: power: supply: z2_battery: Convert to GPIO descriptors

This converts the Palm Z2 battery driver to use GPIO descriptors.

Cc: Haojian Zhuang <haojian.zhuang@gmail.com>
Cc: Robert Jarzmik <robert.jarzmik@free.fr>
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 arch/arm/mach-pxa/z2.c            | 12 +++++++++-
 drivers/power/supply/z2_battery.c | 46 ++++++++++++++++++---------------------
 include/linux/z2_battery.h        |  1 -
 3 files changed, 32 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/z2.c b/arch/arm/mach-pxa/z2.c
index 21fd76bb09cd..a5dad8d08cac 100644
--- a/arch/arm/mach-pxa/z2.c
+++ b/arch/arm/mach-pxa/z2.c
@@ -488,7 +488,6 @@ static struct z2_battery_info batt_chip_info = {
 	.batt_I2C_bus	= 0,
 	.batt_I2C_addr	= 0x55,
 	.batt_I2C_reg	= 2,
-	.charge_gpio	= GPIO0_ZIPITZ2_AC_DETECT,
 	.min_voltage	= 3475000,
 	.max_voltage	= 4190000,
 	.batt_div	= 59,
@@ -497,9 +496,19 @@ static struct z2_battery_info batt_chip_info = {
 	.batt_name	= "Z2",
 };
 
+static struct gpiod_lookup_table z2_battery_gpio_table = {
+	.dev_id = "aer915",
+	.table = {
+		GPIO_LOOKUP("gpio-pxa", GPIO0_ZIPITZ2_AC_DETECT,
+			    NULL, GPIO_ACTIVE_HIGH),
+		{ },
+	},
+};
+
 static struct i2c_board_info __initdata z2_i2c_board_info[] = {
 	{
 		I2C_BOARD_INFO("aer915", 0x55),
+		.dev_name = "aer915",
 		.platform_data	= &batt_chip_info,
 	}, {
 		I2C_BOARD_INFO("wm8750", 0x1b),
@@ -510,6 +519,7 @@ static struct i2c_board_info __initdata z2_i2c_board_info[] = {
 static void __init z2_i2c_init(void)
 {
 	pxa_set_i2c_info(NULL);
+	gpiod_add_lookup_table(&z2_battery_gpio_table);
 	i2c_register_board_info(0, ARRAY_AND_SIZE(z2_i2c_board_info));
 }
 #else
diff --git a/drivers/power/supply/z2_battery.c b/drivers/power/supply/z2_battery.c
index ebd2e42a4457..b1508fe70e5e 100644
--- a/drivers/power/supply/z2_battery.c
+++ b/drivers/power/supply/z2_battery.c
@@ -6,7 +6,7 @@
  */
 
 #include <linux/module.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/i2c.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
@@ -18,6 +18,7 @@
 
 struct z2_charger {
 	struct z2_battery_info		*info;
+	struct gpio_desc		*charge_gpiod;
 	int				bat_status;
 	struct i2c_client		*client;
 	struct power_supply		*batt_ps;
@@ -95,8 +96,8 @@ static void z2_batt_update(struct z2_charger *charger)
 
 	mutex_lock(&charger->work_lock);
 
-	charger->bat_status = (info->charge_gpio >= 0) ?
-		(gpio_get_value(info->charge_gpio) ?
+	charger->bat_status = charger->charge_gpiod ?
+		(gpiod_get_value(charger->charge_gpiod) ?
 		POWER_SUPPLY_STATUS_CHARGING :
 		POWER_SUPPLY_STATUS_DISCHARGING) :
 		POWER_SUPPLY_STATUS_UNKNOWN;
@@ -131,7 +132,7 @@ static int z2_batt_ps_init(struct z2_charger *charger, int props)
 	enum power_supply_property *prop;
 	struct z2_battery_info *info = charger->info;
 
-	if (info->charge_gpio >= 0)
+	if (charger->charge_gpiod)
 		props++;	/* POWER_SUPPLY_PROP_STATUS */
 	if (info->batt_tech >= 0)
 		props++;	/* POWER_SUPPLY_PROP_TECHNOLOGY */
@@ -147,7 +148,7 @@ static int z2_batt_ps_init(struct z2_charger *charger, int props)
 		return -ENOMEM;
 
 	prop[i++] = POWER_SUPPLY_PROP_PRESENT;
-	if (info->charge_gpio >= 0)
+	if (charger->charge_gpiod)
 		prop[i++] = POWER_SUPPLY_PROP_STATUS;
 	if (info->batt_tech >= 0)
 		prop[i++] = POWER_SUPPLY_PROP_TECHNOLOGY;
@@ -206,22 +207,23 @@ static int z2_batt_probe(struct i2c_client *client,
 
 	mutex_init(&charger->work_lock);
 
-	if (info->charge_gpio >= 0 && gpio_is_valid(info->charge_gpio)) {
-		ret = gpio_request(info->charge_gpio, "BATT CHRG");
-		if (ret)
-			goto err;
+	charger->charge_gpiod = devm_gpiod_get_optional(&client->dev,
+							NULL, GPIOD_IN);
+	if (IS_ERR(charger->charge_gpiod))
+		return dev_err_probe(&client->dev,
+				     PTR_ERR(charger->charge_gpiod),
+				     "failed to get charge GPIO\n");
 
-		ret = gpio_direction_input(info->charge_gpio);
-		if (ret)
-			goto err2;
+	if (charger->charge_gpiod) {
+		gpiod_set_consumer_name(charger->charge_gpiod, "BATT CHRG");
 
-		irq_set_irq_type(gpio_to_irq(info->charge_gpio),
+		irq_set_irq_type(gpiod_to_irq(charger->charge_gpiod),
 				 IRQ_TYPE_EDGE_BOTH);
-		ret = request_irq(gpio_to_irq(info->charge_gpio),
+		ret = request_irq(gpiod_to_irq(charger->charge_gpiod),
 				z2_charge_switch_irq, 0,
 				"AC Detect", charger);
 		if (ret)
-			goto err3;
+			goto err;
 	}
 
 	ret = z2_batt_ps_init(charger, props);
@@ -245,11 +247,8 @@ static int z2_batt_probe(struct i2c_client *client,
 err4:
 	kfree(charger->batt_ps_desc.properties);
 err3:
-	if (info->charge_gpio >= 0 && gpio_is_valid(info->charge_gpio))
-		free_irq(gpio_to_irq(info->charge_gpio), charger);
-err2:
-	if (info->charge_gpio >= 0 && gpio_is_valid(info->charge_gpio))
-		gpio_free(info->charge_gpio);
+	if (charger->charge_gpiod)
+		free_irq(gpiod_to_irq(charger->charge_gpiod), charger);
 err:
 	kfree(charger);
 	return ret;
@@ -258,16 +257,13 @@ err:
 static int z2_batt_remove(struct i2c_client *client)
 {
 	struct z2_charger *charger = i2c_get_clientdata(client);
-	struct z2_battery_info *info = charger->info;
 
 	cancel_work_sync(&charger->bat_work);
 	power_supply_unregister(charger->batt_ps);
 
 	kfree(charger->batt_ps_desc.properties);
-	if (info->charge_gpio >= 0 && gpio_is_valid(info->charge_gpio)) {
-		free_irq(gpio_to_irq(info->charge_gpio), charger);
-		gpio_free(info->charge_gpio);
-	}
+	if (charger->charge_gpiod)
+		free_irq(gpiod_to_irq(charger->charge_gpiod), charger);
 
 	kfree(charger);
 
diff --git a/include/linux/z2_battery.h b/include/linux/z2_battery.h
index eaba53ff387c..9e8be7a7cd25 100644
--- a/include/linux/z2_battery.h
+++ b/include/linux/z2_battery.h
@@ -6,7 +6,6 @@ struct z2_battery_info {
 	int	 batt_I2C_bus;
 	int	 batt_I2C_addr;
 	int	 batt_I2C_reg;
-	int	 charge_gpio;
 	int	 min_voltage;
 	int	 max_voltage;
 	int	 batt_div;
-- 
cgit v1.2.3


From cb6d6918c56ffd98e88164d5471f692d33dabf2b Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 11 Jan 2021 00:45:08 +0100
Subject: power: supply: wm97xx_battery: Convert to GPIO descriptor

This converts the WM97xx driver to use a GPIO descriptor
instead of passing a GPIO number thru platform data.

Like everything else in the driver, use a simple local
variable for the descriptor, it can only ever appear in
one instance anyway so it should not hurt.

After converting the driver I noticed that none of the
boardfiles actually define a meaningful GPIO line for
this, but hey, it is converted.

Cc: Haojian Zhuang <haojian.zhuang@gmail.com>
Cc: Robert Jarzmik <robert.jarzmik@free.fr>
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 arch/arm/mach-pxa/mioa701.c           |  1 -
 arch/arm/mach-pxa/palm27x.c           |  1 -
 arch/arm/mach-pxa/palmte2.c           |  1 -
 drivers/power/supply/wm97xx_battery.c | 45 +++++++++++++++--------------------
 include/linux/wm97xx.h                |  1 -
 5 files changed, 19 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/mioa701.c b/arch/arm/mach-pxa/mioa701.c
index d3af80317f2d..a79f296e81e0 100644
--- a/arch/arm/mach-pxa/mioa701.c
+++ b/arch/arm/mach-pxa/mioa701.c
@@ -577,7 +577,6 @@ static struct platform_device power_dev = {
 static struct wm97xx_batt_pdata mioa701_battery_data = {
 	.batt_aux	= WM97XX_AUX_ID1,
 	.temp_aux	= -1,
-	.charge_gpio	= -1,
 	.min_voltage	= 0xc00,
 	.max_voltage	= 0xfc0,
 	.batt_tech	= POWER_SUPPLY_TECHNOLOGY_LION,
diff --git a/arch/arm/mach-pxa/palm27x.c b/arch/arm/mach-pxa/palm27x.c
index 0d246a1aebbc..6230381a7ca0 100644
--- a/arch/arm/mach-pxa/palm27x.c
+++ b/arch/arm/mach-pxa/palm27x.c
@@ -212,7 +212,6 @@ void __init palm27x_irda_init(int pwdn)
 static struct wm97xx_batt_pdata palm27x_batt_pdata = {
 	.batt_aux	= WM97XX_AUX_ID3,
 	.temp_aux	= WM97XX_AUX_ID2,
-	.charge_gpio	= -1,
 	.batt_mult	= 1000,
 	.batt_div	= 414,
 	.temp_mult	= 1,
diff --git a/arch/arm/mach-pxa/palmte2.c b/arch/arm/mach-pxa/palmte2.c
index e3bcf58b4e63..a2b10db4aacc 100644
--- a/arch/arm/mach-pxa/palmte2.c
+++ b/arch/arm/mach-pxa/palmte2.c
@@ -273,7 +273,6 @@ static struct platform_device power_supply = {
 static struct wm97xx_batt_pdata palmte2_batt_pdata = {
 	.batt_aux	= WM97XX_AUX_ID3,
 	.temp_aux	= WM97XX_AUX_ID2,
-	.charge_gpio	= -1,
 	.max_voltage	= PALMTE2_BAT_MAX_VOLTAGE,
 	.min_voltage	= PALMTE2_BAT_MIN_VOLTAGE,
 	.batt_mult	= 1000,
diff --git a/drivers/power/supply/wm97xx_battery.c b/drivers/power/supply/wm97xx_battery.c
index 58f01659daa5..a0e1eaa25d93 100644
--- a/drivers/power/supply/wm97xx_battery.c
+++ b/drivers/power/supply/wm97xx_battery.c
@@ -15,11 +15,12 @@
 #include <linux/wm97xx.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/irq.h>
 #include <linux/slab.h>
 
 static struct work_struct bat_work;
+static struct gpio_desc *charge_gpiod;
 static DEFINE_MUTEX(work_lock);
 static int bat_status = POWER_SUPPLY_STATUS_UNKNOWN;
 static enum power_supply_property *prop;
@@ -96,12 +97,11 @@ static void wm97xx_bat_external_power_changed(struct power_supply *bat_ps)
 static void wm97xx_bat_update(struct power_supply *bat_ps)
 {
 	int old_status = bat_status;
-	struct wm97xx_batt_pdata *pdata = power_supply_get_drvdata(bat_ps);
 
 	mutex_lock(&work_lock);
 
-	bat_status = (pdata->charge_gpio >= 0) ?
-			(gpio_get_value(pdata->charge_gpio) ?
+	bat_status = (charge_gpiod) ?
+			(gpiod_get_value(charge_gpiod) ?
 			POWER_SUPPLY_STATUS_DISCHARGING :
 			POWER_SUPPLY_STATUS_CHARGING) :
 			POWER_SUPPLY_STATUS_UNKNOWN;
@@ -171,18 +171,19 @@ static int wm97xx_bat_probe(struct platform_device *dev)
 	if (dev->id != -1)
 		return -EINVAL;
 
-	if (gpio_is_valid(pdata->charge_gpio)) {
-		ret = gpio_request(pdata->charge_gpio, "BATT CHRG");
-		if (ret)
-			goto err;
-		ret = gpio_direction_input(pdata->charge_gpio);
-		if (ret)
-			goto err2;
-		ret = request_irq(gpio_to_irq(pdata->charge_gpio),
+	charge_gpiod = devm_gpiod_get_optional(&dev->dev, NULL, GPIOD_IN);
+	if (IS_ERR(charge_gpiod))
+		return dev_err_probe(&dev->dev,
+				     PTR_ERR(charge_gpiod),
+				     "failed to get charge GPIO\n");
+	if (charge_gpiod) {
+		gpiod_set_consumer_name(charge_gpiod, "BATT CHRG");
+		ret = request_irq(gpiod_to_irq(charge_gpiod),
 				wm97xx_chrg_irq, 0,
 				"AC Detect", dev);
 		if (ret)
-			goto err2;
+			return dev_err_probe(&dev->dev, ret,
+					     "failed to request GPIO irq\n");
 		props++;	/* POWER_SUPPLY_PROP_STATUS */
 	}
 
@@ -204,7 +205,7 @@ static int wm97xx_bat_probe(struct platform_device *dev)
 	}
 
 	prop[i++] = POWER_SUPPLY_PROP_PRESENT;
-	if (pdata->charge_gpio >= 0)
+	if (charge_gpiod)
 		prop[i++] = POWER_SUPPLY_PROP_STATUS;
 	if (pdata->batt_tech >= 0)
 		prop[i++] = POWER_SUPPLY_PROP_TECHNOLOGY;
@@ -242,23 +243,15 @@ static int wm97xx_bat_probe(struct platform_device *dev)
 err4:
 	kfree(prop);
 err3:
-	if (gpio_is_valid(pdata->charge_gpio))
-		free_irq(gpio_to_irq(pdata->charge_gpio), dev);
-err2:
-	if (gpio_is_valid(pdata->charge_gpio))
-		gpio_free(pdata->charge_gpio);
-err:
+	if (charge_gpiod)
+		free_irq(gpiod_to_irq(charge_gpiod), dev);
 	return ret;
 }
 
 static int wm97xx_bat_remove(struct platform_device *dev)
 {
-	struct wm97xx_batt_pdata *pdata = dev->dev.platform_data;
-
-	if (pdata && gpio_is_valid(pdata->charge_gpio)) {
-		free_irq(gpio_to_irq(pdata->charge_gpio), dev);
-		gpio_free(pdata->charge_gpio);
-	}
+	if (charge_gpiod)
+		free_irq(gpiod_to_irq(charge_gpiod), dev);
 	cancel_work_sync(&bat_work);
 	power_supply_unregister(bat_psy);
 	kfree(prop);
diff --git a/include/linux/wm97xx.h b/include/linux/wm97xx.h
index 58e082dadc68..462854f4f286 100644
--- a/include/linux/wm97xx.h
+++ b/include/linux/wm97xx.h
@@ -294,7 +294,6 @@ struct wm97xx {
 struct wm97xx_batt_pdata {
 	int	batt_aux;
 	int	temp_aux;
-	int	charge_gpio;
 	int	min_voltage;
 	int	max_voltage;
 	int	batt_div;
-- 
cgit v1.2.3


From 838b00a2260ab6ec104a2a5696e619c19e8cd9be Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Mon, 11 Jan 2021 23:05:24 -0800
Subject: net/mlx5: Add HW definition of reg_c_preserve

Add capability bit to test whether reg_c value is preserved on
recirculation.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Maor Dickman <maord@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 442c0160caab..823411e288c0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1278,7 +1278,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_a0[0x3];
 	u8	   ece_support[0x1];
-	u8	   reserved_at_a4[0x7];
+	u8	   reserved_at_a4[0x5];
+	u8         reg_c_preserve[0x1];
+	u8         reserved_at_aa[0x1];
 	u8         log_max_srq[0x5];
 	u8         reserved_at_b0[0x1];
 	u8         uplink_follow[0x1];
-- 
cgit v1.2.3


From 9d93a9e8aab3f82b6742dd034a6a81d4025cd82e Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Tue, 12 Jan 2021 12:31:40 -0500
Subject: drivers/soc/litex: move generic accessors to litex.h

Move generic LiteX CSR (MMIO) register accessors to litex.h and
declare them as "static inline", in preparation for supporting
32-bit CSR subregisters and 64-bit CPUs.

NOTE: this is a non-functional change.

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 drivers/soc/litex/litex_soc_ctrl.c | 73 -------------------------------------
 include/linux/litex.h              | 74 +++++++++++++++++++++++++++++++++++---
 2 files changed, 69 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/litex/litex_soc_ctrl.c b/drivers/soc/litex/litex_soc_ctrl.c
index 1217cafdfd4d..65977526d68e 100644
--- a/drivers/soc/litex/litex_soc_ctrl.c
+++ b/drivers/soc/litex/litex_soc_ctrl.c
@@ -16,79 +16,6 @@
 #include <linux/errno.h>
 #include <linux/io.h>
 
-/*
- * LiteX SoC Generator, depending on the configuration, can split a single
- * logical CSR (Control&Status Register) into a series of consecutive physical
- * registers.
- *
- * For example, in the configuration with 8-bit CSR Bus, 32-bit aligned (the
- * default one for 32-bit CPUs) a 32-bit logical CSR will be generated as four
- * 32-bit physical registers, each one containing one byte of meaningful data.
- *
- * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus
- *
- * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic
- * of writing to/reading from the LiteX CSR in a single place that can be
- * then reused by all LiteX drivers.
- */
-
-/**
- * litex_set_reg() - Writes the value to the LiteX CSR (Control&Status Register)
- * @reg: Address of the CSR
- * @reg_size: The width of the CSR expressed in the number of bytes
- * @val: Value to be written to the CSR
- *
- * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
- * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
- * each one containing one byte of meaningful data.
- *
- * This function splits a single possibly multi-byte write into a series of
- * single-byte writes with a proper offset.
- */
-void litex_set_reg(void __iomem *reg, unsigned long reg_size,
-		    unsigned long val)
-{
-	unsigned long shifted_data, shift, i;
-
-	for (i = 0; i < reg_size; ++i) {
-		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
-		shifted_data = val >> shift;
-
-		WRITE_LITEX_SUBREGISTER(shifted_data, reg, i);
-	}
-}
-EXPORT_SYMBOL_GPL(litex_set_reg);
-
-/**
- * litex_get_reg() - Reads the value of the LiteX CSR (Control&Status Register)
- * @reg: Address of the CSR
- * @reg_size: The width of the CSR expressed in the number of bytes
- *
- * Return: Value read from the CSR
- *
- * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
- * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
- * each one containing one byte of meaningful data.
- *
- * This function generates a series of single-byte reads with a proper offset
- * and joins their results into a single multi-byte value.
- */
-unsigned long litex_get_reg(void __iomem *reg, unsigned long reg_size)
-{
-	unsigned long shifted_data, shift, i;
-	unsigned long result = 0;
-
-	for (i = 0; i < reg_size; ++i) {
-		shifted_data = READ_LITEX_SUBREGISTER(reg, i);
-
-		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
-		result |= (shifted_data << shift);
-	}
-
-	return result;
-}
-EXPORT_SYMBOL_GPL(litex_get_reg);
-
 #define SCRATCH_REG_OFF         0x04
 #define SCRATCH_REG_VALUE       0x12345678
 #define SCRATCH_TEST_VALUE      0xdeadbeef
diff --git a/include/linux/litex.h b/include/linux/litex.h
index 40f5be503593..67c1a18a7425 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -3,9 +3,6 @@
  * Common LiteX header providing
  * helper functions for accessing CSRs.
  *
- * Implementation of the functions is provided by
- * the LiteX SoC Controller driver.
- *
  * Copyright (C) 2019-2020 Antmicro <www.antmicro.com>
  */
 
@@ -33,9 +30,76 @@
 #define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \
 	le32_to_cpu((__le32 __force)readl(base_offset + (LITEX_REG_SIZE * subreg_id)))
 
-void litex_set_reg(void __iomem *reg, unsigned long reg_sz, unsigned long val);
+/*
+ * LiteX SoC Generator, depending on the configuration, can split a single
+ * logical CSR (Control&Status Register) into a series of consecutive physical
+ * registers.
+ *
+ * For example, in the configuration with 8-bit CSR Bus, 32-bit aligned (the
+ * default one for 32-bit CPUs) a 32-bit logical CSR will be generated as four
+ * 32-bit physical registers, each one containing one byte of meaningful data.
+ *
+ * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus
+ *
+ * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic
+ * of writing to/reading from the LiteX CSR in a single place that can be
+ * then reused by all LiteX drivers.
+ */
+
+/**
+ * litex_set_reg() - Writes the value to the LiteX CSR (Control&Status Register)
+ * @reg: Address of the CSR
+ * @reg_size: The width of the CSR expressed in the number of bytes
+ * @val: Value to be written to the CSR
+ *
+ * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
+ * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
+ * each one containing one byte of meaningful data.
+ *
+ * This function splits a single possibly multi-byte write into a series of
+ * single-byte writes with a proper offset.
+ */
+static inline void litex_set_reg(void __iomem *reg, ulong reg_size, ulong val)
+{
+	ulong shifted_data, shift, i;
+
+	for (i = 0; i < reg_size; ++i) {
+		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
+		shifted_data = val >> shift;
+
+		WRITE_LITEX_SUBREGISTER(shifted_data, reg, i);
+	}
+}
+
+/**
+ * litex_get_reg() - Reads the value of the LiteX CSR (Control&Status Register)
+ * @reg: Address of the CSR
+ * @reg_size: The width of the CSR expressed in the number of bytes
+ *
+ * Return: Value read from the CSR
+ *
+ * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
+ * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
+ * each one containing one byte of meaningful data.
+ *
+ * This function generates a series of single-byte reads with a proper offset
+ * and joins their results into a single multi-byte value.
+ */
+static inline ulong litex_get_reg(void __iomem *reg, ulong reg_size)
+{
+	ulong shifted_data, shift, i;
+	ulong result = 0;
+
+	for (i = 0; i < reg_size; ++i) {
+		shifted_data = READ_LITEX_SUBREGISTER(reg, i);
+
+		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
+		result |= (shifted_data << shift);
+	}
+
+	return result;
+}
 
-unsigned long litex_get_reg(void __iomem *reg, unsigned long reg_sz);
 
 static inline void litex_write8(void __iomem *reg, u8 val)
 {
-- 
cgit v1.2.3


From b5d3061ea2e691ab1fa6465fce3c59d9d10357de Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Tue, 12 Jan 2021 12:31:41 -0500
Subject: drivers/soc/litex: separate MMIO from subregister offset calculation

Separate MMIO (read/write) access into _[read|write]_litex_subregister()
static inline functions, leaving existing "READ|WRITE" macros to handle
calculation of the subregister offset only.

NOTE: this is a non-functional change.

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 include/linux/litex.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/litex.h b/include/linux/litex.h
index 67c1a18a7425..918bab45243c 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -24,11 +24,23 @@
 #define LITEX_SUBREG_SIZE	0x1
 #define LITEX_SUBREG_SIZE_BIT	 (LITEX_SUBREG_SIZE * 8)
 
+static inline void _write_litex_subregister(u32 val, void __iomem *addr)
+{
+	writel((u32 __force)cpu_to_le32(val), addr);
+}
+
+static inline u32 _read_litex_subregister(void __iomem *addr)
+{
+	return le32_to_cpu((__le32 __force)readl(addr));
+}
+
 #define WRITE_LITEX_SUBREGISTER(val, base_offset, subreg_id) \
-	writel((u32 __force)cpu_to_le32(val), base_offset + (LITEX_REG_SIZE * subreg_id))
+	_write_litex_subregister(val, (base_offset) + \
+					LITEX_REG_SIZE * (subreg_id))
 
 #define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \
-	le32_to_cpu((__le32 __force)readl(base_offset + (LITEX_REG_SIZE * subreg_id)))
+	_read_litex_subregister((base_offset) + \
+					LITEX_REG_SIZE * (subreg_id))
 
 /*
  * LiteX SoC Generator, depending on the configuration, can split a single
-- 
cgit v1.2.3


From ffa4ebc48971abffed722b75887ac1d8c9256b41 Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Tue, 12 Jan 2021 12:31:42 -0500
Subject: drivers/soc/litex: s/LITEX_REG_SIZE/LITEX_SUBREG_ALIGN/g

The constant LITEX_REG_SIZE is renamed to the more descriptive
LITEX_SUBREG_ALIGN (LiteX CSR subregisters are located at 32-bit
aligned MMIO addresses).

NOTE: this is a non-functional change.

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 include/linux/litex.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/litex.h b/include/linux/litex.h
index 918bab45243c..c63a7e1a337c 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -20,10 +20,12 @@
  * Supporting other configurations will require extending the logic in this
  * header and in the LiteX SoC controller driver.
  */
-#define LITEX_REG_SIZE	  0x4
 #define LITEX_SUBREG_SIZE	0x1
 #define LITEX_SUBREG_SIZE_BIT	 (LITEX_SUBREG_SIZE * 8)
 
+/* LiteX subregisters of any width are always aligned on a 4-byte boundary */
+#define LITEX_SUBREG_ALIGN	  0x4
+
 static inline void _write_litex_subregister(u32 val, void __iomem *addr)
 {
 	writel((u32 __force)cpu_to_le32(val), addr);
@@ -36,11 +38,11 @@ static inline u32 _read_litex_subregister(void __iomem *addr)
 
 #define WRITE_LITEX_SUBREGISTER(val, base_offset, subreg_id) \
 	_write_litex_subregister(val, (base_offset) + \
-					LITEX_REG_SIZE * (subreg_id))
+					LITEX_SUBREG_ALIGN * (subreg_id))
 
 #define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \
 	_read_litex_subregister((base_offset) + \
-					LITEX_REG_SIZE * (subreg_id))
+					LITEX_SUBREG_ALIGN * (subreg_id))
 
 /*
  * LiteX SoC Generator, depending on the configuration, can split a single
-- 
cgit v1.2.3


From 51f109228308a87c7f2583360e54acfc567203da Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Tue, 12 Jan 2021 12:31:43 -0500
Subject: drivers/soc/litex: support 32-bit subregisters, 64-bit CPUs

Upstream LiteX now defaults to using 32-bit CSR subregisters
(see https://github.com/enjoy-digital/litex/commit/a2b71fde).

This patch expands on commit 22447a99c97e ("drivers/soc/litex: add
LiteX SoC Controller driver"), adding support for handling both 8-
and 32-bit LiteX CSR (MMIO) subregisters, as determined by the
LITEX_SUBREG_SIZE Kconfig option.

NOTE that while LITEX_SUBREG_SIZE could theoretically be a device
tree property, defining it as a compile-time constant allows for
much better optimization of the resulting code. This is further
supported by the low expected usefulness of deploying the same
kernel across LiteX SoCs built with different CSR-Bus data widths.

Finally, the litex_[read|write][8|16|32|64]() accessors are
redefined in terms of litex_[get|set]_reg(), which, after compiler
optimization, will result in code as efficient as hardcoded shifts,
but with the added benefit of automatically matching the appropriate
LITEX_SUBREG_SIZE.

NOTE that litex_[get|set]_reg() nominally operate on 64-bit data,
but that will also be optimized by the compiler in situations where
narrower data is used from a call site.

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 drivers/soc/litex/Kconfig          |  12 ++++
 drivers/soc/litex/litex_soc_ctrl.c |   3 +-
 include/linux/litex.h              | 137 +++++++++++++++----------------------
 3 files changed, 68 insertions(+), 84 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/litex/Kconfig b/drivers/soc/litex/Kconfig
index 7c6b009b6f6c..973f8d2fe1a7 100644
--- a/drivers/soc/litex/Kconfig
+++ b/drivers/soc/litex/Kconfig
@@ -16,4 +16,16 @@ config LITEX_SOC_CONTROLLER
 	  All drivers that use functions from litex.h must depend on
 	  LITEX.
 
+config LITEX_SUBREG_SIZE
+	int "Size of a LiteX CSR subregister, in bytes"
+	depends on LITEX
+	range 1 4
+	default 4
+	help
+	LiteX MMIO registers (referred to as Configuration and Status
+	registers, or CSRs) are spread across adjacent 8- or 32-bit
+	subregisters, located at 32-bit aligned MMIO addresses. Use
+	this to select the appropriate size (1 or 4 bytes) matching
+	your particular LiteX build.
+
 endmenu
diff --git a/drivers/soc/litex/litex_soc_ctrl.c b/drivers/soc/litex/litex_soc_ctrl.c
index 65977526d68e..da17ba56b795 100644
--- a/drivers/soc/litex/litex_soc_ctrl.c
+++ b/drivers/soc/litex/litex_soc_ctrl.c
@@ -58,7 +58,8 @@ static int litex_check_csr_access(void __iomem *reg_addr)
 	/* restore original value of the SCRATCH register */
 	litex_write32(reg_addr + SCRATCH_REG_OFF, SCRATCH_REG_VALUE);
 
-	pr_info("LiteX SoC Controller driver initialized");
+	pr_info("LiteX SoC Controller driver initialized: subreg:%d, align:%d",
+		LITEX_SUBREG_SIZE, LITEX_SUBREG_ALIGN);
 
 	return 0;
 }
diff --git a/include/linux/litex.h b/include/linux/litex.h
index c63a7e1a337c..3456d527f644 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -10,17 +10,14 @@
 #define _LINUX_LITEX_H
 
 #include <linux/io.h>
-#include <linux/types.h>
-#include <linux/compiler_types.h>
 
-/*
- * The parameters below are true for LiteX SoCs configured for 8-bit CSR Bus,
- * 32-bit aligned.
- *
- * Supporting other configurations will require extending the logic in this
- * header and in the LiteX SoC controller driver.
- */
-#define LITEX_SUBREG_SIZE	0x1
+/* LiteX SoCs support 8- or 32-bit CSR Bus data width (i.e., subreg. size) */
+#if defined(CONFIG_LITEX_SUBREG_SIZE) && \
+	(CONFIG_LITEX_SUBREG_SIZE == 1 || CONFIG_LITEX_SUBREG_SIZE == 4)
+#define LITEX_SUBREG_SIZE      CONFIG_LITEX_SUBREG_SIZE
+#else
+#error LiteX subregister size (LITEX_SUBREG_SIZE) must be 4 or 1!
+#endif
 #define LITEX_SUBREG_SIZE_BIT	 (LITEX_SUBREG_SIZE * 8)
 
 /* LiteX subregisters of any width are always aligned on a 4-byte boundary */
@@ -36,25 +33,32 @@ static inline u32 _read_litex_subregister(void __iomem *addr)
 	return le32_to_cpu((__le32 __force)readl(addr));
 }
 
-#define WRITE_LITEX_SUBREGISTER(val, base_offset, subreg_id) \
-	_write_litex_subregister(val, (base_offset) + \
-					LITEX_SUBREG_ALIGN * (subreg_id))
-
-#define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \
-	_read_litex_subregister((base_offset) + \
-					LITEX_SUBREG_ALIGN * (subreg_id))
-
 /*
  * LiteX SoC Generator, depending on the configuration, can split a single
  * logical CSR (Control&Status Register) into a series of consecutive physical
  * registers.
  *
- * For example, in the configuration with 8-bit CSR Bus, 32-bit aligned (the
- * default one for 32-bit CPUs) a 32-bit logical CSR will be generated as four
- * 32-bit physical registers, each one containing one byte of meaningful data.
+ * For example, in the configuration with 8-bit CSR Bus, a 32-bit aligned,
+ * 32-bit wide logical CSR will be laid out as four 32-bit physical
+ * subregisters, each one containing one byte of meaningful data.
  *
  * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus
- *
+ */
+
+/* number of LiteX subregisters needed to store a register of given reg_size */
+#define _litex_num_subregs(reg_size) \
+	(((reg_size) - 1) / LITEX_SUBREG_SIZE + 1)
+
+/*
+ * since the number of 4-byte aligned subregisters required to store a single
+ * LiteX CSR (MMIO) register varies with LITEX_SUBREG_SIZE, the offset of the
+ * next adjacent LiteX CSR register w.r.t. the offset of the current one also
+ * depends on how many subregisters the latter is spread across
+ */
+#define _next_reg_off(off, size) \
+	((off) + _litex_num_subregs(size) * LITEX_SUBREG_ALIGN)
+
+/*
  * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic
  * of writing to/reading from the LiteX CSR in a single place that can be
  * then reused by all LiteX drivers.
@@ -66,22 +70,17 @@ static inline u32 _read_litex_subregister(void __iomem *addr)
  * @reg_size: The width of the CSR expressed in the number of bytes
  * @val: Value to be written to the CSR
  *
- * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
- * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
- * each one containing one byte of meaningful data.
- *
- * This function splits a single possibly multi-byte write into a series of
- * single-byte writes with a proper offset.
+ * This function splits a single (possibly multi-byte) LiteX CSR write into
+ * a series of subregister writes with a proper offset.
  */
-static inline void litex_set_reg(void __iomem *reg, ulong reg_size, ulong val)
+static inline void litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
 {
-	ulong shifted_data, shift, i;
-
-	for (i = 0; i < reg_size; ++i) {
-		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
-		shifted_data = val >> shift;
+	u8 shift = _litex_num_subregs(reg_size) * LITEX_SUBREG_SIZE_BIT;
 
-		WRITE_LITEX_SUBREGISTER(shifted_data, reg, i);
+	while (shift > 0) {
+		shift -= LITEX_SUBREG_SIZE_BIT;
+		_write_litex_subregister(val >> shift, reg);
+		reg += LITEX_SUBREG_ALIGN;
 	}
 }
 
@@ -92,89 +91,61 @@ static inline void litex_set_reg(void __iomem *reg, ulong reg_size, ulong val)
  *
  * Return: Value read from the CSR
  *
- * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned),
- * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers,
- * each one containing one byte of meaningful data.
- *
- * This function generates a series of single-byte reads with a proper offset
- * and joins their results into a single multi-byte value.
+ * This function generates a series of subregister reads with a proper offset
+ * and joins their results into a single (possibly multi-byte) LiteX CSR value.
  */
-static inline ulong litex_get_reg(void __iomem *reg, ulong reg_size)
+static inline u64 litex_get_reg(void __iomem *reg, size_t reg_size)
 {
-	ulong shifted_data, shift, i;
-	ulong result = 0;
-
-	for (i = 0; i < reg_size; ++i) {
-		shifted_data = READ_LITEX_SUBREGISTER(reg, i);
-
-		shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT);
-		result |= (shifted_data << shift);
+	u64 r;
+	u8 i;
+
+	r = _read_litex_subregister(reg);
+	for (i = 1; i < _litex_num_subregs(reg_size); i++) {
+		r <<= LITEX_SUBREG_SIZE_BIT;
+		reg += LITEX_SUBREG_ALIGN;
+		r |= _read_litex_subregister(reg);
 	}
-
-	return result;
+	return r;
 }
 
-
 static inline void litex_write8(void __iomem *reg, u8 val)
 {
-	WRITE_LITEX_SUBREGISTER(val, reg, 0);
+	litex_set_reg(reg, sizeof(u8), val);
 }
 
 static inline void litex_write16(void __iomem *reg, u16 val)
 {
-	WRITE_LITEX_SUBREGISTER(val >> 8, reg, 0);
-	WRITE_LITEX_SUBREGISTER(val, reg, 1);
+	litex_set_reg(reg, sizeof(u16), val);
 }
 
 static inline void litex_write32(void __iomem *reg, u32 val)
 {
-	WRITE_LITEX_SUBREGISTER(val >> 24, reg, 0);
-	WRITE_LITEX_SUBREGISTER(val >> 16, reg, 1);
-	WRITE_LITEX_SUBREGISTER(val >> 8, reg, 2);
-	WRITE_LITEX_SUBREGISTER(val, reg, 3);
+	litex_set_reg(reg, sizeof(u32), val);
 }
 
 static inline void litex_write64(void __iomem *reg, u64 val)
 {
-	WRITE_LITEX_SUBREGISTER(val >> 56, reg, 0);
-	WRITE_LITEX_SUBREGISTER(val >> 48, reg, 1);
-	WRITE_LITEX_SUBREGISTER(val >> 40, reg, 2);
-	WRITE_LITEX_SUBREGISTER(val >> 32, reg, 3);
-	WRITE_LITEX_SUBREGISTER(val >> 24, reg, 4);
-	WRITE_LITEX_SUBREGISTER(val >> 16, reg, 5);
-	WRITE_LITEX_SUBREGISTER(val >> 8, reg, 6);
-	WRITE_LITEX_SUBREGISTER(val, reg, 7);
+	litex_set_reg(reg, sizeof(u64), val);
 }
 
 static inline u8 litex_read8(void __iomem *reg)
 {
-	return READ_LITEX_SUBREGISTER(reg, 0);
+	return litex_get_reg(reg, sizeof(u8));
 }
 
 static inline u16 litex_read16(void __iomem *reg)
 {
-	return (READ_LITEX_SUBREGISTER(reg, 0) << 8)
-		| (READ_LITEX_SUBREGISTER(reg, 1));
+	return litex_get_reg(reg, sizeof(u16));
 }
 
 static inline u32 litex_read32(void __iomem *reg)
 {
-	return (READ_LITEX_SUBREGISTER(reg, 0) << 24)
-		| (READ_LITEX_SUBREGISTER(reg, 1) << 16)
-		| (READ_LITEX_SUBREGISTER(reg, 2) << 8)
-		| (READ_LITEX_SUBREGISTER(reg, 3));
+	return litex_get_reg(reg, sizeof(u32));
 }
 
 static inline u64 litex_read64(void __iomem *reg)
 {
-	return ((u64)READ_LITEX_SUBREGISTER(reg, 0) << 56)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 1) << 48)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 2) << 40)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 3) << 32)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 4) << 24)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 5) << 16)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 6) << 8)
-		| ((u64)READ_LITEX_SUBREGISTER(reg, 7));
+	return litex_get_reg(reg, sizeof(u64));
 }
 
 #endif /* _LINUX_LITEX_H */
-- 
cgit v1.2.3


From 4f70d150294b3ddfbe4be7130ca53898cd5b91be Mon Sep 17 00:00:00 2001
From: Gabriel Somlo <gsomlo@gmail.com>
Date: Tue, 12 Jan 2021 12:31:44 -0500
Subject: drivers/soc/litex: make 'litex_[set|get]_reg()' methods private

The 'litex_[set|get]_reg()' methods use the 'reg_size' parameter to
specify the width of the LiteX CSR (MMIO) register being accessed.

Since 'u64' is the widest data being supported, the value of 'reg_size'
MUST be between 1 and sizeof(u64), which SHOULD be checked at runtime
if these methods are publicly available for use by other LiteX device
drivers.

At the same time, none of the existing (or foreseeable) LiteX device
drivers have a need to access registers whose size is unknown during
compilation. As such, all LiteX device drivers should use fixed-width
accessor methods such as 'litex_[write|read][8|16|32|64]()'.

This patch renames 'litex_[set|get]_reg()' to '_litex_[set|get]_reg()',
indicating that they should NOT be directly called from outside of
the 'include/linux/litex.h' header file.

Signed-off-by: Gabriel Somlo <gsomlo@gmail.com>
Signed-off-by: Stafford Horne <shorne@gmail.com>
---
 drivers/soc/litex/Kconfig |  2 +-
 include/linux/litex.h     | 35 ++++++++++++++++++++---------------
 2 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/litex/Kconfig b/drivers/soc/litex/Kconfig
index 973f8d2fe1a7..b9b3d51ea7df 100644
--- a/drivers/soc/litex/Kconfig
+++ b/drivers/soc/litex/Kconfig
@@ -11,7 +11,7 @@ config LITEX_SOC_CONTROLLER
 	select LITEX
 	help
 	  This option enables the SoC Controller Driver which verifies
-	  LiteX CSR access and provides common litex_get_reg/litex_set_reg
+	  LiteX CSR access and provides common litex_[read|write]*
 	  accessors.
 	  All drivers that use functions from litex.h must depend on
 	  LITEX.
diff --git a/include/linux/litex.h b/include/linux/litex.h
index 3456d527f644..5ea9ccf5cce4 100644
--- a/include/linux/litex.h
+++ b/include/linux/litex.h
@@ -59,21 +59,25 @@ static inline u32 _read_litex_subregister(void __iomem *addr)
 	((off) + _litex_num_subregs(size) * LITEX_SUBREG_ALIGN)
 
 /*
- * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic
- * of writing to/reading from the LiteX CSR in a single place that can be
- * then reused by all LiteX drivers.
+ * The purpose of `_litex_[set|get]_reg()` is to implement the logic of
+ * writing to/reading from the LiteX CSR in a single place that can be then
+ * reused by all LiteX drivers via the `litex_[write|read][8|16|32|64]()`
+ * accessors for the appropriate data width.
+ * NOTE: direct use of `_litex_[set|get]_reg()` by LiteX drivers is strongly
+ * discouraged, as they perform no error checking on the requested data width!
  */
 
 /**
- * litex_set_reg() - Writes the value to the LiteX CSR (Control&Status Register)
+ * _litex_set_reg() - Writes a value to the LiteX CSR (Control&Status Register)
  * @reg: Address of the CSR
  * @reg_size: The width of the CSR expressed in the number of bytes
  * @val: Value to be written to the CSR
  *
  * This function splits a single (possibly multi-byte) LiteX CSR write into
  * a series of subregister writes with a proper offset.
+ * NOTE: caller is responsible for ensuring (0 < reg_size <= sizeof(u64)).
  */
-static inline void litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
+static inline void _litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
 {
 	u8 shift = _litex_num_subregs(reg_size) * LITEX_SUBREG_SIZE_BIT;
 
@@ -85,7 +89,7 @@ static inline void litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
 }
 
 /**
- * litex_get_reg() - Reads the value of the LiteX CSR (Control&Status Register)
+ * _litex_get_reg() - Reads a value of the LiteX CSR (Control&Status Register)
  * @reg: Address of the CSR
  * @reg_size: The width of the CSR expressed in the number of bytes
  *
@@ -93,8 +97,9 @@ static inline void litex_set_reg(void __iomem *reg, size_t reg_size, u64 val)
  *
  * This function generates a series of subregister reads with a proper offset
  * and joins their results into a single (possibly multi-byte) LiteX CSR value.
+ * NOTE: caller is responsible for ensuring (0 < reg_size <= sizeof(u64)).
  */
-static inline u64 litex_get_reg(void __iomem *reg, size_t reg_size)
+static inline u64 _litex_get_reg(void __iomem *reg, size_t reg_size)
 {
 	u64 r;
 	u8 i;
@@ -110,42 +115,42 @@ static inline u64 litex_get_reg(void __iomem *reg, size_t reg_size)
 
 static inline void litex_write8(void __iomem *reg, u8 val)
 {
-	litex_set_reg(reg, sizeof(u8), val);
+	_litex_set_reg(reg, sizeof(u8), val);
 }
 
 static inline void litex_write16(void __iomem *reg, u16 val)
 {
-	litex_set_reg(reg, sizeof(u16), val);
+	_litex_set_reg(reg, sizeof(u16), val);
 }
 
 static inline void litex_write32(void __iomem *reg, u32 val)
 {
-	litex_set_reg(reg, sizeof(u32), val);
+	_litex_set_reg(reg, sizeof(u32), val);
 }
 
 static inline void litex_write64(void __iomem *reg, u64 val)
 {
-	litex_set_reg(reg, sizeof(u64), val);
+	_litex_set_reg(reg, sizeof(u64), val);
 }
 
 static inline u8 litex_read8(void __iomem *reg)
 {
-	return litex_get_reg(reg, sizeof(u8));
+	return _litex_get_reg(reg, sizeof(u8));
 }
 
 static inline u16 litex_read16(void __iomem *reg)
 {
-	return litex_get_reg(reg, sizeof(u16));
+	return _litex_get_reg(reg, sizeof(u16));
 }
 
 static inline u32 litex_read32(void __iomem *reg)
 {
-	return litex_get_reg(reg, sizeof(u32));
+	return _litex_get_reg(reg, sizeof(u32));
 }
 
 static inline u64 litex_read64(void __iomem *reg)
 {
-	return litex_get_reg(reg, sizeof(u64));
+	return _litex_get_reg(reg, sizeof(u64));
 }
 
 #endif /* _LINUX_LITEX_H */
-- 
cgit v1.2.3


From 99b7beb0431a4b9728023e9169ed15af678d2c7f Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 11 Jan 2021 15:19:24 +0100
Subject: can: length: canfd_sanitize_len(): add function to sanitize CAN-FD
 data length

The data field in CAN-FD frames have specifig frame length (0, 1, 2, 3, 4, 5,
6, 7, 8, 12, 16, 20, 24, 32, 48, 64). This function "rounds" up a given length
to the next valid CAN-FD frame length.

Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/r/20210111141930.693847-10-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/length.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/can/length.h b/include/linux/can/length.h
index 156b9d17969a..b2313b2a0b02 100644
--- a/include/linux/can/length.h
+++ b/include/linux/can/length.h
@@ -45,4 +45,10 @@ u8 can_fd_dlc2len(u8 dlc);
 /* map the sanitized data length to an appropriate data length code */
 u8 can_fd_len2dlc(u8 len);
 
+/* map the data length to an appropriate data link layer length */
+static inline u8 canfd_sanitize_len(u8 len)
+{
+	return can_fd_dlc2len(can_fd_len2dlc(len));
+}
+
 #endif /* !_CAN_LENGTH_H */
-- 
cgit v1.2.3


From 85d99c3e2a13d542445342a1383a686dadeaac3d Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Mon, 11 Jan 2021 15:19:25 +0100
Subject: can: length: can_skb_get_frame_len(): introduce function to get data
 length of frame in data link layer

This patch adds the function can_skb_get_frame_len() which returns the length
of a CAN frame on the data link layer, including Start-of-frame, Identifier,
various other bits, the actual data, the CRC, the End-of-frame, the Inter frame
spacing.

Co-developed-by: Arunachalam Santhanam <arunachalam.santhanam@in.bosch.com>
Signed-off-by: Arunachalam Santhanam <arunachalam.santhanam@in.bosch.com>
Co-developed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Acked-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Co-developed-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://lore.kernel.org/r/20210111141930.693847-11-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/length.c |  50 ++++++++++++++++++
 include/linux/can/length.h   | 120 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 170 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/length.c b/drivers/net/can/dev/length.c
index 64673a8d1234..d35c4e82314d 100644
--- a/drivers/net/can/dev/length.c
+++ b/drivers/net/can/dev/length.c
@@ -38,3 +38,53 @@ u8 can_fd_len2dlc(u8 len)
 	return len2dlc[len];
 }
 EXPORT_SYMBOL_GPL(can_fd_len2dlc);
+
+/**
+ * can_skb_get_frame_len() - Calculate the CAN Frame length in bytes
+ * 	of a given skb.
+ * @skb: socket buffer of a CAN message.
+ *
+ * Do a rough calculation: bit stuffing is ignored and length in bits
+ * is rounded up to a length in bytes.
+ *
+ * Rationale: this function is to be used for the BQL functions
+ * (netdev_sent_queue() and netdev_completed_queue()) which expect a
+ * value in bytes. Just using skb->len is insufficient because it will
+ * return the constant value of CAN(FD)_MTU. Doing the bit stuffing
+ * calculation would be too expensive in term of computing resources
+ * for no noticeable gain.
+ *
+ * Remarks: The payload of CAN FD frames with BRS flag are sent at a
+ * different bitrate. Currently, the can-utils canbusload tool does
+ * not support CAN-FD yet and so we could not run any benchmark to
+ * measure the impact. There might be possible improvement here.
+ *
+ * Return: length in bytes.
+ */
+unsigned int can_skb_get_frame_len(const struct sk_buff *skb)
+{
+	const struct canfd_frame *cf = (const struct canfd_frame *)skb->data;
+	u8 len;
+
+	if (can_is_canfd_skb(skb))
+		len = canfd_sanitize_len(cf->len);
+	else if (cf->can_id & CAN_RTR_FLAG)
+		len = 0;
+	else
+		len = cf->len;
+
+	if (can_is_canfd_skb(skb)) {
+		if (cf->can_id & CAN_EFF_FLAG)
+			len += CANFD_FRAME_OVERHEAD_EFF;
+		else
+			len += CANFD_FRAME_OVERHEAD_SFF;
+	} else {
+		if (cf->can_id & CAN_EFF_FLAG)
+			len += CAN_FRAME_OVERHEAD_EFF;
+		else
+			len += CAN_FRAME_OVERHEAD_SFF;
+	}
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(can_skb_get_frame_len);
diff --git a/include/linux/can/length.h b/include/linux/can/length.h
index b2313b2a0b02..6995092b774e 100644
--- a/include/linux/can/length.h
+++ b/include/linux/can/length.h
@@ -1,10 +1,127 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2020 Oliver Hartkopp <socketcan@hartkopp.net>
+ * Copyright (C) 2020 Marc Kleine-Budde <kernel@pengutronix.de>
  */
 
 #ifndef _CAN_LENGTH_H
 #define _CAN_LENGTH_H
 
+/*
+ * Size of a Classical CAN Standard Frame
+ *
+ * Name of Field			Bits
+ * ---------------------------------------------------------
+ * Start-of-frame			1
+ * Identifier				11
+ * Remote transmission request (RTR)	1
+ * Identifier extension bit (IDE)	1
+ * Reserved bit (r0)			1
+ * Data length code (DLC)		4
+ * Data field				0...64
+ * CRC					15
+ * CRC delimiter			1
+ * ACK slot				1
+ * ACK delimiter			1
+ * End-of-frame (EOF)			7
+ * Inter frame spacing			3
+ *
+ * rounded up and ignoring bitstuffing
+ */
+#define CAN_FRAME_OVERHEAD_SFF DIV_ROUND_UP(47, 8)
+
+/*
+ * Size of a Classical CAN Extended Frame
+ *
+ * Name of Field			Bits
+ * ---------------------------------------------------------
+ * Start-of-frame			1
+ * Identifier A				11
+ * Substitute remote request (SRR)	1
+ * Identifier extension bit (IDE)	1
+ * Identifier B				18
+ * Remote transmission request (RTR)	1
+ * Reserved bits (r1, r0)		2
+ * Data length code (DLC)		4
+ * Data field				0...64
+ * CRC					15
+ * CRC delimiter			1
+ * ACK slot				1
+ * ACK delimiter			1
+ * End-of-frame (EOF)			7
+ * Inter frame spacing			3
+ *
+ * rounded up and ignoring bitstuffing
+ */
+#define CAN_FRAME_OVERHEAD_EFF DIV_ROUND_UP(67, 8)
+
+/*
+ * Size of a CAN-FD Standard Frame
+ *
+ * Name of Field			Bits
+ * ---------------------------------------------------------
+ * Start-of-frame			1
+ * Identifier				11
+ * Reserved bit (r1)			1
+ * Identifier extension bit (IDE)	1
+ * Flexible data rate format (FDF)	1
+ * Reserved bit (r0)			1
+ * Bit Rate Switch (BRS)		1
+ * Error Status Indicator (ESI)		1
+ * Data length code (DLC)		4
+ * Data field				0...512
+ * Stuff Bit Count (SBC)		0...16: 4 20...64:5
+ * CRC					0...16: 17 20...64:21
+ * CRC delimiter (CD)			1
+ * ACK slot (AS)			1
+ * ACK delimiter (AD)			1
+ * End-of-frame (EOF)			7
+ * Inter frame spacing			3
+ *
+ * assuming CRC21, rounded up and ignoring bitstuffing
+ */
+#define CANFD_FRAME_OVERHEAD_SFF DIV_ROUND_UP(61, 8)
+
+/*
+ * Size of a CAN-FD Extended Frame
+ *
+ * Name of Field			Bits
+ * ---------------------------------------------------------
+ * Start-of-frame			1
+ * Identifier A				11
+ * Substitute remote request (SRR)	1
+ * Identifier extension bit (IDE)	1
+ * Identifier B				18
+ * Reserved bit (r1)			1
+ * Flexible data rate format (FDF)	1
+ * Reserved bit (r0)			1
+ * Bit Rate Switch (BRS)		1
+ * Error Status Indicator (ESI)		1
+ * Data length code (DLC)		4
+ * Data field				0...512
+ * Stuff Bit Count (SBC)		0...16: 4 20...64:5
+ * CRC					0...16: 17 20...64:21
+ * CRC delimiter (CD)			1
+ * ACK slot (AS)			1
+ * ACK delimiter (AD)			1
+ * End-of-frame (EOF)			7
+ * Inter frame spacing			3
+ *
+ * assuming CRC21, rounded up and ignoring bitstuffing
+ */
+#define CANFD_FRAME_OVERHEAD_EFF DIV_ROUND_UP(80, 8)
+
+/*
+ * Maximum size of a Classical CAN frame
+ * (rounded up and ignoring bitstuffing)
+ */
+#define CAN_FRAME_LEN_MAX (CAN_FRAME_OVERHEAD_EFF + CAN_MAX_DLEN)
+
+/*
+ * Maximum size of a CAN-FD frame
+ * (rounded up and ignoring bitstuffing)
+ */
+#define CANFD_FRAME_LEN_MAX (CANFD_FRAME_OVERHEAD_EFF + CANFD_MAX_DLEN)
+
 /*
  * can_cc_dlc2len(value) - convert a given data length code (dlc) of a
  * Classical CAN frame into a valid data length of max. 8 bytes.
@@ -45,6 +162,9 @@ u8 can_fd_dlc2len(u8 dlc);
 /* map the sanitized data length to an appropriate data length code */
 u8 can_fd_len2dlc(u8 len);
 
+/* calculate the CAN Frame length in bytes of a given skb */
+unsigned int can_skb_get_frame_len(const struct sk_buff *skb);
+
 /* map the data length to an appropriate data link layer length */
 static inline u8 canfd_sanitize_len(u8 len)
 {
-- 
cgit v1.2.3


From f0ef72febc9a6a569d92cdf6c7996015dfa8e8bb Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 11 Jan 2021 15:19:26 +0100
Subject: can: dev: extend struct can_skb_priv to hold CAN frame length

In order to implement byte queue limits (bql) in CAN drivers, the length of the
CAN frame needs to be passed into the networking stack after queueing and after
transmission completion.

To avoid to calculate this length twice, extend the struct can_skb_priv to hold
the length of the CAN frame and extend __can_get_echo_skb() to return that
value.

Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/r/20210111141930.693847-12-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/rx-offload.c | 2 +-
 drivers/net/can/dev/skb.c        | 9 +++++++--
 include/linux/can/skb.h          | 4 +++-
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/rx-offload.c b/drivers/net/can/dev/rx-offload.c
index 3c1912c0430b..6a26b5282df1 100644
--- a/drivers/net/can/dev/rx-offload.c
+++ b/drivers/net/can/dev/rx-offload.c
@@ -271,7 +271,7 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload,
 	u8 len;
 	int err;
 
-	skb = __can_get_echo_skb(dev, idx, &len);
+	skb = __can_get_echo_skb(dev, idx, &len, NULL);
 	if (!skb)
 		return 0;
 
diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c
index 26cd597ff771..24f782a23409 100644
--- a/drivers/net/can/dev/skb.c
+++ b/drivers/net/can/dev/skb.c
@@ -76,7 +76,8 @@ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
 EXPORT_SYMBOL_GPL(can_put_echo_skb);
 
 struct sk_buff *
-__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr)
+__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr,
+		   unsigned int *frame_len_ptr)
 {
 	struct can_priv *priv = netdev_priv(dev);
 
@@ -91,6 +92,7 @@ __can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr)
 		 * length is supported on both CAN and CANFD frames.
 		 */
 		struct sk_buff *skb = priv->echo_skb[idx];
+		struct can_skb_priv *can_skb_priv = can_skb_prv(skb);
 		struct canfd_frame *cf = (struct canfd_frame *)skb->data;
 
 		/* get the real payload length for netdev statistics */
@@ -99,6 +101,9 @@ __can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr)
 		else
 			*len_ptr = cf->len;
 
+		if (frame_len_ptr)
+			*frame_len_ptr = can_skb_priv->frame_len;
+
 		priv->echo_skb[idx] = NULL;
 
 		return skb;
@@ -118,7 +123,7 @@ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx)
 	struct sk_buff *skb;
 	u8 len;
 
-	skb = __can_get_echo_skb(dev, idx, &len);
+	skb = __can_get_echo_skb(dev, idx, &len, NULL);
 	if (!skb)
 		return 0;
 
diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index c90ebbd3008c..5db9da30843c 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -20,7 +20,7 @@ void can_flush_echo_skb(struct net_device *dev);
 int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
 		     unsigned int idx);
 struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx,
-				   u8 *len_ptr);
+				   u8 *len_ptr, unsigned int *frame_len_ptr);
 unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx);
 void can_free_echo_skb(struct net_device *dev, unsigned int idx);
 struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf);
@@ -42,11 +42,13 @@ struct sk_buff *alloc_can_err_skb(struct net_device *dev,
  * struct can_skb_priv - private additional data inside CAN sk_buffs
  * @ifindex:	ifindex of the first interface the CAN frame appeared on
  * @skbcnt:	atomic counter to have an unique id together with skb pointer
+ * @frame_len:	length of CAN frame in data link layer
  * @cf:		align to the following CAN frame at skb->data
  */
 struct can_skb_priv {
 	int ifindex;
 	int skbcnt;
+	unsigned int frame_len;
 	struct can_frame cf[];
 };
 
-- 
cgit v1.2.3


From 1dcb6e57db833419483d0df2d956b1cc2a802683 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Mon, 11 Jan 2021 15:19:27 +0100
Subject: can: dev: can_put_echo_skb(): extend to handle frame_len

Add a frame_len argument to can_put_echo_skb() which is used to save length of
the CAN frame into field frame_len of struct can_skb_priv so that it can be
later used after transmission completion. Convert all users of this function,
too.

Drivers which implement BQL call can_put_echo_skb() with the output of
can_skb_get_frame_len(skb) and drivers which do not simply pass zero as an
input (in the same way that NULL would be given to can_get_echo_skb()). This
way, we have a nice symmetry between the two echo functions.

Link: https://lore.kernel.org/r/20210111061335.39983-1-mailhol.vincent@wanadoo.fr
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/r/20210111141930.693847-13-mkl@pengutronix.de
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
---
 drivers/net/can/at91_can.c                       | 2 +-
 drivers/net/can/c_can/c_can.c                    | 2 +-
 drivers/net/can/cc770/cc770.c                    | 2 +-
 drivers/net/can/dev/skb.c                        | 5 ++++-
 drivers/net/can/flexcan.c                        | 2 +-
 drivers/net/can/grcan.c                          | 2 +-
 drivers/net/can/ifi_canfd/ifi_canfd.c            | 2 +-
 drivers/net/can/kvaser_pciefd.c                  | 2 +-
 drivers/net/can/m_can/m_can.c                    | 4 ++--
 drivers/net/can/mscan/mscan.c                    | 2 +-
 drivers/net/can/pch_can.c                        | 2 +-
 drivers/net/can/peak_canfd/peak_canfd.c          | 2 +-
 drivers/net/can/rcar/rcar_can.c                  | 2 +-
 drivers/net/can/rcar/rcar_canfd.c                | 2 +-
 drivers/net/can/sja1000/sja1000.c                | 2 +-
 drivers/net/can/softing/softing_main.c           | 2 +-
 drivers/net/can/spi/hi311x.c                     | 2 +-
 drivers/net/can/spi/mcp251x.c                    | 2 +-
 drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c   | 2 +-
 drivers/net/can/sun4i_can.c                      | 2 +-
 drivers/net/can/ti_hecc.c                        | 2 +-
 drivers/net/can/usb/ems_usb.c                    | 2 +-
 drivers/net/can/usb/esd_usb2.c                   | 2 +-
 drivers/net/can/usb/gs_usb.c                     | 2 +-
 drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 2 +-
 drivers/net/can/usb/mcba_usb.c                   | 2 +-
 drivers/net/can/usb/peak_usb/pcan_usb_core.c     | 2 +-
 drivers/net/can/usb/ucan.c                       | 2 +-
 drivers/net/can/usb/usb_8dev.c                   | 2 +-
 drivers/net/can/xilinx_can.c                     | 4 ++--
 include/linux/can/skb.h                          | 2 +-
 31 files changed, 36 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c
index 5284f0ab3b06..90b223a80ed4 100644
--- a/drivers/net/can/at91_can.c
+++ b/drivers/net/can/at91_can.c
@@ -484,7 +484,7 @@ static netdev_tx_t at91_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	stats->tx_bytes += cf->len;
 
 	/* _NOTE_: subtract AT91_MB_TX_FIRST offset from mb! */
-	can_put_echo_skb(skb, dev, mb - get_mb_tx_first(priv));
+	can_put_echo_skb(skb, dev, mb - get_mb_tx_first(priv), 0);
 
 	/*
 	 * we have to stop the queue and deliver all messages in case
diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c
index 63f48b016ecd..13638954a25c 100644
--- a/drivers/net/can/c_can/c_can.c
+++ b/drivers/net/can/c_can/c_can.c
@@ -476,7 +476,7 @@ static netdev_tx_t c_can_start_xmit(struct sk_buff *skb,
 	 */
 	c_can_setup_tx_object(dev, IF_TX, frame, idx);
 	priv->dlc[idx] = frame->len;
-	can_put_echo_skb(skb, dev, idx);
+	can_put_echo_skb(skb, dev, idx, 0);
 
 	/* Update the active bits */
 	atomic_add((1 << idx), &priv->tx_active);
diff --git a/drivers/net/can/cc770/cc770.c b/drivers/net/can/cc770/cc770.c
index 8d9f332c35e0..e53ca338368a 100644
--- a/drivers/net/can/cc770/cc770.c
+++ b/drivers/net/can/cc770/cc770.c
@@ -702,7 +702,7 @@ static void cc770_tx_interrupt(struct net_device *dev, unsigned int o)
 	stats->tx_bytes += cf->len;
 	stats->tx_packets++;
 
-	can_put_echo_skb(priv->tx_skb, dev, 0);
+	can_put_echo_skb(priv->tx_skb, dev, 0, 0);
 	can_get_echo_skb(dev, 0);
 	priv->tx_skb = NULL;
 
diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c
index 24f782a23409..c184b4dce19e 100644
--- a/drivers/net/can/dev/skb.c
+++ b/drivers/net/can/dev/skb.c
@@ -38,7 +38,7 @@ void can_flush_echo_skb(struct net_device *dev)
  * priv->echo_skb, if necessary.
  */
 int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
-		     unsigned int idx)
+		     unsigned int idx, unsigned int frame_len)
 {
 	struct can_priv *priv = netdev_priv(dev);
 
@@ -62,6 +62,9 @@ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 		skb->dev = dev;
 
+		/* save frame_len to reuse it when transmission is completed */
+		can_skb_prv(skb)->frame_len = frame_len;
+
 		/* save this skb for tx interrupt echo handling */
 		priv->echo_skb[idx] = skb;
 	} else {
diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index 7ab20a6b0d1d..202d08f8e1a4 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -815,7 +815,7 @@ static netdev_tx_t flexcan_start_xmit(struct sk_buff *skb, struct net_device *de
 		priv->write(data, &priv->tx_mb->data[i / sizeof(u32)]);
 	}
 
-	can_put_echo_skb(skb, dev, 0);
+	can_put_echo_skb(skb, dev, 0, 0);
 
 	priv->write(can_id, &priv->tx_mb->can_id);
 	priv->write(ctrl, &priv->tx_mb->can_ctrl);
diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c
index f5d94a692576..8086cdc10000 100644
--- a/drivers/net/can/grcan.c
+++ b/drivers/net/can/grcan.c
@@ -1448,7 +1448,7 @@ static netdev_tx_t grcan_start_xmit(struct sk_buff *skb,
 	 * taken.
 	 */
 	priv->txdlc[slotindex] = cf->len; /* Store dlc for statistics */
-	can_put_echo_skb(skb, dev, slotindex);
+	can_put_echo_skb(skb, dev, slotindex, 0);
 
 	/* Make sure everything is written before allowing hardware to
 	 * read from the memory
diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c
index 86b0e1406a21..56ac9e1dace7 100644
--- a/drivers/net/can/ifi_canfd/ifi_canfd.c
+++ b/drivers/net/can/ifi_canfd/ifi_canfd.c
@@ -922,7 +922,7 @@ static netdev_tx_t ifi_canfd_start_xmit(struct sk_buff *skb,
 	writel(0, priv->base + IFI_CANFD_TXFIFO_REPEATCOUNT);
 	writel(0, priv->base + IFI_CANFD_TXFIFO_SUSPEND_US);
 
-	can_put_echo_skb(skb, ndev, 0);
+	can_put_echo_skb(skb, ndev, 0, 0);
 
 	/* Start the transmission */
 	writel(IFI_CANFD_TXSTCMD_ADD_MSG, priv->base + IFI_CANFD_TXSTCMD);
diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c
index 969cedb9b0b6..0cf82f0646a3 100644
--- a/drivers/net/can/kvaser_pciefd.c
+++ b/drivers/net/can/kvaser_pciefd.c
@@ -778,7 +778,7 @@ static netdev_tx_t kvaser_pciefd_start_xmit(struct sk_buff *skb,
 	spin_lock_irqsave(&can->echo_lock, irq_flags);
 
 	/* Prepare and save echo skb in internal slot */
-	can_put_echo_skb(skb, netdev, can->echo_idx);
+	can_put_echo_skb(skb, netdev, can->echo_idx, 0);
 
 	/* Move echo index to the next slot */
 	can->echo_idx = (can->echo_idx + 1) % can->can.echo_skb_max;
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index da551fd0f502..fff7432103cb 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1483,7 +1483,7 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev)
 					 M_CAN_FIFO_DATA(i / 4),
 					 *(u32 *)(cf->data + i));
 
-		can_put_echo_skb(skb, dev, 0);
+		can_put_echo_skb(skb, dev, 0, 0);
 
 		if (cdev->can.ctrlmode & CAN_CTRLMODE_FD) {
 			cccr = m_can_read(cdev, M_CAN_CCCR);
@@ -1554,7 +1554,7 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev)
 		/* Push loopback echo.
 		 * Will be looped back on TX interrupt based on message marker
 		 */
-		can_put_echo_skb(skb, dev, putidx);
+		can_put_echo_skb(skb, dev, putidx, 0);
 
 		/* Enable TX FIFO element to start transfer  */
 		m_can_write(cdev, M_CAN_TXBAR, (1 << putidx));
diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c
index 5ed00a1558e1..a28fdaa411c6 100644
--- a/drivers/net/can/mscan/mscan.c
+++ b/drivers/net/can/mscan/mscan.c
@@ -270,7 +270,7 @@ static netdev_tx_t mscan_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	list_add_tail(&priv->tx_queue[buf_id].list, &priv->tx_head);
 
-	can_put_echo_skb(skb, dev, buf_id);
+	can_put_echo_skb(skb, dev, buf_id, 0);
 
 	/* Enable interrupt. */
 	priv->tx_active |= 1 << buf_id;
diff --git a/drivers/net/can/pch_can.c b/drivers/net/can/pch_can.c
index 4f9e7ec192aa..a4c35b48d8e9 100644
--- a/drivers/net/can/pch_can.c
+++ b/drivers/net/can/pch_can.c
@@ -924,7 +924,7 @@ static netdev_tx_t pch_xmit(struct sk_buff *skb, struct net_device *ndev)
 			  &priv->regs->ifregs[1].data[i / 2]);
 	}
 
-	can_put_echo_skb(skb, ndev, tx_obj_no - PCH_RX_OBJ_END - 1);
+	can_put_echo_skb(skb, ndev, tx_obj_no - PCH_RX_OBJ_END - 1, 0);
 
 	/* Set the size of the data. Update if2_mcont */
 	iowrite32(cf->len | PCH_IF_MCONT_NEWDAT | PCH_IF_MCONT_TXRQXT |
diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c
index c5334b0c3038..179a8e10fbb8 100644
--- a/drivers/net/can/peak_canfd/peak_canfd.c
+++ b/drivers/net/can/peak_canfd/peak_canfd.c
@@ -716,7 +716,7 @@ static netdev_tx_t peak_canfd_start_xmit(struct sk_buff *skb,
 	spin_lock_irqsave(&priv->echo_lock, flags);
 
 	/* prepare and save echo skb in internal slot */
-	can_put_echo_skb(skb, ndev, priv->echo_idx);
+	can_put_echo_skb(skb, ndev, priv->echo_idx, 0);
 
 	/* move echo index to the next slot */
 	priv->echo_idx = (priv->echo_idx + 1) % priv->can.echo_skb_max;
diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c
index c803327f8f79..0b7e488bc4fe 100644
--- a/drivers/net/can/rcar/rcar_can.c
+++ b/drivers/net/can/rcar/rcar_can.c
@@ -617,7 +617,7 @@ static netdev_tx_t rcar_can_start_xmit(struct sk_buff *skb,
 	writeb(cf->len, &priv->regs->mb[RCAR_CAN_TX_FIFO_MBX].dlc);
 
 	priv->tx_dlc[priv->tx_head % RCAR_CAN_FIFO_DEPTH] = cf->len;
-	can_put_echo_skb(skb, ndev, priv->tx_head % RCAR_CAN_FIFO_DEPTH);
+	can_put_echo_skb(skb, ndev, priv->tx_head % RCAR_CAN_FIFO_DEPTH, 0);
 	priv->tx_head++;
 	/* Start Tx: write 0xff to the TFPCR register to increment
 	 * the CPU-side pointer for the transmit FIFO to the next
diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c
index 2778ed5c61d1..38376f29bc56 100644
--- a/drivers/net/can/rcar/rcar_canfd.c
+++ b/drivers/net/can/rcar/rcar_canfd.c
@@ -1390,7 +1390,7 @@ static netdev_tx_t rcar_canfd_start_xmit(struct sk_buff *skb,
 	}
 
 	priv->tx_len[priv->tx_head % RCANFD_FIFO_DEPTH] = cf->len;
-	can_put_echo_skb(skb, ndev, priv->tx_head % RCANFD_FIFO_DEPTH);
+	can_put_echo_skb(skb, ndev, priv->tx_head % RCANFD_FIFO_DEPTH, 0);
 
 	spin_lock_irqsave(&priv->tx_lock, flags);
 	priv->tx_head++;
diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
index b6a7003c51d2..e98482c7bf33 100644
--- a/drivers/net/can/sja1000/sja1000.c
+++ b/drivers/net/can/sja1000/sja1000.c
@@ -318,7 +318,7 @@ static netdev_tx_t sja1000_start_xmit(struct sk_buff *skb,
 	for (i = 0; i < cf->len; i++)
 		priv->write_reg(priv, dreg++, cf->data[i]);
 
-	can_put_echo_skb(skb, dev, 0);
+	can_put_echo_skb(skb, dev, 0, 0);
 
 	if (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT)
 		cmd_reg_val |= CMD_AT;
diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c
index 40070c930202..a5314448c5ae 100644
--- a/drivers/net/can/softing/softing_main.c
+++ b/drivers/net/can/softing/softing_main.c
@@ -104,7 +104,7 @@ static netdev_tx_t softing_netdev_start_xmit(struct sk_buff *skb,
 	card->tx.last_bus = priv->index;
 	++card->tx.pending;
 	++priv->tx.pending;
-	can_put_echo_skb(skb, dev, priv->tx.echo_put);
+	can_put_echo_skb(skb, dev, priv->tx.echo_put, 0);
 	++priv->tx.echo_put;
 	if (priv->tx.echo_put >= TX_ECHO_SKB_MAX)
 		priv->tx.echo_put = 0;
diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c
index f9455de94786..8c83a9e5a9e4 100644
--- a/drivers/net/can/spi/hi311x.c
+++ b/drivers/net/can/spi/hi311x.c
@@ -586,7 +586,7 @@ static void hi3110_tx_work_handler(struct work_struct *ws)
 			frame = (struct can_frame *)priv->tx_skb->data;
 			hi3110_hw_tx(spi, frame);
 			priv->tx_len = 1 + frame->len;
-			can_put_echo_skb(priv->tx_skb, net, 0);
+			can_put_echo_skb(priv->tx_skb, net, 0, 0);
 			priv->tx_skb = NULL;
 		}
 	}
diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c
index 25859d16d06f..40866754aafc 100644
--- a/drivers/net/can/spi/mcp251x.c
+++ b/drivers/net/can/spi/mcp251x.c
@@ -1002,7 +1002,7 @@ static void mcp251x_tx_work_handler(struct work_struct *ws)
 				frame->len = CAN_FRAME_MAX_DATA_LEN;
 			mcp251x_hw_tx(spi, frame, 0);
 			priv->tx_len = 1 + frame->len;
-			can_put_echo_skb(priv->tx_skb, net, 0);
+			can_put_echo_skb(priv->tx_skb, net, 0, 0);
 			priv->tx_skb = NULL;
 		}
 	}
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
index 36235afb0bc6..95bba456a4cd 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
@@ -2436,7 +2436,7 @@ static netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
 	if (tx_ring->head - tx_ring->tail >= tx_ring->obj_num)
 		netif_stop_queue(ndev);
 
-	can_put_echo_skb(skb, ndev, tx_head);
+	can_put_echo_skb(skb, ndev, tx_head, 0);
 
 	err = mcp251xfd_tx_obj_write(priv, tx_obj);
 	if (err)
diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c
index 783b63218b7b..b75175d59104 100644
--- a/drivers/net/can/sun4i_can.c
+++ b/drivers/net/can/sun4i_can.c
@@ -448,7 +448,7 @@ static netdev_tx_t sun4ican_start_xmit(struct sk_buff *skb, struct net_device *d
 
 	writel(msg_flag_n, priv->base + SUN4I_REG_BUF0_ADDR);
 
-	can_put_echo_skb(skb, dev, 0);
+	can_put_echo_skb(skb, dev, 0, 0);
 
 	if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK)
 		sun4i_can_write_cmdreg(priv, SUN4I_CMD_SELF_RCV_REQ);
diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
index a6850ff0b55b..485c19bc98c2 100644
--- a/drivers/net/can/ti_hecc.c
+++ b/drivers/net/can/ti_hecc.c
@@ -513,7 +513,7 @@ static netdev_tx_t ti_hecc_xmit(struct sk_buff *skb, struct net_device *ndev)
 			       be32_to_cpu(*(__be32 *)(cf->data + 4)));
 	else
 		*(u32 *)(cf->data + 4) = 0;
-	can_put_echo_skb(skb, ndev, mbxno);
+	can_put_echo_skb(skb, ndev, mbxno, 0);
 
 	spin_lock_irqsave(&priv->mbx_lock, flags);
 	--priv->tx_head;
diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c
index 25eee4466364..5e5330060464 100644
--- a/drivers/net/can/usb/ems_usb.c
+++ b/drivers/net/can/usb/ems_usb.c
@@ -801,7 +801,7 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne
 	urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
 	usb_anchor_urb(urb, &dev->tx_submitted);
 
-	can_put_echo_skb(skb, netdev, context->echo_index);
+	can_put_echo_skb(skb, netdev, context->echo_index, 0);
 
 	atomic_inc(&dev->active_tx_urbs);
 
diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c
index 9eed75a4b678..68d8a85f00c4 100644
--- a/drivers/net/can/usb/esd_usb2.c
+++ b/drivers/net/can/usb/esd_usb2.c
@@ -783,7 +783,7 @@ static netdev_tx_t esd_usb2_start_xmit(struct sk_buff *skb,
 
 	usb_anchor_urb(urb, &priv->tx_submitted);
 
-	can_put_echo_skb(skb, netdev, context->echo_index);
+	can_put_echo_skb(skb, netdev, context->echo_index, 0);
 
 	atomic_inc(&priv->active_tx_jobs);
 
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index 0487095e1fd0..5ce9ba5d29d6 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -525,7 +525,7 @@ static netdev_tx_t gs_can_start_xmit(struct sk_buff *skb,
 	urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
 	usb_anchor_urb(urb, &dev->tx_submitted);
 
-	can_put_echo_skb(skb, netdev, idx);
+	can_put_echo_skb(skb, netdev, idx, 0);
 
 	atomic_inc(&dev->active_tx_urbs);
 
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
index e2d58846c40c..2b7efd296758 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
@@ -578,7 +578,7 @@ static netdev_tx_t kvaser_usb_start_xmit(struct sk_buff *skb,
 
 	context->priv = priv;
 
-	can_put_echo_skb(skb, netdev, context->echo_index);
+	can_put_echo_skb(skb, netdev, context->echo_index, 0);
 
 	usb_fill_bulk_urb(urb, dev->udev,
 			  usb_sndbulkpipe(dev->udev,
diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c
index df54eb7d4b36..5347c89992ce 100644
--- a/drivers/net/can/usb/mcba_usb.c
+++ b/drivers/net/can/usb/mcba_usb.c
@@ -355,7 +355,7 @@ static netdev_tx_t mcba_usb_start_xmit(struct sk_buff *skb,
 	if (cf->can_id & CAN_RTR_FLAG)
 		usb_msg.dlc |= MCBA_DLC_RTR_MASK;
 
-	can_put_echo_skb(skb, priv->netdev, ctx->ndx);
+	can_put_echo_skb(skb, priv->netdev, ctx->ndx, 0);
 
 	err = mcba_usb_xmit(priv, (struct mcba_usb_msg *)&usb_msg, ctx);
 	if (err)
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
index 251835ea15aa..95672750419a 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
@@ -365,7 +365,7 @@ static netdev_tx_t peak_usb_ndo_start_xmit(struct sk_buff *skb,
 
 	usb_anchor_urb(urb, &dev->tx_submitted);
 
-	can_put_echo_skb(skb, netdev, context->echo_index);
+	can_put_echo_skb(skb, netdev, context->echo_index, 0);
 
 	atomic_inc(&dev->active_tx_urbs);
 
diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c
index 7d92da8954fe..5add27614e2b 100644
--- a/drivers/net/can/usb/ucan.c
+++ b/drivers/net/can/usb/ucan.c
@@ -1137,7 +1137,7 @@ static netdev_tx_t ucan_start_xmit(struct sk_buff *skb,
 
 	/* put the skb on can loopback stack */
 	spin_lock_irqsave(&up->echo_skb_lock, flags);
-	can_put_echo_skb(skb, up->netdev, echo_index);
+	can_put_echo_skb(skb, up->netdev, echo_index, 0);
 	spin_unlock_irqrestore(&up->echo_skb_lock, flags);
 
 	/* transmit it */
diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c
index 44478304ff46..2e824d9d8167 100644
--- a/drivers/net/can/usb/usb_8dev.c
+++ b/drivers/net/can/usb/usb_8dev.c
@@ -664,7 +664,7 @@ static netdev_tx_t usb_8dev_start_xmit(struct sk_buff *skb,
 	urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
 	usb_anchor_urb(urb, &priv->tx_submitted);
 
-	can_put_echo_skb(skb, netdev, context->echo_index);
+	can_put_echo_skb(skb, netdev, context->echo_index, 0);
 
 	atomic_inc(&priv->active_tx_urbs);
 
diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c
index 3f54edee92eb..8d5132a3f2c9 100644
--- a/drivers/net/can/xilinx_can.c
+++ b/drivers/net/can/xilinx_can.c
@@ -592,9 +592,9 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb,
 
 	if (!(priv->devtype.flags & XCAN_FLAG_TX_MAILBOXES) &&
 	    (priv->devtype.flags & XCAN_FLAG_TXFEMP))
-		can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max);
+		can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max, 0);
 	else
-		can_put_echo_skb(skb, ndev, 0);
+		can_put_echo_skb(skb, ndev, 0, 0);
 
 	priv->tx_head++;
 
diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index 5db9da30843c..eaac4a637ae0 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -18,7 +18,7 @@
 
 void can_flush_echo_skb(struct net_device *dev);
 int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
-		     unsigned int idx);
+		     unsigned int idx, unsigned int frame_len);
 struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx,
 				   u8 *len_ptr, unsigned int *frame_len_ptr);
 unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx);
-- 
cgit v1.2.3


From 9420e1d495e2a3b5f673148b7e3ebc861b1441f7 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 11 Jan 2021 15:19:28 +0100
Subject: can: dev: can_get_echo_skb(): extend to return can frame length

In order to implement byte queue limits (bql) in CAN drivers, the length of the
CAN frame needs to be passed into the networking stack after queueing and after
transmission completion.

To avoid to calculate this length twice, extend can_get_echo_skb() to return
that value. Convert all users of this function, too.

Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/r/20210111141930.693847-14-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/at91_can.c                        | 2 +-
 drivers/net/can/c_can/c_can.c                     | 2 +-
 drivers/net/can/cc770/cc770.c                     | 2 +-
 drivers/net/can/dev/skb.c                         | 5 +++--
 drivers/net/can/grcan.c                           | 2 +-
 drivers/net/can/ifi_canfd/ifi_canfd.c             | 2 +-
 drivers/net/can/kvaser_pciefd.c                   | 4 ++--
 drivers/net/can/m_can/m_can.c                     | 4 ++--
 drivers/net/can/mscan/mscan.c                     | 2 +-
 drivers/net/can/pch_can.c                         | 2 +-
 drivers/net/can/peak_canfd/peak_canfd.c           | 2 +-
 drivers/net/can/rcar/rcar_can.c                   | 2 +-
 drivers/net/can/rcar/rcar_canfd.c                 | 2 +-
 drivers/net/can/sja1000/sja1000.c                 | 2 +-
 drivers/net/can/softing/softing_main.c            | 2 +-
 drivers/net/can/spi/hi311x.c                      | 2 +-
 drivers/net/can/spi/mcp251x.c                     | 2 +-
 drivers/net/can/sun4i_can.c                       | 2 +-
 drivers/net/can/usb/ems_usb.c                     | 2 +-
 drivers/net/can/usb/esd_usb2.c                    | 2 +-
 drivers/net/can/usb/gs_usb.c                      | 2 +-
 drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c | 2 +-
 drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c  | 2 +-
 drivers/net/can/usb/mcba_usb.c                    | 2 +-
 drivers/net/can/usb/peak_usb/pcan_usb_core.c      | 2 +-
 drivers/net/can/usb/ucan.c                        | 2 +-
 drivers/net/can/usb/usb_8dev.c                    | 2 +-
 drivers/net/can/xilinx_can.c                      | 2 +-
 include/linux/can/skb.h                           | 3 ++-
 29 files changed, 34 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c
index 90b223a80ed4..9ad9b39f480e 100644
--- a/drivers/net/can/at91_can.c
+++ b/drivers/net/can/at91_can.c
@@ -856,7 +856,7 @@ static void at91_irq_tx(struct net_device *dev, u32 reg_sr)
 		if (likely(reg_msr & AT91_MSR_MRDY &&
 			   ~reg_msr & AT91_MSR_MABT)) {
 			/* _NOTE_: subtract AT91_MB_TX_FIRST offset from mb! */
-			can_get_echo_skb(dev, mb - get_mb_tx_first(priv));
+			can_get_echo_skb(dev, mb - get_mb_tx_first(priv), NULL);
 			dev->stats.tx_packets++;
 			can_led_event(dev, CAN_LED_EVENT_TX);
 		}
diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c
index 13638954a25c..ef474bae47a1 100644
--- a/drivers/net/can/c_can/c_can.c
+++ b/drivers/net/can/c_can/c_can.c
@@ -733,7 +733,7 @@ static void c_can_do_tx(struct net_device *dev)
 		pend &= ~(1 << idx);
 		obj = idx + C_CAN_MSG_OBJ_TX_FIRST;
 		c_can_inval_tx_object(dev, IF_RX, obj);
-		can_get_echo_skb(dev, idx);
+		can_get_echo_skb(dev, idx, NULL);
 		bytes += priv->dlc[idx];
 		pkts++;
 	}
diff --git a/drivers/net/can/cc770/cc770.c b/drivers/net/can/cc770/cc770.c
index e53ca338368a..f8a130f594e2 100644
--- a/drivers/net/can/cc770/cc770.c
+++ b/drivers/net/can/cc770/cc770.c
@@ -703,7 +703,7 @@ static void cc770_tx_interrupt(struct net_device *dev, unsigned int o)
 	stats->tx_packets++;
 
 	can_put_echo_skb(priv->tx_skb, dev, 0, 0);
-	can_get_echo_skb(dev, 0);
+	can_get_echo_skb(dev, 0, NULL);
 	priv->tx_skb = NULL;
 
 	netif_wake_queue(dev);
diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c
index c184b4dce19e..53683d4312f1 100644
--- a/drivers/net/can/dev/skb.c
+++ b/drivers/net/can/dev/skb.c
@@ -121,12 +121,13 @@ __can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr,
  * is handled in the device driver. The driver must protect
  * access to priv->echo_skb, if necessary.
  */
-unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx)
+unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx,
+			      unsigned int *frame_len_ptr)
 {
 	struct sk_buff *skb;
 	u8 len;
 
-	skb = __can_get_echo_skb(dev, idx, &len, NULL);
+	skb = __can_get_echo_skb(dev, idx, &len, frame_len_ptr);
 	if (!skb)
 		return 0;
 
diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c
index 8086cdc10000..4a8453290530 100644
--- a/drivers/net/can/grcan.c
+++ b/drivers/net/can/grcan.c
@@ -517,7 +517,7 @@ static int catch_up_echo_skb(struct net_device *dev, int budget, bool echo)
 			stats->tx_packets++;
 			stats->tx_bytes += priv->txdlc[i];
 			priv->txdlc[i] = 0;
-			can_get_echo_skb(dev, i);
+			can_get_echo_skb(dev, i, NULL);
 		} else {
 			/* For cleanup of untransmitted messages */
 			can_free_echo_skb(dev, i);
diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c
index 56ac9e1dace7..5bb957a26bc6 100644
--- a/drivers/net/can/ifi_canfd/ifi_canfd.c
+++ b/drivers/net/can/ifi_canfd/ifi_canfd.c
@@ -629,7 +629,7 @@ static irqreturn_t ifi_canfd_isr(int irq, void *dev_id)
 
 	/* TX IRQ */
 	if (isr & IFI_CANFD_INTERRUPT_TXFIFO_REMOVE) {
-		stats->tx_bytes += can_get_echo_skb(ndev, 0);
+		stats->tx_bytes += can_get_echo_skb(ndev, 0, NULL);
 		stats->tx_packets++;
 		can_led_event(ndev, CAN_LED_EVENT_TX);
 	}
diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c
index 0cf82f0646a3..37e05010ca91 100644
--- a/drivers/net/can/kvaser_pciefd.c
+++ b/drivers/net/can/kvaser_pciefd.c
@@ -1467,7 +1467,7 @@ static int kvaser_pciefd_handle_eack_packet(struct kvaser_pciefd *pcie,
 				  can->reg_base + KVASER_PCIEFD_KCAN_CTRL_REG);
 	} else {
 		int echo_idx = p->header[0] & KVASER_PCIEFD_PACKET_SEQ_MSK;
-		int dlc = can_get_echo_skb(can->can.dev, echo_idx);
+		int dlc = can_get_echo_skb(can->can.dev, echo_idx, NULL);
 		struct net_device_stats *stats = &can->can.dev->stats;
 
 		stats->tx_bytes += dlc;
@@ -1533,7 +1533,7 @@ static int kvaser_pciefd_handle_ack_packet(struct kvaser_pciefd *pcie,
 		netdev_dbg(can->can.dev, "Packet was flushed\n");
 	} else {
 		int echo_idx = p->header[0] & KVASER_PCIEFD_PACKET_SEQ_MSK;
-		int dlc = can_get_echo_skb(can->can.dev, echo_idx);
+		int dlc = can_get_echo_skb(can->can.dev, echo_idx, NULL);
 		u8 count = ioread32(can->reg_base +
 				    KVASER_PCIEFD_KCAN_TX_NPACKETS_REG) & 0xff;
 
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index fff7432103cb..3752520a7d4b 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -930,7 +930,7 @@ static void m_can_echo_tx_event(struct net_device *dev)
 						(fgi << TXEFA_EFAI_SHIFT)));
 
 		/* update stats */
-		stats->tx_bytes += can_get_echo_skb(dev, msg_mark);
+		stats->tx_bytes += can_get_echo_skb(dev, msg_mark, NULL);
 		stats->tx_packets++;
 	}
 }
@@ -972,7 +972,7 @@ static irqreturn_t m_can_isr(int irq, void *dev_id)
 	if (cdev->version == 30) {
 		if (ir & IR_TC) {
 			/* Transmission Complete Interrupt*/
-			stats->tx_bytes += can_get_echo_skb(dev, 0);
+			stats->tx_bytes += can_get_echo_skb(dev, 0, NULL);
 			stats->tx_packets++;
 			can_led_event(dev, CAN_LED_EVENT_TX);
 			netif_wake_queue(dev);
diff --git a/drivers/net/can/mscan/mscan.c b/drivers/net/can/mscan/mscan.c
index a28fdaa411c6..fa32e418eb29 100644
--- a/drivers/net/can/mscan/mscan.c
+++ b/drivers/net/can/mscan/mscan.c
@@ -448,7 +448,7 @@ static irqreturn_t mscan_isr(int irq, void *dev_id)
 			out_8(&regs->cantbsel, mask);
 			stats->tx_bytes += in_8(&regs->tx.dlr);
 			stats->tx_packets++;
-			can_get_echo_skb(dev, entry->id);
+			can_get_echo_skb(dev, entry->id, NULL);
 			priv->tx_active &= ~mask;
 			list_del(pos);
 		}
diff --git a/drivers/net/can/pch_can.c b/drivers/net/can/pch_can.c
index a4c35b48d8e9..92a54a5fd4c5 100644
--- a/drivers/net/can/pch_can.c
+++ b/drivers/net/can/pch_can.c
@@ -711,7 +711,7 @@ static void pch_can_tx_complete(struct net_device *ndev, u32 int_stat)
 	struct net_device_stats *stats = &(priv->ndev->stats);
 	u32 dlc;
 
-	can_get_echo_skb(ndev, int_stat - PCH_RX_OBJ_END - 1);
+	can_get_echo_skb(ndev, int_stat - PCH_RX_OBJ_END - 1, NULL);
 	iowrite32(PCH_CMASK_RX_TX_GET | PCH_CMASK_CLRINTPND,
 		  &priv->regs->ifregs[1].cmask);
 	pch_can_rw_msg_obj(&priv->regs->ifregs[1].creq, int_stat);
diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c
index 179a8e10fbb8..00847cbaf7b6 100644
--- a/drivers/net/can/peak_canfd/peak_canfd.c
+++ b/drivers/net/can/peak_canfd/peak_canfd.c
@@ -266,7 +266,7 @@ static int pucan_handle_can_rx(struct peak_canfd_priv *priv,
 		unsigned long flags;
 
 		spin_lock_irqsave(&priv->echo_lock, flags);
-		can_get_echo_skb(priv->ndev, msg->client);
+		can_get_echo_skb(priv->ndev, msg->client, NULL);
 
 		/* count bytes of the echo instead of skb */
 		stats->tx_bytes += cf_len;
diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c
index 0b7e488bc4fe..4870c4ea190a 100644
--- a/drivers/net/can/rcar/rcar_can.c
+++ b/drivers/net/can/rcar/rcar_can.c
@@ -386,7 +386,7 @@ static void rcar_can_tx_done(struct net_device *ndev)
 		stats->tx_bytes += priv->tx_dlc[priv->tx_tail %
 						RCAR_CAN_FIFO_DEPTH];
 		priv->tx_dlc[priv->tx_tail % RCAR_CAN_FIFO_DEPTH] = 0;
-		can_get_echo_skb(ndev, priv->tx_tail % RCAR_CAN_FIFO_DEPTH);
+		can_get_echo_skb(ndev, priv->tx_tail % RCAR_CAN_FIFO_DEPTH, NULL);
 		priv->tx_tail++;
 		netif_wake_queue(ndev);
 	}
diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c
index 38376f29bc56..d8d233e62990 100644
--- a/drivers/net/can/rcar/rcar_canfd.c
+++ b/drivers/net/can/rcar/rcar_canfd.c
@@ -1044,7 +1044,7 @@ static void rcar_canfd_tx_done(struct net_device *ndev)
 		stats->tx_packets++;
 		stats->tx_bytes += priv->tx_len[sent];
 		priv->tx_len[sent] = 0;
-		can_get_echo_skb(ndev, sent);
+		can_get_echo_skb(ndev, sent, NULL);
 
 		spin_lock_irqsave(&priv->tx_lock, flags);
 		priv->tx_tail++;
diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
index e98482c7bf33..9e86488ba55f 100644
--- a/drivers/net/can/sja1000/sja1000.c
+++ b/drivers/net/can/sja1000/sja1000.c
@@ -531,7 +531,7 @@ irqreturn_t sja1000_interrupt(int irq, void *dev_id)
 				stats->tx_bytes +=
 					priv->read_reg(priv, SJA1000_FI) & 0xf;
 				stats->tx_packets++;
-				can_get_echo_skb(dev, 0);
+				can_get_echo_skb(dev, 0, NULL);
 			}
 			netif_wake_queue(dev);
 			can_led_event(dev, CAN_LED_EVENT_TX);
diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c
index a5314448c5ae..c44f3411e561 100644
--- a/drivers/net/can/softing/softing_main.c
+++ b/drivers/net/can/softing/softing_main.c
@@ -284,7 +284,7 @@ static int softing_handle_1(struct softing *card)
 			skb = priv->can.echo_skb[priv->tx.echo_get];
 			if (skb)
 				skb->tstamp = ktime;
-			can_get_echo_skb(netdev, priv->tx.echo_get);
+			can_get_echo_skb(netdev, priv->tx.echo_get, NULL);
 			++priv->tx.echo_get;
 			if (priv->tx.echo_get >= TX_ECHO_SKB_MAX)
 				priv->tx.echo_get = 0;
diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c
index 8c83a9e5a9e4..c3e020c90111 100644
--- a/drivers/net/can/spi/hi311x.c
+++ b/drivers/net/can/spi/hi311x.c
@@ -725,7 +725,7 @@ static irqreturn_t hi3110_can_ist(int irq, void *dev_id)
 			net->stats.tx_bytes += priv->tx_len - 1;
 			can_led_event(net, CAN_LED_EVENT_TX);
 			if (priv->tx_len) {
-				can_get_echo_skb(net, 0);
+				can_get_echo_skb(net, 0, NULL);
 				priv->tx_len = 0;
 			}
 			netif_wake_queue(net);
diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c
index 40866754aafc..f69fb4238a65 100644
--- a/drivers/net/can/spi/mcp251x.c
+++ b/drivers/net/can/spi/mcp251x.c
@@ -1171,7 +1171,7 @@ static irqreturn_t mcp251x_can_ist(int irq, void *dev_id)
 			net->stats.tx_bytes += priv->tx_len - 1;
 			can_led_event(net, CAN_LED_EVENT_TX);
 			if (priv->tx_len) {
-				can_get_echo_skb(net, 0);
+				can_get_echo_skb(net, 0, NULL);
 				priv->tx_len = 0;
 			}
 			netif_wake_queue(net);
diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c
index b75175d59104..54aa7c25c4de 100644
--- a/drivers/net/can/sun4i_can.c
+++ b/drivers/net/can/sun4i_can.c
@@ -655,7 +655,7 @@ static irqreturn_t sun4i_can_interrupt(int irq, void *dev_id)
 			    readl(priv->base +
 				  SUN4I_REG_RBUF_RBACK_START_ADDR) & 0xf;
 			stats->tx_packets++;
-			can_get_echo_skb(dev, 0);
+			can_get_echo_skb(dev, 0, NULL);
 			netif_wake_queue(dev);
 			can_led_event(dev, CAN_LED_EVENT_TX);
 		}
diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c
index 5e5330060464..18f40eb20360 100644
--- a/drivers/net/can/usb/ems_usb.c
+++ b/drivers/net/can/usb/ems_usb.c
@@ -518,7 +518,7 @@ static void ems_usb_write_bulk_callback(struct urb *urb)
 	netdev->stats.tx_packets++;
 	netdev->stats.tx_bytes += context->dlc;
 
-	can_get_echo_skb(netdev, context->echo_index);
+	can_get_echo_skb(netdev, context->echo_index, NULL);
 
 	/* Release context */
 	context->echo_index = MAX_TX_URBS;
diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c
index 68d8a85f00c4..562acbf454fd 100644
--- a/drivers/net/can/usb/esd_usb2.c
+++ b/drivers/net/can/usb/esd_usb2.c
@@ -357,7 +357,7 @@ static void esd_usb2_tx_done_msg(struct esd_usb2_net_priv *priv,
 	if (!msg->msg.txdone.status) {
 		stats->tx_packets++;
 		stats->tx_bytes += context->len;
-		can_get_echo_skb(netdev, context->echo_index);
+		can_get_echo_skb(netdev, context->echo_index, NULL);
 	} else {
 		stats->tx_errors++;
 		can_free_echo_skb(netdev, context->echo_index);
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index 5ce9ba5d29d6..a00dc1904415 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -370,7 +370,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb)
 			goto resubmit_urb;
 		}
 
-		can_get_echo_skb(netdev, hf->echo_id);
+		can_get_echo_skb(netdev, hf->echo_id, NULL);
 
 		gs_free_tx_context(txc);
 
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c
index 480bd2ecb296..dcee8dc828ec 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_hydra.c
@@ -1151,7 +1151,7 @@ static void kvaser_usb_hydra_tx_acknowledge(const struct kvaser_usb *dev,
 
 	spin_lock_irqsave(&priv->tx_contexts_lock, irq_flags);
 
-	can_get_echo_skb(priv->netdev, context->echo_index);
+	can_get_echo_skb(priv->netdev, context->echo_index, NULL);
 	context->echo_index = dev->max_tx_urbs;
 	--priv->active_tx_contexts;
 	netif_wake_queue(priv->netdev);
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
index 98c016ef0607..59ba7c7beec0 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
@@ -594,7 +594,7 @@ static void kvaser_usb_leaf_tx_acknowledge(const struct kvaser_usb *dev,
 
 	spin_lock_irqsave(&priv->tx_contexts_lock, flags);
 
-	can_get_echo_skb(priv->netdev, context->echo_index);
+	can_get_echo_skb(priv->netdev, context->echo_index, NULL);
 	context->echo_index = dev->max_tx_urbs;
 	--priv->active_tx_contexts;
 	netif_wake_queue(priv->netdev);
diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c
index 5347c89992ce..4232a7126c1b 100644
--- a/drivers/net/can/usb/mcba_usb.c
+++ b/drivers/net/can/usb/mcba_usb.c
@@ -237,7 +237,7 @@ static void mcba_usb_write_bulk_callback(struct urb *urb)
 		netdev->stats.tx_bytes += ctx->dlc;
 
 		can_led_event(netdev, CAN_LED_EVENT_TX);
-		can_get_echo_skb(netdev, ctx->ndx);
+		can_get_echo_skb(netdev, ctx->ndx, NULL);
 	}
 
 	if (urb->status)
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
index 95672750419a..573b11559d73 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
@@ -309,7 +309,7 @@ static void peak_usb_write_bulk_callback(struct urb *urb)
 	}
 
 	/* should always release echo skb and corresponding context */
-	can_get_echo_skb(netdev, context->echo_index);
+	can_get_echo_skb(netdev, context->echo_index, NULL);
 	context->echo_index = PCAN_USB_MAX_TX_URBS;
 
 	/* do wakeup tx queue in case of success only */
diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c
index 5add27614e2b..fa403c080871 100644
--- a/drivers/net/can/usb/ucan.c
+++ b/drivers/net/can/usb/ucan.c
@@ -672,7 +672,7 @@ static void ucan_tx_complete_msg(struct ucan_priv *up,
 			/* update statistics */
 			up->netdev->stats.tx_packets++;
 			up->netdev->stats.tx_bytes += dlc;
-			can_get_echo_skb(up->netdev, echo_index);
+			can_get_echo_skb(up->netdev, echo_index, NULL);
 		} else {
 			up->netdev->stats.tx_dropped++;
 			can_free_echo_skb(up->netdev, echo_index);
diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c
index 2e824d9d8167..e8c42430a4fc 100644
--- a/drivers/net/can/usb/usb_8dev.c
+++ b/drivers/net/can/usb/usb_8dev.c
@@ -585,7 +585,7 @@ static void usb_8dev_write_bulk_callback(struct urb *urb)
 	netdev->stats.tx_packets++;
 	netdev->stats.tx_bytes += context->dlc;
 
-	can_get_echo_skb(netdev, context->echo_index);
+	can_get_echo_skb(netdev, context->echo_index, NULL);
 
 	can_led_event(netdev, CAN_LED_EVENT_TX);
 
diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c
index 8d5132a3f2c9..37fa19c62d73 100644
--- a/drivers/net/can/xilinx_can.c
+++ b/drivers/net/can/xilinx_can.c
@@ -1292,7 +1292,7 @@ static void xcan_tx_interrupt(struct net_device *ndev, u32 isr)
 
 	while (frames_sent--) {
 		stats->tx_bytes += can_get_echo_skb(ndev, priv->tx_tail %
-						    priv->tx_max);
+						    priv->tx_max, NULL);
 		priv->tx_tail++;
 		stats->tx_packets++;
 	}
diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index eaac4a637ae0..685f34cfba20 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -21,7 +21,8 @@ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
 		     unsigned int idx, unsigned int frame_len);
 struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx,
 				   u8 *len_ptr, unsigned int *frame_len_ptr);
-unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx);
+unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx,
+			      unsigned int *frame_len_ptr);
 void can_free_echo_skb(struct net_device *dev, unsigned int idx);
 struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf);
 struct sk_buff *alloc_canfd_skb(struct net_device *dev,
-- 
cgit v1.2.3


From 99842c9685ab00fa8689e8bd12bde62b706b6198 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 11 Jan 2021 15:19:29 +0100
Subject: can: dev: can_rx_offload_get_echo_skb(): extend to return can frame
 length

In order to implement byte queue limits (bql) in CAN drivers, the length of the
CAN frame needs to be passed into the networking stack after queueing and after
transmission completion.

To avoid to calculate this length twice, extend can_rx_offload_get_echo_skb()
to return that value. Convert all users of this function, too.

Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/r/20210111141930.693847-15-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/rx-offload.c               | 5 +++--
 drivers/net/can/flexcan.c                      | 5 +++--
 drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 2 +-
 drivers/net/can/ti_hecc.c                      | 2 +-
 include/linux/can/rx-offload.h                 | 3 ++-
 5 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/rx-offload.c b/drivers/net/can/dev/rx-offload.c
index 6a26b5282df1..ab2c1543786c 100644
--- a/drivers/net/can/dev/rx-offload.c
+++ b/drivers/net/can/dev/rx-offload.c
@@ -263,7 +263,8 @@ int can_rx_offload_queue_sorted(struct can_rx_offload *offload,
 EXPORT_SYMBOL_GPL(can_rx_offload_queue_sorted);
 
 unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload,
-					 unsigned int idx, u32 timestamp)
+					 unsigned int idx, u32 timestamp,
+					 unsigned int *frame_len_ptr)
 {
 	struct net_device *dev = offload->dev;
 	struct net_device_stats *stats = &dev->stats;
@@ -271,7 +272,7 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload,
 	u8 len;
 	int err;
 
-	skb = __can_get_echo_skb(dev, idx, &len, NULL);
+	skb = __can_get_echo_skb(dev, idx, &len, frame_len_ptr);
 	if (!skb)
 		return 0;
 
diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index 202d08f8e1a4..5d9157c655e9 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -1122,8 +1122,9 @@ static irqreturn_t flexcan_irq(int irq, void *dev_id)
 		u32 reg_ctrl = priv->read(&priv->tx_mb->can_ctrl);
 
 		handled = IRQ_HANDLED;
-		stats->tx_bytes += can_rx_offload_get_echo_skb(&priv->offload,
-							       0, reg_ctrl << 16);
+		stats->tx_bytes +=
+			can_rx_offload_get_echo_skb(&priv->offload, 0,
+						    reg_ctrl << 16, NULL);
 		stats->tx_packets++;
 		can_led_event(dev, CAN_LED_EVENT_TX);
 
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
index 95bba456a4cd..63bbe0930e53 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
@@ -1271,7 +1271,7 @@ mcp251xfd_handle_tefif_one(struct mcp251xfd_priv *priv,
 	stats->tx_bytes +=
 		can_rx_offload_get_echo_skb(&priv->offload,
 					    mcp251xfd_get_tef_tail(priv),
-					    hw_tef_obj->ts);
+					    hw_tef_obj->ts, NULL);
 	stats->tx_packets++;
 	priv->tef->tail++;
 
diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
index 485c19bc98c2..73245d8836a9 100644
--- a/drivers/net/can/ti_hecc.c
+++ b/drivers/net/can/ti_hecc.c
@@ -757,7 +757,7 @@ static irqreturn_t ti_hecc_interrupt(int irq, void *dev_id)
 			stamp = hecc_read_stamp(priv, mbxno);
 			stats->tx_bytes +=
 				can_rx_offload_get_echo_skb(&priv->offload,
-							    mbxno, stamp);
+							    mbxno, stamp, NULL);
 			stats->tx_packets++;
 			can_led_event(ndev, CAN_LED_EVENT_TX);
 			--priv->tx_tail;
diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h
index f1b38088b765..40882df7105e 100644
--- a/include/linux/can/rx-offload.h
+++ b/include/linux/can/rx-offload.h
@@ -44,7 +44,8 @@ int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload);
 int can_rx_offload_queue_sorted(struct can_rx_offload *offload,
 				struct sk_buff *skb, u32 timestamp);
 unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload,
-					 unsigned int idx, u32 timestamp);
+					 unsigned int idx, u32 timestamp,
+					 unsigned int *frame_len_ptr);
 int can_rx_offload_queue_tail(struct can_rx_offload *offload,
 			      struct sk_buff *skb);
 void can_rx_offload_del(struct can_rx_offload *offload);
-- 
cgit v1.2.3


From a5418be9dffe70ccbb0b4bd5ea3881c81927e965 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 8 Dec 2020 09:46:56 +0530
Subject: sched/core: Rename schedutil_cpu_util() and allow rest of the kernel
 to use it

There is nothing schedutil specific in schedutil_cpu_util(), rename it
to effective_cpu_util(). Also create and expose another wrapper
sched_cpu_util() which can be used by other parts of the kernel, like
thermal core (that will be done in a later commit).

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lkml.kernel.org/r/db011961fb3bb8bef1c0eda5cd64564637d3ef31.1607400596.git.viresh.kumar@linaro.org
---
 include/linux/sched.h            |  5 +++++
 kernel/sched/core.c              | 10 ++++++++--
 kernel/sched/cpufreq_schedutil.c |  2 +-
 kernel/sched/fair.c              |  6 +++---
 kernel/sched/sched.h             | 10 +++++-----
 5 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e3a5eeec509..31169e70476a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1968,6 +1968,11 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+#ifdef CONFIG_SMP
+/* Returns effective CPU energy utilization, as seen by the scheduler */
+unsigned long sched_cpu_util(int cpu, unsigned long max);
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_RSEQ
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d89d682d5337..4fe4cbf0bf08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5683,8 +5683,8 @@ struct task_struct *idle_task(int cpu)
  * based on the task model parameters and gives the minimal utilization
  * required to meet deadlines.
  */
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
-				 unsigned long max, enum schedutil_type type,
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+				 unsigned long max, enum cpu_util_type type,
 				 struct task_struct *p)
 {
 	unsigned long dl_util, util, irq;
@@ -5768,6 +5768,12 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
 
 	return min(max, util);
 }
+
+unsigned long sched_cpu_util(int cpu, unsigned long max)
+{
+	return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+				  ENERGY_UTIL, NULL);
+}
 #endif /* CONFIG_SMP */
 
 /**
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1dfa69246485..41e498b0008a 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -178,7 +178,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
 
 	sg_cpu->max = max;
 	sg_cpu->bw_dl = cpu_bw_dl(rq);
-	sg_cpu->util = schedutil_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
+	sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
 					  FREQUENCY_UTIL, NULL);
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04a3ce20da67..39c5bda90bd4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6543,7 +6543,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 		 * is already enough to scale the EM reported power
 		 * consumption at the (eventually clamped) cpu_capacity.
 		 */
-		sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+		sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap,
 					       ENERGY_UTIL, NULL);
 
 		/*
@@ -6553,7 +6553,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
 		 * NOTE: in case RT tasks are running, by default the
 		 * FREQUENCY_UTIL's utilization can be max OPP.
 		 */
-		cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+		cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap,
 					      FREQUENCY_UTIL, tsk);
 		max_util = max(max_util, cpu_util);
 	}
@@ -6651,7 +6651,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			 * IOW, placing the task there would make the CPU
 			 * overutilized. Take uclamp into account to see how
 			 * much capacity we can get out of the CPU; this is
-			 * aligned with schedutil_cpu_util().
+			 * aligned with sched_cpu_util().
 			 */
 			util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
 			if (!fits_capacity(util, cpu_cap))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 242d4c5a5efc..045b01064c1e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2559,22 +2559,22 @@ static inline unsigned long capacity_orig_of(int cpu)
 }
 
 /**
- * enum schedutil_type - CPU utilization type
+ * enum cpu_util_type - CPU utilization type
  * @FREQUENCY_UTIL:	Utilization used to select frequency
  * @ENERGY_UTIL:	Utilization used during energy calculation
  *
  * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
  * need to be aggregated differently depending on the usage made of them. This
- * enum is used within schedutil_freq_util() to differentiate the types of
+ * enum is used within effective_cpu_util() to differentiate the types of
  * utilization expected by the callers, and adjust the aggregation accordingly.
  */
-enum schedutil_type {
+enum cpu_util_type {
 	FREQUENCY_UTIL,
 	ENERGY_UTIL,
 };
 
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
-				 unsigned long max, enum schedutil_type type,
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+				 unsigned long max, enum cpu_util_type type,
 				 struct task_struct *p);
 
 static inline unsigned long cpu_bw_dl(struct rq *rq)
-- 
cgit v1.2.3


From dfd5e3f5fe27bda91d5cc028c86ffbb7a0614489 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 9 Dec 2020 16:06:21 +0100
Subject: locking/lockdep: Mark local_lock_t

The local_lock_t's are special, because they cannot form IRQ
inversions, make sure we can tell them apart from the rest of the
locks.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/local_lock_internal.h |  5 ++++-
 include/linux/lockdep.h             | 15 ++++++++++++---
 include/linux/lockdep_types.h       | 18 ++++++++++++++----
 kernel/locking/lockdep.c            | 16 +++++++++-------
 4 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index 4a8795b21d77..ded90b097e6e 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -18,6 +18,7 @@ typedef struct {
 	.dep_map = {					\
 		.name = #lockname,			\
 		.wait_type_inner = LD_WAIT_CONFIG,	\
+		.lock_type = LD_LOCK_PERCPU,			\
 	}
 #else
 # define LL_DEP_MAP_INIT(lockname)
@@ -30,7 +31,9 @@ do {								\
 	static struct lock_class_key __key;			\
 								\
 	debug_check_no_locks_freed((void *)lock, sizeof(*lock));\
-	lockdep_init_map_wait(&(lock)->dep_map, #lock, &__key, 0, LD_WAIT_CONFIG);\
+	lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, 0, \
+			      LD_WAIT_CONFIG, LD_WAIT_INV,	\
+			      LD_LOCK_PERCPU);			\
 } while (0)
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index b9e9adec73e8..7b7ebf2e28ec 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -185,12 +185,19 @@ extern void lockdep_unregister_key(struct lock_class_key *key);
  * to lockdep:
  */
 
-extern void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
-	struct lock_class_key *key, int subclass, short inner, short outer);
+extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
+	struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type);
+
+static inline void
+lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
+		       struct lock_class_key *key, int subclass, u8 inner, u8 outer)
+{
+	lockdep_init_map_type(lock, name, key, subclass, inner, LD_WAIT_INV, LD_LOCK_NORMAL);
+}
 
 static inline void
 lockdep_init_map_wait(struct lockdep_map *lock, const char *name,
-		      struct lock_class_key *key, int subclass, short inner)
+		      struct lock_class_key *key, int subclass, u8 inner)
 {
 	lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV);
 }
@@ -340,6 +347,8 @@ static inline void lockdep_set_selftest_task(struct task_struct *task)
 # define lock_set_class(l, n, k, s, i)		do { } while (0)
 # define lock_set_subclass(l, s, i)		do { } while (0)
 # define lockdep_init()				do { } while (0)
+# define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \
+		do { (void)(name); (void)(key); } while (0)
 # define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \
 		do { (void)(name); (void)(key); } while (0)
 # define lockdep_init_map_wait(lock, name, key, sub, inner) \
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 9a1fd49df17f..2ec9ff5a7fff 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -30,6 +30,12 @@ enum lockdep_wait_type {
 	LD_WAIT_MAX,		/* must be last */
 };
 
+enum lockdep_lock_type {
+	LD_LOCK_NORMAL = 0,	/* normal, catch all */
+	LD_LOCK_PERCPU,		/* percpu */
+	LD_LOCK_MAX,
+};
+
 #ifdef CONFIG_LOCKDEP
 
 /*
@@ -119,8 +125,10 @@ struct lock_class {
 	int				name_version;
 	const char			*name;
 
-	short				wait_type_inner;
-	short				wait_type_outer;
+	u8				wait_type_inner;
+	u8				wait_type_outer;
+	u8				lock_type;
+	/* u8				hole; */
 
 #ifdef CONFIG_LOCK_STAT
 	unsigned long			contention_point[LOCKSTAT_POINTS];
@@ -169,8 +177,10 @@ struct lockdep_map {
 	struct lock_class_key		*key;
 	struct lock_class		*class_cache[NR_LOCKDEP_CACHING_CLASSES];
 	const char			*name;
-	short				wait_type_outer; /* can be taken in this context */
-	short				wait_type_inner; /* presents this context */
+	u8				wait_type_outer; /* can be taken in this context */
+	u8				wait_type_inner; /* presents this context */
+	u8				lock_type;
+	/* u8				hole; */
 #ifdef CONFIG_LOCK_STAT
 	int				cpu;
 	unsigned long			ip;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c1418b47f625..b061e2991700 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1290,6 +1290,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	class->name_version = count_matching_names(class);
 	class->wait_type_inner = lock->wait_type_inner;
 	class->wait_type_outer = lock->wait_type_outer;
+	class->lock_type = lock->lock_type;
 	/*
 	 * We use RCU's safe list-add method to make
 	 * parallel walking of the hash-list safe:
@@ -4503,9 +4504,9 @@ print_lock_invalid_wait_context(struct task_struct *curr,
  */
 static int check_wait_context(struct task_struct *curr, struct held_lock *next)
 {
-	short next_inner = hlock_class(next)->wait_type_inner;
-	short next_outer = hlock_class(next)->wait_type_outer;
-	short curr_inner;
+	u8 next_inner = hlock_class(next)->wait_type_inner;
+	u8 next_outer = hlock_class(next)->wait_type_outer;
+	u8 curr_inner;
 	int depth;
 
 	if (!curr->lockdep_depth || !next_inner || next->trylock)
@@ -4528,7 +4529,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
 
 	for (; depth < curr->lockdep_depth; depth++) {
 		struct held_lock *prev = curr->held_locks + depth;
-		short prev_inner = hlock_class(prev)->wait_type_inner;
+		u8 prev_inner = hlock_class(prev)->wait_type_inner;
 
 		if (prev_inner) {
 			/*
@@ -4577,9 +4578,9 @@ static inline int check_wait_context(struct task_struct *curr,
 /*
  * Initialize a lock instance's lock-class mapping info:
  */
-void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
+void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
 			    struct lock_class_key *key, int subclass,
-			    short inner, short outer)
+			    u8 inner, u8 outer, u8 lock_type)
 {
 	int i;
 
@@ -4602,6 +4603,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
 
 	lock->wait_type_outer = outer;
 	lock->wait_type_inner = inner;
+	lock->lock_type = lock_type;
 
 	/*
 	 * No key, no joy, we need to hash something.
@@ -4636,7 +4638,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
 		raw_local_irq_restore(flags);
 	}
 }
-EXPORT_SYMBOL_GPL(lockdep_init_map_waits);
+EXPORT_SYMBOL_GPL(lockdep_init_map_type);
 
 struct lock_class_key __lockdep_no_validate__;
 EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
-- 
cgit v1.2.3


From b0f87e8ac263e3b82ec314542cb7fb07a47fc8b7 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Tue, 12 Jan 2021 18:00:58 +0900
Subject: mfd: rohm-generic: Add BD9571 and BD9574

Add chip IDs for BD9571MWV and BD9574MWF.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Acked-for-MFD-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/rohm-generic.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
index 4283b5b33e04..affacf8c6e91 100644
--- a/include/linux/mfd/rohm-generic.h
+++ b/include/linux/mfd/rohm-generic.h
@@ -12,6 +12,8 @@ enum rohm_chip_type {
 	ROHM_CHIP_TYPE_BD71847,
 	ROHM_CHIP_TYPE_BD70528,
 	ROHM_CHIP_TYPE_BD71828,
+	ROHM_CHIP_TYPE_BD9571,
+	ROHM_CHIP_TYPE_BD9574,
 	ROHM_CHIP_TYPE_AMOUNT
 };
 
-- 
cgit v1.2.3


From bfb26be7fe90186e5d9fe704cc124ab77bb7d127 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Tue, 12 Jan 2021 18:01:04 +0900
Subject: mfd: bd9571mwv: Use the SPDX license identifier

Use the SPDX license identifier instead of a local description.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-for-MFD-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/bd9571mwv.c       | 10 +---------
 include/linux/mfd/bd9571mwv.h | 10 +---------
 2 files changed, 2 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/bd9571mwv.c b/drivers/mfd/bd9571mwv.c
index 19d57a45134c..e68c3fabf9fc 100644
--- a/drivers/mfd/bd9571mwv.c
+++ b/drivers/mfd/bd9571mwv.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * ROHM BD9571MWV-M MFD driver
  *
  * Copyright (C) 2017 Marek Vasut <marek.vasut+renesas@gmail.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65086 driver
  */
 
diff --git a/include/linux/mfd/bd9571mwv.h b/include/linux/mfd/bd9571mwv.h
index eb05569f752b..bcc709293b24 100644
--- a/include/linux/mfd/bd9571mwv.h
+++ b/include/linux/mfd/bd9571mwv.h
@@ -1,17 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * ROHM BD9571MWV-M driver
  *
  * Copyright (C) 2017 Marek Vasut <marek.vasut+renesas@gmail.com>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed "as is" WITHOUT ANY WARRANTY of any
- * kind, whether expressed or implied; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License version 2 for more details.
- *
  * Based on the TPS65086 driver
  */
 
-- 
cgit v1.2.3


From f16e1fd197f85a943b5880009f4aefe05a17df0d Mon Sep 17 00:00:00 2001
From: Khiem Nguyen <khiem.nguyen.xt@renesas.com>
Date: Tue, 12 Jan 2021 18:01:06 +0900
Subject: mfd: bd9571mwv: Make the driver more generic

Since the driver supports BD9571MWV PMIC only, this patch makes
the functions and data structure become more generic so that
it can support other PMIC variants as well. Also remove printing
part name which Lee Jones suggested.

Signed-off-by: Khiem Nguyen <khiem.nguyen.xt@renesas.com>
Co-developed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Acked-for-MFD-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/bd9571mwv.c       | 77 +++++++++++++++++++++++--------------------
 include/linux/mfd/bd9571mwv.h | 18 ++--------
 2 files changed, 43 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/bd9571mwv.c b/drivers/mfd/bd9571mwv.c
index 49e968e20ddc..2c1fcbb6ff2b 100644
--- a/drivers/mfd/bd9571mwv.c
+++ b/drivers/mfd/bd9571mwv.c
@@ -3,6 +3,7 @@
  * ROHM BD9571MWV-M MFD driver
  *
  * Copyright (C) 2017 Marek Vasut <marek.vasut+renesas@gmail.com>
+ * Copyright (C) 2020 Renesas Electronics Corporation
  *
  * Based on the TPS65086 driver
  */
@@ -102,13 +103,12 @@ static struct regmap_irq_chip bd9571mwv_irq_chip = {
 	.num_irqs	= ARRAY_SIZE(bd9571mwv_irqs),
 };
 
-static int bd9571mwv_identify(struct bd9571mwv *bd)
+static int bd957x_identify(struct device *dev, struct regmap *regmap)
 {
-	struct device *dev = bd->dev;
 	unsigned int value;
 	int ret;
 
-	ret = regmap_read(bd->regmap, BD9571MWV_VENDOR_CODE, &value);
+	ret = regmap_read(regmap, BD9571MWV_VENDOR_CODE, &value);
 	if (ret) {
 		dev_err(dev, "Failed to read vendor code register (ret=%i)\n",
 			ret);
@@ -121,66 +121,71 @@ static int bd9571mwv_identify(struct bd9571mwv *bd)
 		return -EINVAL;
 	}
 
-	ret = regmap_read(bd->regmap, BD9571MWV_PRODUCT_CODE, &value);
+	ret = regmap_read(regmap, BD9571MWV_PRODUCT_CODE, &value);
 	if (ret) {
 		dev_err(dev, "Failed to read product code register (ret=%i)\n",
 			ret);
 		return ret;
 	}
-
-	if (value != BD9571MWV_PRODUCT_CODE_VAL) {
-		dev_err(dev, "Invalid product code ID %02x (expected %02x)\n",
-			value, BD9571MWV_PRODUCT_CODE_VAL);
-		return -EINVAL;
-	}
-
-	ret = regmap_read(bd->regmap, BD9571MWV_PRODUCT_REVISION, &value);
+	ret = regmap_read(regmap, BD9571MWV_PRODUCT_REVISION, &value);
 	if (ret) {
 		dev_err(dev, "Failed to read revision register (ret=%i)\n",
 			ret);
 		return ret;
 	}
 
-	dev_info(dev, "Device: BD9571MWV rev. %d\n", value & 0xff);
-
 	return 0;
 }
 
 static int bd9571mwv_probe(struct i2c_client *client,
-			  const struct i2c_device_id *ids)
+			   const struct i2c_device_id *ids)
 {
-	struct bd9571mwv *bd;
-	int ret;
-
-	bd = devm_kzalloc(&client->dev, sizeof(*bd), GFP_KERNEL);
-	if (!bd)
-		return -ENOMEM;
+	const struct regmap_config *regmap_config;
+	const struct regmap_irq_chip *irq_chip;
+	const struct mfd_cell *cells;
+	struct device *dev = &client->dev;
+	struct regmap *regmap;
+	struct regmap_irq_chip_data *irq_data;
+	int ret, num_cells, irq = client->irq;
+
+	/* Read the PMIC product code */
+	ret = i2c_smbus_read_byte_data(client, BD9571MWV_PRODUCT_CODE);
+	if (ret < 0) {
+		dev_err(dev, "Failed to read product code\n");
+		return ret;
+	}
 
-	i2c_set_clientdata(client, bd);
-	bd->dev = &client->dev;
-	bd->irq = client->irq;
+	switch (ret) {
+	case BD9571MWV_PRODUCT_CODE_BD9571MWV:
+		regmap_config = &bd9571mwv_regmap_config;
+		irq_chip = &bd9571mwv_irq_chip;
+		cells = bd9571mwv_cells;
+		num_cells = ARRAY_SIZE(bd9571mwv_cells);
+		break;
+	default:
+		dev_err(dev, "Unsupported device 0x%x\n", ret);
+		return -ENODEV;
+	}
 
-	bd->regmap = devm_regmap_init_i2c(client, &bd9571mwv_regmap_config);
-	if (IS_ERR(bd->regmap)) {
-		dev_err(bd->dev, "Failed to initialize register map\n");
-		return PTR_ERR(bd->regmap);
+	regmap = devm_regmap_init_i2c(client, regmap_config);
+	if (IS_ERR(regmap)) {
+		dev_err(dev, "Failed to initialize register map\n");
+		return PTR_ERR(regmap);
 	}
 
-	ret = bd9571mwv_identify(bd);
+	ret = bd957x_identify(dev, regmap);
 	if (ret)
 		return ret;
 
-	ret = devm_regmap_add_irq_chip(bd->dev, bd->regmap, bd->irq,
-				       IRQF_ONESHOT, 0, &bd9571mwv_irq_chip,
-				       &bd->irq_data);
+	ret = devm_regmap_add_irq_chip(dev, regmap, irq, IRQF_ONESHOT, 0,
+				       irq_chip, &irq_data);
 	if (ret) {
-		dev_err(bd->dev, "Failed to register IRQ chip\n");
+		dev_err(dev, "Failed to register IRQ chip\n");
 		return ret;
 	}
 
-	return devm_mfd_add_devices(bd->dev, PLATFORM_DEVID_AUTO,
-				    bd9571mwv_cells, ARRAY_SIZE(bd9571mwv_cells),
-				    NULL, 0, regmap_irq_get_domain(bd->irq_data));
+	return devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, cells, num_cells,
+				    NULL, 0, regmap_irq_get_domain(irq_data));
 }
 
 static const struct of_device_id bd9571mwv_of_match_table[] = {
diff --git a/include/linux/mfd/bd9571mwv.h b/include/linux/mfd/bd9571mwv.h
index bcc709293b24..e1716eca6bb5 100644
--- a/include/linux/mfd/bd9571mwv.h
+++ b/include/linux/mfd/bd9571mwv.h
@@ -3,6 +3,7 @@
  * ROHM BD9571MWV-M driver
  *
  * Copyright (C) 2017 Marek Vasut <marek.vasut+renesas@gmail.com>
+ * Copyright (C) 2020 Renesas Electronics Corporation
  *
  * Based on the TPS65086 driver
  */
@@ -17,7 +18,7 @@
 #define BD9571MWV_VENDOR_CODE			0x00
 #define BD9571MWV_VENDOR_CODE_VAL		0xdb
 #define BD9571MWV_PRODUCT_CODE			0x01
-#define BD9571MWV_PRODUCT_CODE_VAL		0x60
+#define BD9571MWV_PRODUCT_CODE_BD9571MWV	0x60
 #define BD9571MWV_PRODUCT_REVISION		0x02
 
 #define BD9571MWV_I2C_FUSA_MODE			0x10
@@ -94,19 +95,4 @@ enum bd9571mwv_irqs {
 	BD9571MWV_IRQ_WDT_OF,
 	BD9571MWV_IRQ_BKUP_TRG,
 };
-
-/**
- * struct bd9571mwv - state holder for the bd9571mwv driver
- *
- * Device data may be used to access the BD9571MWV chip
- */
-struct bd9571mwv {
-	struct device *dev;
-	struct regmap *regmap;
-
-	/* IRQ Data */
-	int irq;
-	struct regmap_irq_chip_data *irq_data;
-};
-
 #endif /* __LINUX_MFD_BD9571MWV_H */
-- 
cgit v1.2.3


From b2548da647bb04737196ffd945505d47a166239b Mon Sep 17 00:00:00 2001
From: Khiem Nguyen <khiem.nguyen.xt@renesas.com>
Date: Tue, 12 Jan 2021 18:01:07 +0900
Subject: mfd: bd9571mwv: Add support for BD9574MWF

The new PMIC BD9574MWF inherits features from BD9571MWV.
Add the support of new PMIC to existing bd9571mwv driver.

Signed-off-by: Khiem Nguyen <khiem.nguyen.xt@renesas.com>
Co-developed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Reviewed-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Acked-for-MFD-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/bd9571mwv.c       | 76 ++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mfd/bd9571mwv.h | 17 ++++++++--
 2 files changed, 89 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/bd9571mwv.c b/drivers/mfd/bd9571mwv.c
index 2c1fcbb6ff2b..e15b1acfb063 100644
--- a/drivers/mfd/bd9571mwv.c
+++ b/drivers/mfd/bd9571mwv.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * ROHM BD9571MWV-M MFD driver
+ * ROHM BD9571MWV-M and BD9574MVF-M core driver
  *
  * Copyright (C) 2017 Marek Vasut <marek.vasut+renesas@gmail.com>
  * Copyright (C) 2020 Renesas Electronics Corporation
@@ -11,6 +11,7 @@
 #include <linux/i2c.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/core.h>
+#include <linux/mfd/rohm-generic.h>
 #include <linux/module.h>
 
 #include <linux/mfd/bd9571mwv.h>
@@ -103,6 +104,72 @@ static struct regmap_irq_chip bd9571mwv_irq_chip = {
 	.num_irqs	= ARRAY_SIZE(bd9571mwv_irqs),
 };
 
+static const struct mfd_cell bd9574mwf_cells[] = {
+	{ .name = "bd9574mwf-regulator", },
+	{ .name = "bd9574mwf-gpio", },
+};
+
+static const struct regmap_range bd9574mwf_readable_yes_ranges[] = {
+	regmap_reg_range(BD9571MWV_VENDOR_CODE, BD9571MWV_PRODUCT_REVISION),
+	regmap_reg_range(BD9571MWV_BKUP_MODE_CNT, BD9571MWV_BKUP_MODE_CNT),
+	regmap_reg_range(BD9571MWV_DVFS_VINIT, BD9571MWV_DVFS_SETVMAX),
+	regmap_reg_range(BD9571MWV_DVFS_SETVID, BD9571MWV_DVFS_MONIVDAC),
+	regmap_reg_range(BD9571MWV_GPIO_IN, BD9571MWV_GPIO_IN),
+	regmap_reg_range(BD9571MWV_GPIO_INT, BD9571MWV_GPIO_INTMASK),
+	regmap_reg_range(BD9571MWV_INT_INTREQ, BD9571MWV_INT_INTMASK),
+};
+
+static const struct regmap_access_table bd9574mwf_readable_table = {
+	.yes_ranges	= bd9574mwf_readable_yes_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(bd9574mwf_readable_yes_ranges),
+};
+
+static const struct regmap_range bd9574mwf_writable_yes_ranges[] = {
+	regmap_reg_range(BD9571MWV_BKUP_MODE_CNT, BD9571MWV_BKUP_MODE_CNT),
+	regmap_reg_range(BD9571MWV_DVFS_SETVID, BD9571MWV_DVFS_SETVID),
+	regmap_reg_range(BD9571MWV_GPIO_DIR, BD9571MWV_GPIO_OUT),
+	regmap_reg_range(BD9571MWV_GPIO_INT_SET, BD9571MWV_GPIO_INTMASK),
+	regmap_reg_range(BD9571MWV_INT_INTREQ, BD9571MWV_INT_INTMASK),
+};
+
+static const struct regmap_access_table bd9574mwf_writable_table = {
+	.yes_ranges	= bd9574mwf_writable_yes_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(bd9574mwf_writable_yes_ranges),
+};
+
+static const struct regmap_range bd9574mwf_volatile_yes_ranges[] = {
+	regmap_reg_range(BD9571MWV_DVFS_MONIVDAC, BD9571MWV_DVFS_MONIVDAC),
+	regmap_reg_range(BD9571MWV_GPIO_IN, BD9571MWV_GPIO_IN),
+	regmap_reg_range(BD9571MWV_GPIO_INT, BD9571MWV_GPIO_INT),
+	regmap_reg_range(BD9571MWV_INT_INTREQ, BD9571MWV_INT_INTREQ),
+};
+
+static const struct regmap_access_table bd9574mwf_volatile_table = {
+	.yes_ranges	= bd9574mwf_volatile_yes_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(bd9574mwf_volatile_yes_ranges),
+};
+
+static const struct regmap_config bd9574mwf_regmap_config = {
+	.reg_bits	= 8,
+	.val_bits	= 8,
+	.cache_type	= REGCACHE_RBTREE,
+	.rd_table	= &bd9574mwf_readable_table,
+	.wr_table	= &bd9574mwf_writable_table,
+	.volatile_table	= &bd9574mwf_volatile_table,
+	.max_register	= 0xff,
+};
+
+static struct regmap_irq_chip bd9574mwf_irq_chip = {
+	.name		= "bd9574mwf",
+	.status_base	= BD9571MWV_INT_INTREQ,
+	.mask_base	= BD9571MWV_INT_INTMASK,
+	.ack_base	= BD9571MWV_INT_INTREQ,
+	.init_ack_masked = true,
+	.num_regs	= 1,
+	.irqs		= bd9571mwv_irqs,
+	.num_irqs	= ARRAY_SIZE(bd9571mwv_irqs),
+};
+
 static int bd957x_identify(struct device *dev, struct regmap *regmap)
 {
 	unsigned int value;
@@ -162,6 +229,12 @@ static int bd9571mwv_probe(struct i2c_client *client,
 		cells = bd9571mwv_cells;
 		num_cells = ARRAY_SIZE(bd9571mwv_cells);
 		break;
+	case BD9571MWV_PRODUCT_CODE_BD9574MWF:
+		regmap_config = &bd9574mwf_regmap_config;
+		irq_chip = &bd9574mwf_irq_chip;
+		cells = bd9574mwf_cells;
+		num_cells = ARRAY_SIZE(bd9574mwf_cells);
+		break;
 	default:
 		dev_err(dev, "Unsupported device 0x%x\n", ret);
 		return -ENODEV;
@@ -190,6 +263,7 @@ static int bd9571mwv_probe(struct i2c_client *client,
 
 static const struct of_device_id bd9571mwv_of_match_table[] = {
 	{ .compatible = "rohm,bd9571mwv", },
+	{ .compatible = "rohm,bd9574mwf", },
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, bd9571mwv_of_match_table);
diff --git a/include/linux/mfd/bd9571mwv.h b/include/linux/mfd/bd9571mwv.h
index e1716eca6bb5..8efd99d07c9e 100644
--- a/include/linux/mfd/bd9571mwv.h
+++ b/include/linux/mfd/bd9571mwv.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * ROHM BD9571MWV-M driver
+ * ROHM BD9571MWV-M and BD9574MWF-M driver
  *
  * Copyright (C) 2017 Marek Vasut <marek.vasut+renesas@gmail.com>
  * Copyright (C) 2020 Renesas Electronics Corporation
@@ -14,11 +14,12 @@
 #include <linux/device.h>
 #include <linux/regmap.h>
 
-/* List of registers for BD9571MWV */
+/* List of registers for BD9571MWV and BD9574MWF */
 #define BD9571MWV_VENDOR_CODE			0x00
 #define BD9571MWV_VENDOR_CODE_VAL		0xdb
 #define BD9571MWV_PRODUCT_CODE			0x01
 #define BD9571MWV_PRODUCT_CODE_BD9571MWV	0x60
+#define BD9571MWV_PRODUCT_CODE_BD9574MWF	0x74
 #define BD9571MWV_PRODUCT_REVISION		0x02
 
 #define BD9571MWV_I2C_FUSA_MODE			0x10
@@ -48,6 +49,7 @@
 #define BD9571MWV_VD33_VID			0x44
 
 #define BD9571MWV_DVFS_VINIT			0x50
+#define BD9574MWF_VD09_VINIT			0x51
 #define BD9571MWV_DVFS_SETVMAX			0x52
 #define BD9571MWV_DVFS_BOOSTVID			0x53
 #define BD9571MWV_DVFS_SETVID			0x54
@@ -61,6 +63,7 @@
 #define BD9571MWV_GPIO_INT_SET			0x64
 #define BD9571MWV_GPIO_INT			0x65
 #define BD9571MWV_GPIO_INTMASK			0x66
+#define BD9574MWF_GPIO_MUX			0x67
 
 #define BD9571MWV_REG_KEEP(n)			(0x70 + (n))
 
@@ -70,6 +73,8 @@
 #define BD9571MWV_PROT_ERROR_STATUS2		0x83
 #define BD9571MWV_PROT_ERROR_STATUS3		0x84
 #define BD9571MWV_PROT_ERROR_STATUS4		0x85
+#define BD9574MWF_PROT_ERROR_STATUS5		0x86
+#define BD9574MWF_SYSTEM_ERROR_STATUS		0x87
 
 #define BD9571MWV_INT_INTREQ			0x90
 #define BD9571MWV_INT_INTREQ_MD1_INT		BIT(0)
@@ -82,6 +87,12 @@
 #define BD9571MWV_INT_INTREQ_BKUP_TRG_INT	BIT(7)
 #define BD9571MWV_INT_INTMASK			0x91
 
+#define BD9574MWF_SSCG_CNT			0xA0
+#define BD9574MWF_POFFB_MRB			0xA1
+#define BD9574MWF_SMRB_WR_PROT			0xA2
+#define BD9574MWF_SMRB_ASSERT			0xA3
+#define BD9574MWF_SMRB_STATUS			0xA4
+
 #define BD9571MWV_ACCESS_KEY			0xff
 
 /* Define the BD9571MWV IRQ numbers */
@@ -91,7 +102,7 @@ enum bd9571mwv_irqs {
 	BD9571MWV_IRQ_MD2_E2,
 	BD9571MWV_IRQ_PROT_ERR,
 	BD9571MWV_IRQ_GP,
-	BD9571MWV_IRQ_128H_OF,
+	BD9571MWV_IRQ_128H_OF,	/* BKUP_HOLD on BD9574MWF */
 	BD9571MWV_IRQ_WDT_OF,
 	BD9571MWV_IRQ_BKUP_TRG,
 };
-- 
cgit v1.2.3


From 097d43d8570457c1af9b09fab15412697b177f35 Mon Sep 17 00:00:00 2001
From: Daeseok Youn <daeseok.youn@gmail.com>
Date: Thu, 14 Jan 2021 16:08:17 +0900
Subject: mm: memblock: remove return value of memblock_free_all()

No one checks the return value of memblock_free_all().
Make the return value void.

memblock_free_all() is used on mem_init() for each
architecture, and the total count of freed pages will be added
to _totalram_pages variable by calling totalram_pages_add().

so do not need to return total count of freed pages.

Signed-off-by: Daeseok Youn <daeseok.youn@gmail.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 include/linux/memblock.h | 2 +-
 mm/memblock.c            | 6 +-----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index b93c44b9121e..acf1e6a85066 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -117,7 +117,7 @@ int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
 int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
 
-unsigned long memblock_free_all(void);
+void memblock_free_all(void);
 void reset_node_managed_pages(pg_data_t *pgdat);
 void reset_all_zones_managed_pages(void);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index d24bcfa88d2f..daf06f2847ed 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2087,10 +2087,8 @@ void __init reset_all_zones_managed_pages(void)
 
 /**
  * memblock_free_all - release free pages to the buddy allocator
- *
- * Return: the number of pages actually released.
  */
-unsigned long __init memblock_free_all(void)
+void __init memblock_free_all(void)
 {
 	unsigned long pages;
 
@@ -2099,8 +2097,6 @@ unsigned long __init memblock_free_all(void)
 
 	pages = free_low_memory_core_early();
 	totalram_pages_add(pages);
-
-	return pages;
 }
 
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
-- 
cgit v1.2.3


From 5e6dca82bcaa49348f9e5fcb48df4881f6d6c4ae Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Tue, 12 Jan 2021 11:46:24 -0800
Subject: x86/entry: Emit a symbol for register restoring thunk

Arnd found a randconfig that produces the warning:

  arch/x86/entry/thunk_64.o: warning: objtool: missing symbol for insn at
  offset 0x3e

when building with LLVM_IAS=1 (Clang's integrated assembler). Josh
notes:

  With the LLVM assembler not generating section symbols, objtool has no
  way to reference this code when it generates ORC unwinder entries,
  because this code is outside of any ELF function.

  The limitation now being imposed by objtool is that all code must be
  contained in an ELF symbol.  And .L symbols don't create such symbols.

  So basically, you can use an .L symbol *inside* a function or a code
  segment, you just can't use the .L symbol to contain the code using a
  SYM_*_START/END annotation pair.

Fangrui notes that this optimization is helpful for reducing image size
when compiling with -ffunction-sections and -fdata-sections. I have
observed on the order of tens of thousands of symbols for the kernel
images built with those flags.

A patch has been authored against GNU binutils to match this behavior
of not generating unused section symbols ([1]), so this will
also become a problem for users of GNU binutils once they upgrade to 2.36.

Omit the .L prefix on a label so that the assembler will emit an entry
into the symbol table for the label, with STB_LOCAL binding. This
enables objtool to generate proper unwind info here with LLVM_IAS=1 or
GNU binutils 2.36+.

 [ bp: Massage commit message. ]

Reported-by: Arnd Bergmann <arnd@arndb.de>
Suggested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Suggested-by: Borislav Petkov <bp@alien8.de>
Suggested-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lkml.kernel.org/r/20210112194625.4181814-1-ndesaulniers@google.com
Link: https://github.com/ClangBuiltLinux/linux/issues/1209
Link: https://reviews.llvm.org/D93783
Link: https://sourceware.org/binutils/docs/as/Symbol-Names.html
Link: https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d1bcae833b32f1408485ce69f844dcd7ded093a8 [1]
---
 Documentation/asm-annotations.rst | 5 +++++
 arch/x86/entry/thunk_64.S         | 8 ++++----
 include/linux/linkage.h           | 5 +++++
 3 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/asm-annotations.rst b/Documentation/asm-annotations.rst
index 32ea57483378..76424e0431f4 100644
--- a/Documentation/asm-annotations.rst
+++ b/Documentation/asm-annotations.rst
@@ -100,6 +100,11 @@ Instruction Macros
 ~~~~~~~~~~~~~~~~~~
 This section covers ``SYM_FUNC_*`` and ``SYM_CODE_*`` enumerated above.
 
+``objtool`` requires that all code must be contained in an ELF symbol. Symbol
+names that have a ``.L`` prefix do not emit symbol table entries. ``.L``
+prefixed symbols can be used within a code region, but should be avoided for
+denoting a range of code via ``SYM_*_START/END`` annotations.
+
 * ``SYM_FUNC_START`` and ``SYM_FUNC_START_LOCAL`` are supposed to be **the
   most frequent markings**. They are used for functions with standard calling
   conventions -- global and local. Like in C, they both align the functions to
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index ccd32877a3c4..c9a9fbf1655f 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -31,7 +31,7 @@ SYM_FUNC_START_NOALIGN(\name)
 	.endif
 
 	call \func
-	jmp  .L_restore
+	jmp  __thunk_restore
 SYM_FUNC_END(\name)
 	_ASM_NOKPROBE(\name)
 	.endm
@@ -44,7 +44,7 @@ SYM_FUNC_END(\name)
 #endif
 
 #ifdef CONFIG_PREEMPTION
-SYM_CODE_START_LOCAL_NOALIGN(.L_restore)
+SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore)
 	popq %r11
 	popq %r10
 	popq %r9
@@ -56,6 +56,6 @@ SYM_CODE_START_LOCAL_NOALIGN(.L_restore)
 	popq %rdi
 	popq %rbp
 	ret
-	_ASM_NOKPROBE(.L_restore)
-SYM_CODE_END(.L_restore)
+	_ASM_NOKPROBE(__thunk_restore)
+SYM_CODE_END(__thunk_restore)
 #endif
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 5bcfbd972e97..dbf8506decca 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -178,6 +178,11 @@
  * Objtool generates debug info for both FUNC & CODE, but needs special
  * annotations for each CODE's start (to describe the actual stack frame).
  *
+ * Objtool requires that all code must be contained in an ELF symbol. Symbol
+ * names that have a  .L prefix do not emit symbol table entries. .L
+ * prefixed symbols can be used within a code region, but should be avoided for
+ * denoting a range of code via ``SYM_*_START/END`` annotations.
+ *
  * ALIAS -- does not generate debug info -- the aliased function will
  */
 
-- 
cgit v1.2.3


From a8cccdd954732a558d481407ab7c3106b89c34ae Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Fri, 11 Dec 2020 10:46:24 -0800
Subject: init: lto: ensure initcall ordering

With LTO, the compiler doesn't necessarily obey the link order for
initcalls, and initcall variables need globally unique names to avoid
collisions at link time.

This change exports __KBUILD_MODNAME and adds the initcall_id() macro,
which uses it together with __COUNTER__ and __LINE__ to help ensure
these variables have unique names, and moves each variable to its own
section when LTO is enabled, so the correct order can be specified using
a linker script.

The generate_initcall_ordering.pl script uses nm to find initcalls from
the object files passed to the linker, and generates a linker script
that specifies the same order for initcalls that we would have without
LTO. With LTO enabled, the script is called in link-vmlinux.sh through
jobserver-exec to limit the number of jobs spawned.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20201211184633.3213045-8-samitolvanen@google.com
---
 include/linux/init.h               |  52 ++++++-
 scripts/Makefile.lib               |   6 +-
 scripts/generate_initcall_order.pl | 270 +++++++++++++++++++++++++++++++++++++
 scripts/link-vmlinux.sh            |  15 +++
 4 files changed, 334 insertions(+), 9 deletions(-)
 create mode 100755 scripts/generate_initcall_order.pl

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index e668832ef66a..a57bf0dfd75f 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -184,19 +184,57 @@ extern bool initcall_debug;
  * as KEEP() in the linker script.
  */
 
+/* Format: <modname>__<counter>_<line>_<fn> */
+#define __initcall_id(fn)					\
+	__PASTE(__KBUILD_MODNAME,				\
+	__PASTE(__,						\
+	__PASTE(__COUNTER__,					\
+	__PASTE(_,						\
+	__PASTE(__LINE__,					\
+	__PASTE(_, fn))))))
+
+/* Format: __<prefix>__<iid><id> */
+#define __initcall_name(prefix, __iid, id)			\
+	__PASTE(__,						\
+	__PASTE(prefix,						\
+	__PASTE(__,						\
+	__PASTE(__iid, id))))
+
+#ifdef CONFIG_LTO_CLANG
+/*
+ * With LTO, the compiler doesn't necessarily obey link order for
+ * initcalls. In order to preserve the correct order, we add each
+ * variable into its own section and generate a linker script (in
+ * scripts/link-vmlinux.sh) to specify the order of the sections.
+ */
+#define __initcall_section(__sec, __iid)			\
+	#__sec ".init.." #__iid
+#else
+#define __initcall_section(__sec, __iid)			\
+	#__sec ".init"
+#endif
+
 #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
-#define ___define_initcall(fn, id, __sec)			\
+#define ____define_initcall(fn, __name, __sec)			\
 	__ADDRESSABLE(fn)					\
-	asm(".section	\"" #__sec ".init\", \"a\"	\n"	\
-	"__initcall_" #fn #id ":			\n"	\
+	asm(".section	\"" __sec "\", \"a\"		\n"	\
+	    __stringify(__name) ":			\n"	\
 	    ".long	" #fn " - .			\n"	\
 	    ".previous					\n");
 #else
-#define ___define_initcall(fn, id, __sec) \
-	static initcall_t __initcall_##fn##id __used \
-		__attribute__((__section__(#__sec ".init"))) = fn;
+#define ____define_initcall(fn, __name, __sec)			\
+	static initcall_t __name __used 			\
+		__attribute__((__section__(__sec))) = fn;
 #endif
 
+#define __unique_initcall(fn, id, __sec, __iid)			\
+	____define_initcall(fn,					\
+		__initcall_name(initcall, __iid, id),		\
+		__initcall_section(__sec, __iid))
+
+#define ___define_initcall(fn, id, __sec)			\
+	__unique_initcall(fn, id, __sec, __initcall_id(fn))
+
 #define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id)
 
 /*
@@ -236,7 +274,7 @@ extern bool initcall_debug;
 #define __exitcall(fn)						\
 	static exitcall_t __exitcall_##fn __exit_call = fn
 
-#define console_initcall(fn)	___define_initcall(fn,, .con_initcall)
+#define console_initcall(fn)	___define_initcall(fn, con, .con_initcall)
 
 struct obs_kernel_param {
 	const char *str;
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 213677a5ed33..3ff3dbb3a830 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -117,9 +117,11 @@ target-stem = $(basename $(patsubst $(obj)/%,%,$@))
 # These flags are needed for modversions and compiling, so we define them here
 # $(modname_flags) defines KBUILD_MODNAME as the name of the module it will
 # end up in (or would, if it gets compiled in)
-name-fix = $(call stringify,$(subst $(comma),_,$(subst -,_,$1)))
+name-fix-token = $(subst $(comma),_,$(subst -,_,$1))
+name-fix = $(call stringify,$(call name-fix-token,$1))
 basename_flags = -DKBUILD_BASENAME=$(call name-fix,$(basetarget))
-modname_flags  = -DKBUILD_MODNAME=$(call name-fix,$(modname))
+modname_flags  = -DKBUILD_MODNAME=$(call name-fix,$(modname)) \
+		 -D__KBUILD_MODNAME=kmod_$(call name-fix-token,$(modname))
 modfile_flags  = -DKBUILD_MODFILE=$(call stringify,$(modfile))
 
 _c_flags       = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), \
diff --git a/scripts/generate_initcall_order.pl b/scripts/generate_initcall_order.pl
new file mode 100755
index 000000000000..1a88d3f1b913
--- /dev/null
+++ b/scripts/generate_initcall_order.pl
@@ -0,0 +1,270 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generates a linker script that specifies the correct initcall order.
+#
+# Copyright (C) 2019 Google LLC
+
+use strict;
+use warnings;
+use IO::Handle;
+use IO::Select;
+use POSIX ":sys_wait_h";
+
+my $nm = $ENV{'NM'} || die "$0: ERROR: NM not set?";
+my $objtree = $ENV{'objtree'} || '.';
+
+## currently active child processes
+my $jobs = {};		# child process pid -> file handle
+## results from child processes
+my $results = {};	# object index -> [ { level, secname }, ... ]
+
+## reads _NPROCESSORS_ONLN to determine the maximum number of processes to
+## start
+sub get_online_processors {
+	open(my $fh, "getconf _NPROCESSORS_ONLN 2>/dev/null |")
+		or die "$0: ERROR: failed to execute getconf: $!";
+	my $procs = <$fh>;
+	close($fh);
+
+	if (!($procs =~ /^\d+$/)) {
+		return 1;
+	}
+
+	return int($procs);
+}
+
+## writes results to the parent process
+## format: <file index> <initcall level> <base initcall section name>
+sub write_results {
+	my ($index, $initcalls) = @_;
+
+	# sort by the counter value to ensure the order of initcalls within
+	# each object file is correct
+	foreach my $counter (sort { $a <=> $b } keys(%{$initcalls})) {
+		my $level = $initcalls->{$counter}->{'level'};
+
+		# section name for the initcall function
+		my $secname = $initcalls->{$counter}->{'module'} . '__' .
+			      $counter . '_' .
+			      $initcalls->{$counter}->{'line'} . '_' .
+			      $initcalls->{$counter}->{'function'};
+
+		print "$index $level $secname\n";
+	}
+}
+
+## reads a result line from a child process and adds it to the $results array
+sub read_results{
+	my ($fh) = @_;
+
+	# each child prints out a full line w/ autoflush and exits after the
+	# last line, so even if buffered I/O blocks here, it shouldn't block
+	# very long
+	my $data = <$fh>;
+
+	if (!defined($data)) {
+		return 0;
+	}
+
+	chomp($data);
+
+	my ($index, $level, $secname) = $data =~
+		/^(\d+)\ ([^\ ]+)\ (.*)$/;
+
+	if (!defined($index) ||
+		!defined($level) ||
+		!defined($secname)) {
+		die "$0: ERROR: child process returned invalid data: $data\n";
+	}
+
+	$index = int($index);
+
+	if (!exists($results->{$index})) {
+		$results->{$index} = [];
+	}
+
+	push (@{$results->{$index}}, {
+		'level'   => $level,
+		'secname' => $secname
+	});
+
+	return 1;
+}
+
+## finds initcalls from an object file or all object files in an archive, and
+## writes results back to the parent process
+sub find_initcalls {
+	my ($index, $file) = @_;
+
+	die "$0: ERROR: file $file doesn't exist?" if (! -f $file);
+
+	open(my $fh, "\"$nm\" --defined-only \"$file\" 2>/dev/null |")
+		or die "$0: ERROR: failed to execute \"$nm\": $!";
+
+	my $initcalls = {};
+
+	while (<$fh>) {
+		chomp;
+
+		# check for the start of a new object file (if processing an
+		# archive)
+		my ($path)= $_ =~ /^(.+)\:$/;
+
+		if (defined($path)) {
+			write_results($index, $initcalls);
+			$initcalls = {};
+			next;
+		}
+
+		# look for an initcall
+		my ($module, $counter, $line, $symbol) = $_ =~
+			/[a-z]\s+__initcall__(\S*)__(\d+)_(\d+)_(.*)$/;
+
+		if (!defined($module)) {
+			$module = ''
+		}
+
+		if (!defined($counter) ||
+			!defined($line) ||
+			!defined($symbol)) {
+			next;
+		}
+
+		# parse initcall level
+		my ($function, $level) = $symbol =~
+			/^(.*)((early|rootfs|con|[0-9])s?)$/;
+
+		die "$0: ERROR: invalid initcall name $symbol in $file($path)"
+			if (!defined($function) || !defined($level));
+
+		$initcalls->{$counter} = {
+			'module'   => $module,
+			'line'     => $line,
+			'function' => $function,
+			'level'    => $level,
+		};
+	}
+
+	close($fh);
+	write_results($index, $initcalls);
+}
+
+## waits for any child process to complete, reads the results, and adds them to
+## the $results array for later processing
+sub wait_for_results {
+	my ($select) = @_;
+
+	my $pid = 0;
+	do {
+		# unblock children that may have a full write buffer
+		foreach my $fh ($select->can_read(0)) {
+			read_results($fh);
+		}
+
+		# check for children that have exited, read the remaining data
+		# from them, and clean up
+		$pid = waitpid(-1, WNOHANG);
+		if ($pid > 0) {
+			if (!exists($jobs->{$pid})) {
+				next;
+			}
+
+			my $fh = $jobs->{$pid};
+			$select->remove($fh);
+
+			while (read_results($fh)) {
+				# until eof
+			}
+
+			close($fh);
+			delete($jobs->{$pid});
+		}
+	} while ($pid > 0);
+}
+
+## forks a child to process each file passed in the command line and collects
+## the results
+sub process_files {
+	my $index = 0;
+	my $njobs = $ENV{'PARALLELISM'} || get_online_processors();
+	my $select = IO::Select->new();
+
+	while (my $file = shift(@ARGV)) {
+		# fork a child process and read it's stdout
+		my $pid = open(my $fh, '-|');
+
+		if (!defined($pid)) {
+			die "$0: ERROR: failed to fork: $!";
+		} elsif ($pid) {
+			# save the child process pid and the file handle
+			$select->add($fh);
+			$jobs->{$pid} = $fh;
+		} else {
+			# in the child process
+			STDOUT->autoflush(1);
+			find_initcalls($index, "$objtree/$file");
+			exit;
+		}
+
+		$index++;
+
+		# limit the number of children to $njobs
+		if (scalar(keys(%{$jobs})) >= $njobs) {
+			wait_for_results($select);
+		}
+	}
+
+	# wait for the remaining children to complete
+	while (scalar(keys(%{$jobs})) > 0) {
+		wait_for_results($select);
+	}
+}
+
+sub generate_initcall_lds() {
+	process_files();
+
+	my $sections = {};	# level -> [ secname, ...]
+
+	# sort results to retain link order and split to sections per
+	# initcall level
+	foreach my $index (sort { $a <=> $b } keys(%{$results})) {
+		foreach my $result (@{$results->{$index}}) {
+			my $level = $result->{'level'};
+
+			if (!exists($sections->{$level})) {
+				$sections->{$level} = [];
+			}
+
+			push(@{$sections->{$level}}, $result->{'secname'});
+		}
+	}
+
+	die "$0: ERROR: no initcalls?" if (!keys(%{$sections}));
+
+	# print out a linker script that defines the order of initcalls for
+	# each level
+	print "SECTIONS {\n";
+
+	foreach my $level (sort(keys(%{$sections}))) {
+		my $section;
+
+		if ($level eq 'con') {
+			$section = '.con_initcall.init';
+		} else {
+			$section = ".initcall${level}.init";
+		}
+
+		print "\t${section} : {\n";
+
+		foreach my $secname (@{$sections->{$level}}) {
+			print "\t\t*(${section}..${secname}) ;\n";
+		}
+
+		print "\t}\n";
+	}
+
+	print "}\n";
+}
+
+generate_initcall_lds();
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 78e55fe7210b..c5919d5a0b4f 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -43,6 +43,17 @@ info()
 	fi
 }
 
+# Generate a linker script to ensure correct ordering of initcalls.
+gen_initcalls()
+{
+	info GEN .tmp_initcalls.lds
+
+	${PYTHON} ${srctree}/scripts/jobserver-exec		\
+	${PERL} ${srctree}/scripts/generate_initcall_order.pl	\
+		${KBUILD_VMLINUX_OBJS} ${KBUILD_VMLINUX_LIBS}	\
+		> .tmp_initcalls.lds
+}
+
 # If CONFIG_LTO_CLANG is selected, collect generated symbol versions into
 # .tmp_symversions.lds
 gen_symversions()
@@ -72,6 +83,9 @@ modpost_link()
 		--end-group"
 
 	if [ -n "${CONFIG_LTO_CLANG}" ]; then
+		gen_initcalls
+		lds="-T .tmp_initcalls.lds"
+
 		if [ -n "${CONFIG_MODVERSIONS}" ]; then
 			gen_symversions
 			lds="${lds} -T .tmp_symversions.lds"
@@ -262,6 +276,7 @@ cleanup()
 {
 	rm -f .btf.*
 	rm -f .tmp_System.map
+	rm -f .tmp_initcalls.lds
 	rm -f .tmp_symversions.lds
 	rm -f .tmp_vmlinux*
 	rm -f System.map
-- 
cgit v1.2.3


From 3578ad11f3fba07e64c26d8db68cfd3dde28c59e Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Fri, 11 Dec 2020 10:46:25 -0800
Subject: init: lto: fix PREL32 relocations

With LTO, the compiler can rename static functions to avoid global
naming collisions. As initcall functions are typically static,
renaming can break references to them in inline assembly. This
change adds a global stub with a stable name for each initcall to
fix the issue when PREL32 relocations are used.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20201211184633.3213045-9-samitolvanen@google.com
---
 include/linux/init.h | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index a57bf0dfd75f..a01f01c1a5c5 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -209,26 +209,49 @@ extern bool initcall_debug;
  */
 #define __initcall_section(__sec, __iid)			\
 	#__sec ".init.." #__iid
+
+/*
+ * With LTO, the compiler can rename static functions to avoid
+ * global naming collisions. We use a global stub function for
+ * initcalls to create a stable symbol name whose address can be
+ * taken in inline assembly when PREL32 relocations are used.
+ */
+#define __initcall_stub(fn, __iid, id)				\
+	__initcall_name(initstub, __iid, id)
+
+#define __define_initcall_stub(__stub, fn)			\
+	int __init __stub(void);				\
+	int __init __stub(void)					\
+	{ 							\
+		return fn();					\
+	}							\
+	__ADDRESSABLE(__stub)
 #else
 #define __initcall_section(__sec, __iid)			\
 	#__sec ".init"
+
+#define __initcall_stub(fn, __iid, id)	fn
+
+#define __define_initcall_stub(__stub, fn)			\
+	__ADDRESSABLE(fn)
 #endif
 
 #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
-#define ____define_initcall(fn, __name, __sec)			\
-	__ADDRESSABLE(fn)					\
+#define ____define_initcall(fn, __stub, __name, __sec)		\
+	__define_initcall_stub(__stub, fn)			\
 	asm(".section	\"" __sec "\", \"a\"		\n"	\
 	    __stringify(__name) ":			\n"	\
-	    ".long	" #fn " - .			\n"	\
+	    ".long	" __stringify(__stub) " - .	\n"	\
 	    ".previous					\n");
 #else
-#define ____define_initcall(fn, __name, __sec)			\
+#define ____define_initcall(fn, __unused, __name, __sec)	\
 	static initcall_t __name __used 			\
 		__attribute__((__section__(__sec))) = fn;
 #endif
 
 #define __unique_initcall(fn, id, __sec, __iid)			\
 	____define_initcall(fn,					\
+		__initcall_stub(fn, __iid, id),			\
 		__initcall_name(initcall, __iid, id),		\
 		__initcall_section(__sec, __iid))
 
-- 
cgit v1.2.3


From 09a4e4d9c52a3c5e39e4f409b2c083ab13c6afc2 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Fri, 11 Dec 2020 10:46:26 -0800
Subject: PCI: Fix PREL32 relocations for LTO

With Clang's Link Time Optimization (LTO), the compiler can rename
static functions to avoid global naming collisions. As PCI fixup
functions are typically static, renaming can break references
to them in inline assembly. This change adds a global stub to
DECLARE_PCI_FIXUP_SECTION to fix the issue when PREL32 relocations
are used.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20201211184633.3213045-10-samitolvanen@google.com
---
 include/linux/pci.h | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index b32126d26997..2276e5d00192 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1917,7 +1917,7 @@ enum pci_fixup_pass {
 };
 
 #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
-#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+#define ___DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
 				    class_shift, hook)			\
 	__ADDRESSABLE(hook)						\
 	asm(".section "	#sec ", \"a\"				\n"	\
@@ -1926,10 +1926,33 @@ enum pci_fixup_pass {
 	    ".long "	#class ", " #class_shift "		\n"	\
 	    ".long "	#hook " - .				\n"	\
 	    ".previous						\n");
+
+/*
+ * Clang's LTO may rename static functions in C, but has no way to
+ * handle such renamings when referenced from inline asm. To work
+ * around this, create global C stubs for these cases.
+ */
+#ifdef CONFIG_LTO_CLANG
+#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				  class_shift, hook, stub)		\
+	void stub(struct pci_dev *dev);					\
+	void stub(struct pci_dev *dev)					\
+	{ 								\
+		hook(dev); 						\
+	}								\
+	___DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				  class_shift, stub)
+#else
+#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				  class_shift, hook, stub)		\
+	___DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				  class_shift, hook)
+#endif
+
 #define DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
 				  class_shift, hook)			\
 	__DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
-				  class_shift, hook)
+				  class_shift, hook, __UNIQUE_ID(hook))
 #else
 /* Anonymous variables would be nice... */
 #define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, class,	\
-- 
cgit v1.2.3


From 215b674b84dd052098fe6389e32a5afaff8b4d56 Mon Sep 17 00:00:00 2001
From: Lokesh Gidra <lokeshgidra@google.com>
Date: Fri, 8 Jan 2021 14:22:20 -0800
Subject: security: add inode_init_security_anon() LSM hook

This change adds a new LSM hook, inode_init_security_anon(), that will
be used while creating secure anonymous inodes. The hook allows/denies
its creation and assigns a security context to the inode.

The new hook accepts an optional context_inode parameter that callers
can use to provide additional contextual information to security modules
for granting/denying permission to create an anon-inode of the same type.
This context_inode's security_context can also be used to initialize the
newly created anon-inode's security_context.

Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h |  2 ++
 include/linux/lsm_hooks.h     |  9 +++++++++
 include/linux/security.h      | 10 ++++++++++
 security/security.c           |  8 ++++++++
 4 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 7aaa753b8608..dfd261dcbcb0 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -113,6 +113,8 @@ LSM_HOOK(void, LSM_RET_VOID, inode_free_security, struct inode *inode)
 LSM_HOOK(int, 0, inode_init_security, struct inode *inode,
 	 struct inode *dir, const struct qstr *qstr, const char **name,
 	 void **value, size_t *len)
+LSM_HOOK(int, 0, inode_init_security_anon, struct inode *inode,
+	 const struct qstr *name, const struct inode *context_inode)
 LSM_HOOK(int, 0, inode_create, struct inode *dir, struct dentry *dentry,
 	 umode_t mode)
 LSM_HOOK(int, 0, inode_link, struct dentry *old_dentry, struct inode *dir,
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a19adef1f088..bdfc8a76a4f7 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -233,6 +233,15 @@
  *	Returns 0 if @name and @value have been successfully set,
  *	-EOPNOTSUPP if no security attribute is needed, or
  *	-ENOMEM on memory allocation failure.
+ * @inode_init_security_anon:
+ *      Set up the incore security field for the new anonymous inode
+ *      and return whether the inode creation is permitted by the security
+ *      module or not.
+ *      @inode contains the inode structure
+ *      @name name of the anonymous inode class
+ *      @context_inode optional related inode
+ *	Returns 0 on success, -EACCES if the security module denies the
+ *	creation of this inode, or another -errno upon other errors.
  * @inode_create:
  *	Check permission to create a regular file.
  *	@dir contains inode structure of the parent of the new file.
diff --git a/include/linux/security.h b/include/linux/security.h
index c35ea0ffccd9..b0d14f04b16d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -324,6 +324,9 @@ void security_inode_free(struct inode *inode);
 int security_inode_init_security(struct inode *inode, struct inode *dir,
 				 const struct qstr *qstr,
 				 initxattrs initxattrs, void *fs_data);
+int security_inode_init_security_anon(struct inode *inode,
+				      const struct qstr *name,
+				      const struct inode *context_inode);
 int security_old_inode_init_security(struct inode *inode, struct inode *dir,
 				     const struct qstr *qstr, const char **name,
 				     void **value, size_t *len);
@@ -738,6 +741,13 @@ static inline int security_inode_init_security(struct inode *inode,
 	return 0;
 }
 
+static inline int security_inode_init_security_anon(struct inode *inode,
+						    const struct qstr *name,
+						    const struct inode *context_inode)
+{
+	return 0;
+}
+
 static inline int security_old_inode_init_security(struct inode *inode,
 						   struct inode *dir,
 						   const struct qstr *qstr,
diff --git a/security/security.c b/security/security.c
index 7b09cfbae94f..401663b5b70e 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1059,6 +1059,14 @@ out:
 }
 EXPORT_SYMBOL(security_inode_init_security);
 
+int security_inode_init_security_anon(struct inode *inode,
+				      const struct qstr *name,
+				      const struct inode *context_inode)
+{
+	return call_int_hook(inode_init_security_anon, 0, inode, name,
+			     context_inode);
+}
+
 int security_old_inode_init_security(struct inode *inode, struct inode *dir,
 				     const struct qstr *qstr, const char **name,
 				     void **value, size_t *len)
-- 
cgit v1.2.3


From e7e832ce6fa769f800cd7eaebdb0459ad31e0416 Mon Sep 17 00:00:00 2001
From: Daniel Colascione <dancol@google.com>
Date: Fri, 8 Jan 2021 14:22:21 -0800
Subject: fs: add LSM-supporting anon-inode interface

This change adds a new function, anon_inode_getfd_secure, that creates
anonymous-node file with individual non-S_PRIVATE inode to which security
modules can apply policy. Existing callers continue using the original
singleton-inode kind of anonymous-inode file. We can transition anonymous
inode users to the new kind of anonymous inode in individual patches for
the sake of bisection and review.

The new function accepts an optional context_inode parameter that callers
can use to provide additional contextual information to security modules.
For example, in case of userfaultfd, the created inode is a 'logical child'
of the context_inode (userfaultfd inode of the parent process) in the sense
that it provides the security context required during creation of the child
process' userfaultfd inode.

Signed-off-by: Daniel Colascione <dancol@google.com>
[LG: Delete obsolete comments to alloc_anon_inode()]
[LG: Add context_inode description in comments to anon_inode_getfd_secure()]
[LG: Remove definition of anon_inode_getfile_secure() as there are no callers]
[LG: Make __anon_inode_getfile() static]
[LG: Use correct error cast in __anon_inode_getfile()]
[LG: Fix error handling in __anon_inode_getfile()]
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 fs/anon_inodes.c            | 150 ++++++++++++++++++++++++++++++++------------
 fs/libfs.c                  |   5 --
 include/linux/anon_inodes.h |   5 ++
 3 files changed, 115 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 89714308c25b..023337d65a03 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -55,61 +55,79 @@ static struct file_system_type anon_inode_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-/**
- * anon_inode_getfile - creates a new file instance by hooking it up to an
- *                      anonymous inode, and a dentry that describe the "class"
- *                      of the file
- *
- * @name:    [in]    name of the "class" of the new file
- * @fops:    [in]    file operations for the new file
- * @priv:    [in]    private data for the new file (will be file's private_data)
- * @flags:   [in]    flags
- *
- * Creates a new file by hooking it on a single inode. This is useful for files
- * that do not need to have a full-fledged inode in order to operate correctly.
- * All the files created with anon_inode_getfile() will share a single inode,
- * hence saving memory and avoiding code duplication for the file/inode/dentry
- * setup.  Returns the newly created file* or an error pointer.
- */
-struct file *anon_inode_getfile(const char *name,
-				const struct file_operations *fops,
-				void *priv, int flags)
+static struct inode *anon_inode_make_secure_inode(
+	const char *name,
+	const struct inode *context_inode)
 {
-	struct file *file;
+	struct inode *inode;
+	const struct qstr qname = QSTR_INIT(name, strlen(name));
+	int error;
+
+	inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+	if (IS_ERR(inode))
+		return inode;
+	inode->i_flags &= ~S_PRIVATE;
+	error =	security_inode_init_security_anon(inode, &qname, context_inode);
+	if (error) {
+		iput(inode);
+		return ERR_PTR(error);
+	}
+	return inode;
+}
 
-	if (IS_ERR(anon_inode_inode))
-		return ERR_PTR(-ENODEV);
+static struct file *__anon_inode_getfile(const char *name,
+					 const struct file_operations *fops,
+					 void *priv, int flags,
+					 const struct inode *context_inode,
+					 bool secure)
+{
+	struct inode *inode;
+	struct file *file;
 
 	if (fops->owner && !try_module_get(fops->owner))
 		return ERR_PTR(-ENOENT);
 
-	/*
-	 * We know the anon_inode inode count is always greater than zero,
-	 * so ihold() is safe.
-	 */
-	ihold(anon_inode_inode);
-	file = alloc_file_pseudo(anon_inode_inode, anon_inode_mnt, name,
+	if (secure) {
+		inode =	anon_inode_make_secure_inode(name, context_inode);
+		if (IS_ERR(inode)) {
+			file = ERR_CAST(inode);
+			goto err;
+		}
+	} else {
+		inode =	anon_inode_inode;
+		if (IS_ERR(inode)) {
+			file = ERR_PTR(-ENODEV);
+			goto err;
+		}
+		/*
+		 * We know the anon_inode inode count is always
+		 * greater than zero, so ihold() is safe.
+		 */
+		ihold(inode);
+	}
+
+	file = alloc_file_pseudo(inode, anon_inode_mnt, name,
 				 flags & (O_ACCMODE | O_NONBLOCK), fops);
 	if (IS_ERR(file))
-		goto err;
+		goto err_iput;
 
-	file->f_mapping = anon_inode_inode->i_mapping;
+	file->f_mapping = inode->i_mapping;
 
 	file->private_data = priv;
 
 	return file;
 
+err_iput:
+	iput(inode);
 err:
-	iput(anon_inode_inode);
 	module_put(fops->owner);
 	return file;
 }
-EXPORT_SYMBOL_GPL(anon_inode_getfile);
 
 /**
- * anon_inode_getfd - creates a new file instance by hooking it up to an
- *                    anonymous inode, and a dentry that describe the "class"
- *                    of the file
+ * anon_inode_getfile - creates a new file instance by hooking it up to an
+ *                      anonymous inode, and a dentry that describe the "class"
+ *                      of the file
  *
  * @name:    [in]    name of the "class" of the new file
  * @fops:    [in]    file operations for the new file
@@ -118,12 +136,23 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile);
  *
  * Creates a new file by hooking it on a single inode. This is useful for files
  * that do not need to have a full-fledged inode in order to operate correctly.
- * All the files created with anon_inode_getfd() will share a single inode,
+ * All the files created with anon_inode_getfile() will share a single inode,
  * hence saving memory and avoiding code duplication for the file/inode/dentry
- * setup.  Returns new descriptor or an error code.
+ * setup.  Returns the newly created file* or an error pointer.
  */
-int anon_inode_getfd(const char *name, const struct file_operations *fops,
-		     void *priv, int flags)
+struct file *anon_inode_getfile(const char *name,
+				const struct file_operations *fops,
+				void *priv, int flags)
+{
+	return __anon_inode_getfile(name, fops, priv, flags, NULL, false);
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfile);
+
+static int __anon_inode_getfd(const char *name,
+			      const struct file_operations *fops,
+			      void *priv, int flags,
+			      const struct inode *context_inode,
+			      bool secure)
 {
 	int error, fd;
 	struct file *file;
@@ -133,7 +162,8 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 		return error;
 	fd = error;
 
-	file = anon_inode_getfile(name, fops, priv, flags);
+	file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
+				    secure);
 	if (IS_ERR(file)) {
 		error = PTR_ERR(file);
 		goto err_put_unused_fd;
@@ -146,8 +176,48 @@ err_put_unused_fd:
 	put_unused_fd(fd);
 	return error;
 }
+
+/**
+ * anon_inode_getfd - creates a new file instance by hooking it up to
+ *                    an anonymous inode and a dentry that describe
+ *                    the "class" of the file
+ *
+ * @name:    [in]    name of the "class" of the new file
+ * @fops:    [in]    file operations for the new file
+ * @priv:    [in]    private data for the new file (will be file's private_data)
+ * @flags:   [in]    flags
+ *
+ * Creates a new file by hooking it on a single inode. This is
+ * useful for files that do not need to have a full-fledged inode in
+ * order to operate correctly.  All the files created with
+ * anon_inode_getfd() will use the same singleton inode, reducing
+ * memory use and avoiding code duplication for the file/inode/dentry
+ * setup.  Returns a newly created file descriptor or an error code.
+ */
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+		     void *priv, int flags)
+{
+	return __anon_inode_getfd(name, fops, priv, flags, NULL, false);
+}
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
+/**
+ * Like anon_inode_getfd(), but creates a new !S_PRIVATE anon inode rather than
+ * reuse the singleton anon inode, and calls the inode_init_security_anon() LSM
+ * hook. This allows the inode to have its own security context and for a LSM
+ * to reject creation of the inode.  An optional @context_inode argument is
+ * also added to provide the logical relationship with the new inode.  The LSM
+ * may use @context_inode in inode_init_security_anon(), but a reference to it
+ * is not held.
+ */
+int anon_inode_getfd_secure(const char *name, const struct file_operations *fops,
+			    void *priv, int flags,
+			    const struct inode *context_inode)
+{
+	return __anon_inode_getfd(name, fops, priv, flags, context_inode, true);
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfd_secure);
+
 static int __init anon_inode_init(void)
 {
 	anon_inode_mnt = kern_mount(&anon_inode_fs_type);
diff --git a/fs/libfs.c b/fs/libfs.c
index d1c3bade9f30..e52818fb276a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1214,11 +1214,6 @@ static int anon_set_page_dirty(struct page *page)
 	return 0;
 };
 
-/*
- * A single inode exists for all anon_inode files. Contrary to pipes,
- * anon_inode inodes have no associated per-instance data, so we need
- * only allocate one of them.
- */
 struct inode *alloc_anon_inode(struct super_block *s)
 {
 	static const struct address_space_operations anon_aops = {
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index d0d7d96261ad..71881a2b6f78 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -10,12 +10,17 @@
 #define _LINUX_ANON_INODES_H
 
 struct file_operations;
+struct inode;
 
 struct file *anon_inode_getfile(const char *name,
 				const struct file_operations *fops,
 				void *priv, int flags);
 int anon_inode_getfd(const char *name, const struct file_operations *fops,
 		     void *priv, int flags);
+int anon_inode_getfd_secure(const char *name,
+			    const struct file_operations *fops,
+			    void *priv, int flags,
+			    const struct inode *context_inode);
 
 #endif /* _LINUX_ANON_INODES_H */
 
-- 
cgit v1.2.3


From b1ae3587d16a8c8fc9453e147c8708d6f006ffbb Mon Sep 17 00:00:00 2001
From: Bjarni Jonasson <bjarni.jonasson@microchip.com>
Date: Wed, 13 Jan 2021 12:56:25 +0100
Subject: net: phy: Add 100 base-x mode

Sparx-5 supports this mode and it is missing in the PHY core.

Signed-off-by: Bjarni Jonasson <bjarni.jonasson@microchip.com>
Reviewed-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/phy.rst | 5 +++++
 include/linux/phy.h              | 4 ++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst
index b2f7ec794bc8..399f17976a6c 100644
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -286,6 +286,11 @@ Some of the interface modes are described below:
     Note: due to legacy usage, some 10GBASE-R usage incorrectly makes
     use of this definition.
 
+``PHY_INTERFACE_MODE_100BASEX``
+    This defines IEEE 802.3 Clause 24.  The link operates at a fixed data
+    rate of 125Mpbs using a 4B/5B encoding scheme, resulting in an underlying
+    data rate of 100Mpbs.
+
 Pause frames / flow control
 ===========================
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 9effb511acde..24fcc6456a9e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -104,6 +104,7 @@ extern const int phy_10gbit_features_array[1];
  * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax
  * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII
  * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII
+ * @PHY_INTERFACE_MODE_100BASEX: 100 BaseX
  * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX
  * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX
  * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI
@@ -135,6 +136,7 @@ typedef enum {
 	PHY_INTERFACE_MODE_MOCA,
 	PHY_INTERFACE_MODE_QSGMII,
 	PHY_INTERFACE_MODE_TRGMII,
+	PHY_INTERFACE_MODE_100BASEX,
 	PHY_INTERFACE_MODE_1000BASEX,
 	PHY_INTERFACE_MODE_2500BASEX,
 	PHY_INTERFACE_MODE_RXAUI,
@@ -217,6 +219,8 @@ static inline const char *phy_modes(phy_interface_t interface)
 		return "usxgmii";
 	case PHY_INTERFACE_MODE_10GKR:
 		return "10gbase-kr";
+	case PHY_INTERFACE_MODE_100BASEX:
+		return "100base-x";
 	default:
 		return "unknown";
 	}
-- 
cgit v1.2.3


From 91c960b0056672e74627776655c926388350fa30 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <jackmanb@google.com>
Date: Thu, 14 Jan 2021 18:17:44 +0000
Subject: bpf: Rename BPF_XADD and prepare to encode other atomics in .imm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A subsequent patch will add additional atomic operations. These new
operations will use the same opcode field as the existing XADD, with
the immediate discriminating different operations.

In preparation, rename the instruction mode BPF_ATOMIC and start
calling the zero immediate BPF_ADD.

This is possible (doesn't break existing valid BPF progs) because the
immediate field is currently reserved MBZ and BPF_ADD is zero.

All uses are removed from the tree but the BPF_XADD definition is
kept around to avoid breaking builds for people including kernel
headers.

Signed-off-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Björn Töpel <bjorn.topel@gmail.com>
Link: https://lore.kernel.org/bpf/20210114181751.768687-5-jackmanb@google.com
---
 Documentation/networking/filter.rst                | 30 ++++++++------
 arch/arm/net/bpf_jit_32.c                          |  7 ++--
 arch/arm64/net/bpf_jit_comp.c                      | 16 ++++++--
 arch/mips/net/ebpf_jit.c                           | 11 ++++--
 arch/powerpc/net/bpf_jit_comp64.c                  | 25 +++++++++---
 arch/riscv/net/bpf_jit_comp32.c                    | 20 ++++++++--
 arch/riscv/net/bpf_jit_comp64.c                    | 16 ++++++--
 arch/s390/net/bpf_jit_comp.c                       | 27 +++++++------
 arch/sparc/net/bpf_jit_comp_64.c                   | 17 ++++++--
 arch/x86/net/bpf_jit_comp.c                        | 46 ++++++++++++++++------
 arch/x86/net/bpf_jit_comp32.c                      |  6 +--
 drivers/net/ethernet/netronome/nfp/bpf/jit.c       | 14 +++++--
 drivers/net/ethernet/netronome/nfp/bpf/main.h      |  4 +-
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c  | 15 ++++---
 include/linux/filter.h                             | 16 ++++++--
 include/uapi/linux/bpf.h                           |  5 ++-
 kernel/bpf/core.c                                  | 31 ++++++++++-----
 kernel/bpf/disasm.c                                |  6 ++-
 kernel/bpf/verifier.c                              | 24 ++++++-----
 lib/test_bpf.c                                     | 14 +++----
 samples/bpf/bpf_insn.h                             |  4 +-
 samples/bpf/cookie_uid_helper_example.c            |  8 ++--
 samples/bpf/sock_example.c                         |  2 +-
 samples/bpf/test_cgrp2_attach.c                    |  5 ++-
 tools/include/linux/filter.h                       | 15 +++++--
 tools/include/uapi/linux/bpf.h                     |  5 ++-
 .../selftests/bpf/prog_tests/cgroup_attach_multi.c |  4 +-
 tools/testing/selftests/bpf/test_cgroup_storage.c  |  2 +-
 tools/testing/selftests/bpf/verifier/ctx.c         |  7 ++--
 .../selftests/bpf/verifier/direct_packet_access.c  |  4 +-
 tools/testing/selftests/bpf/verifier/leak_ptr.c    | 10 ++---
 tools/testing/selftests/bpf/verifier/meta_access.c |  4 +-
 tools/testing/selftests/bpf/verifier/unpriv.c      |  3 +-
 .../selftests/bpf/verifier/value_illegal_alu.c     |  2 +-
 tools/testing/selftests/bpf/verifier/xadd.c        | 18 ++++-----
 35 files changed, 291 insertions(+), 152 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst
index debb59e374de..1583d59d806d 100644
--- a/Documentation/networking/filter.rst
+++ b/Documentation/networking/filter.rst
@@ -1006,13 +1006,13 @@ Size modifier is one of ...
 
 Mode modifier is one of::
 
-  BPF_IMM  0x00  /* used for 32-bit mov in classic BPF and 64-bit in eBPF */
-  BPF_ABS  0x20
-  BPF_IND  0x40
-  BPF_MEM  0x60
-  BPF_LEN  0x80  /* classic BPF only, reserved in eBPF */
-  BPF_MSH  0xa0  /* classic BPF only, reserved in eBPF */
-  BPF_XADD 0xc0  /* eBPF only, exclusive add */
+  BPF_IMM     0x00  /* used for 32-bit mov in classic BPF and 64-bit in eBPF */
+  BPF_ABS     0x20
+  BPF_IND     0x40
+  BPF_MEM     0x60
+  BPF_LEN     0x80  /* classic BPF only, reserved in eBPF */
+  BPF_MSH     0xa0  /* classic BPF only, reserved in eBPF */
+  BPF_ATOMIC  0xc0  /* eBPF only, atomic operations */
 
 eBPF has two non-generic instructions: (BPF_ABS | <size> | BPF_LD) and
 (BPF_IND | <size> | BPF_LD) which are used to access packet data.
@@ -1044,11 +1044,19 @@ Unlike classic BPF instruction set, eBPF has generic load/store operations::
     BPF_MEM | <size> | BPF_STX:  *(size *) (dst_reg + off) = src_reg
     BPF_MEM | <size> | BPF_ST:   *(size *) (dst_reg + off) = imm32
     BPF_MEM | <size> | BPF_LDX:  dst_reg = *(size *) (src_reg + off)
-    BPF_XADD | BPF_W  | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg
-    BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg
 
-Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and
-2 byte atomic increments are not supported.
+Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW.
+
+It also includes atomic operations, which use the immediate field for extra
+encoding.
+
+   .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W  | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg
+   .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg
+
+Note that 1 and 2 byte atomic operations are not supported.
+
+You may encounter BPF_XADD - this is a legacy name for BPF_ATOMIC, referring to
+the exclusive-add operation encoded when the immediate field is zero.
 
 eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists
 of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 0207b6ea6e8a..897634d0a67c 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -1620,10 +1620,9 @@ exit:
 		}
 		emit_str_r(dst_lo, tmp2, off, ctx, BPF_SIZE(code));
 		break;
-	/* STX XADD: lock *(u32 *)(dst + off) += src */
-	case BPF_STX | BPF_XADD | BPF_W:
-	/* STX XADD: lock *(u64 *)(dst + off) += src */
-	case BPF_STX | BPF_XADD | BPF_DW:
+	/* Atomic ops */
+	case BPF_STX | BPF_ATOMIC | BPF_W:
+	case BPF_STX | BPF_ATOMIC | BPF_DW:
 		goto notyet;
 	/* STX: *(size *)(dst + off) = src */
 	case BPF_STX | BPF_MEM | BPF_W:
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index ef9f1d5e989d..f7b194878a99 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -875,10 +875,18 @@ emit_cond_jmp:
 		}
 		break;
 
-	/* STX XADD: lock *(u32 *)(dst + off) += src */
-	case BPF_STX | BPF_XADD | BPF_W:
-	/* STX XADD: lock *(u64 *)(dst + off) += src */
-	case BPF_STX | BPF_XADD | BPF_DW:
+	case BPF_STX | BPF_ATOMIC | BPF_W:
+	case BPF_STX | BPF_ATOMIC | BPF_DW:
+		if (insn->imm != BPF_ADD) {
+			pr_err_once("unknown atomic op code %02x\n", insn->imm);
+			return -EINVAL;
+		}
+
+		/* STX XADD: lock *(u32 *)(dst + off) += src
+		 * and
+		 * STX XADD: lock *(u64 *)(dst + off) += src
+		 */
+
 		if (!off) {
 			reg = dst;
 		} else {
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 561154cbcc40..939dd06764bc 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -1423,8 +1423,8 @@ jeq_common:
 	case BPF_STX | BPF_H | BPF_MEM:
 	case BPF_STX | BPF_W | BPF_MEM:
 	case BPF_STX | BPF_DW | BPF_MEM:
-	case BPF_STX | BPF_W | BPF_XADD:
-	case BPF_STX | BPF_DW | BPF_XADD:
+	case BPF_STX | BPF_W | BPF_ATOMIC:
+	case BPF_STX | BPF_DW | BPF_ATOMIC:
 		if (insn->dst_reg == BPF_REG_10) {
 			ctx->flags |= EBPF_SEEN_FP;
 			dst = MIPS_R_SP;
@@ -1438,7 +1438,12 @@ jeq_common:
 		src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
 		if (src < 0)
 			return src;
-		if (BPF_MODE(insn->code) == BPF_XADD) {
+		if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+			if (insn->imm != BPF_ADD) {
+				pr_err("ATOMIC OP %02x NOT HANDLED\n", insn->imm);
+				return -EINVAL;
+			}
+
 			/*
 			 * If mem_off does not fit within the 9 bit ll/sc
 			 * instruction immediate field, use a temp reg.
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 022103c6a201..aaf1a887f653 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -683,10 +683,18 @@ emit_clear:
 			break;
 
 		/*
-		 * BPF_STX XADD (atomic_add)
+		 * BPF_STX ATOMIC (atomic ops)
 		 */
-		/* *(u32 *)(dst + off) += src */
-		case BPF_STX | BPF_XADD | BPF_W:
+		case BPF_STX | BPF_ATOMIC | BPF_W:
+			if (insn->imm != BPF_ADD) {
+				pr_err_ratelimited(
+					"eBPF filter atomic op code %02x (@%d) unsupported\n",
+					code, i);
+				return -ENOTSUPP;
+			}
+
+			/* *(u32 *)(dst + off) += src */
+
 			/* Get EA into TMP_REG_1 */
 			EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], dst_reg, off));
 			tmp_idx = ctx->idx * 4;
@@ -699,8 +707,15 @@ emit_clear:
 			/* we're done if this succeeded */
 			PPC_BCC_SHORT(COND_NE, tmp_idx);
 			break;
-		/* *(u64 *)(dst + off) += src */
-		case BPF_STX | BPF_XADD | BPF_DW:
+		case BPF_STX | BPF_ATOMIC | BPF_DW:
+			if (insn->imm != BPF_ADD) {
+				pr_err_ratelimited(
+					"eBPF filter atomic op code %02x (@%d) unsupported\n",
+					code, i);
+				return -ENOTSUPP;
+			}
+			/* *(u64 *)(dst + off) += src */
+
 			EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], dst_reg, off));
 			tmp_idx = ctx->idx * 4;
 			EMIT(PPC_RAW_LDARX(b2p[TMP_REG_2], 0, b2p[TMP_REG_1], 0));
diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c
index 579575f9cdae..81de865f4c7c 100644
--- a/arch/riscv/net/bpf_jit_comp32.c
+++ b/arch/riscv/net/bpf_jit_comp32.c
@@ -881,7 +881,7 @@ static int emit_store_r64(const s8 *dst, const s8 *src, s16 off,
 	const s8 *rd = bpf_get_reg64(dst, tmp1, ctx);
 	const s8 *rs = bpf_get_reg64(src, tmp2, ctx);
 
-	if (mode == BPF_XADD && size != BPF_W)
+	if (mode == BPF_ATOMIC && size != BPF_W)
 		return -1;
 
 	emit_imm(RV_REG_T0, off, ctx);
@@ -899,7 +899,7 @@ static int emit_store_r64(const s8 *dst, const s8 *src, s16 off,
 		case BPF_MEM:
 			emit(rv_sw(RV_REG_T0, 0, lo(rs)), ctx);
 			break;
-		case BPF_XADD:
+		case BPF_ATOMIC: /* Only BPF_ADD supported */
 			emit(rv_amoadd_w(RV_REG_ZERO, lo(rs), RV_REG_T0, 0, 0),
 			     ctx);
 			break;
@@ -1260,7 +1260,6 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 	case BPF_STX | BPF_MEM | BPF_H:
 	case BPF_STX | BPF_MEM | BPF_W:
 	case BPF_STX | BPF_MEM | BPF_DW:
-	case BPF_STX | BPF_XADD | BPF_W:
 		if (BPF_CLASS(code) == BPF_ST) {
 			emit_imm32(tmp2, imm, ctx);
 			src = tmp2;
@@ -1271,8 +1270,21 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 			return -1;
 		break;
 
+	case BPF_STX | BPF_ATOMIC | BPF_W:
+		if (insn->imm != BPF_ADD) {
+			pr_info_once(
+				"bpf-jit: not supported: atomic operation %02x ***\n",
+				insn->imm);
+			return -EFAULT;
+		}
+
+		if (emit_store_r64(dst, src, off, ctx, BPF_SIZE(code),
+				   BPF_MODE(code)))
+			return -1;
+		break;
+
 	/* No hardware support for 8-byte atomics in RV32. */
-	case BPF_STX | BPF_XADD | BPF_DW:
+	case BPF_STX | BPF_ATOMIC | BPF_DW:
 		/* Fallthrough. */
 
 notsupported:
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 8a56b5293117..b44ff52f84a6 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -1027,10 +1027,18 @@ out_be:
 		emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
 		emit_sd(RV_REG_T1, 0, rs, ctx);
 		break;
-	/* STX XADD: lock *(u32 *)(dst + off) += src */
-	case BPF_STX | BPF_XADD | BPF_W:
-	/* STX XADD: lock *(u64 *)(dst + off) += src */
-	case BPF_STX | BPF_XADD | BPF_DW:
+	case BPF_STX | BPF_ATOMIC | BPF_W:
+	case BPF_STX | BPF_ATOMIC | BPF_DW:
+		if (insn->imm != BPF_ADD) {
+			pr_err("bpf-jit: not supported: atomic operation %02x ***\n",
+			       insn->imm);
+			return -EINVAL;
+		}
+
+		/* atomic_add: lock *(u32 *)(dst + off) += src
+		 * atomic_add: lock *(u64 *)(dst + off) += src
+		 */
+
 		if (off) {
 			if (is_12b_int(off)) {
 				emit_addi(RV_REG_T1, rd, off, ctx);
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 0a4182792876..f973e2ead197 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1205,18 +1205,23 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 		jit->seen |= SEEN_MEM;
 		break;
 	/*
-	 * BPF_STX XADD (atomic_add)
+	 * BPF_ATOMIC
 	 */
-	case BPF_STX | BPF_XADD | BPF_W: /* *(u32 *)(dst + off) += src */
-		/* laal %w0,%src,off(%dst) */
-		EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W0, src_reg,
-			      dst_reg, off);
-		jit->seen |= SEEN_MEM;
-		break;
-	case BPF_STX | BPF_XADD | BPF_DW: /* *(u64 *)(dst + off) += src */
-		/* laalg %w0,%src,off(%dst) */
-		EMIT6_DISP_LH(0xeb000000, 0x00ea, REG_W0, src_reg,
-			      dst_reg, off);
+	case BPF_STX | BPF_ATOMIC | BPF_DW:
+	case BPF_STX | BPF_ATOMIC | BPF_W:
+		if (insn->imm != BPF_ADD) {
+			pr_err("Unknown atomic operation %02x\n", insn->imm);
+			return -1;
+		}
+
+		/* *(u32/u64 *)(dst + off) += src
+		 *
+		 * BFW_W:  laal  %w0,%src,off(%dst)
+		 * BPF_DW: laalg %w0,%src,off(%dst)
+		 */
+		EMIT6_DISP_LH(0xeb000000,
+			      BPF_SIZE(insn->code) == BPF_W ? 0x00fa : 0x00ea,
+			      REG_W0, src_reg, dst_reg, off);
 		jit->seen |= SEEN_MEM;
 		break;
 	/*
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 3364e2a00989..4b8d3c65d266 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -1366,12 +1366,18 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		break;
 	}
 
-	/* STX XADD: lock *(u32 *)(dst + off) += src */
-	case BPF_STX | BPF_XADD | BPF_W: {
+	case BPF_STX | BPF_ATOMIC | BPF_W: {
 		const u8 tmp = bpf2sparc[TMP_REG_1];
 		const u8 tmp2 = bpf2sparc[TMP_REG_2];
 		const u8 tmp3 = bpf2sparc[TMP_REG_3];
 
+		if (insn->imm != BPF_ADD) {
+			pr_err_once("unknown atomic op %02x\n", insn->imm);
+			return -EINVAL;
+		}
+
+		/* lock *(u32 *)(dst + off) += src */
+
 		if (insn->dst_reg == BPF_REG_FP)
 			ctx->saw_frame_pointer = true;
 
@@ -1390,11 +1396,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		break;
 	}
 	/* STX XADD: lock *(u64 *)(dst + off) += src */
-	case BPF_STX | BPF_XADD | BPF_DW: {
+	case BPF_STX | BPF_ATOMIC | BPF_DW: {
 		const u8 tmp = bpf2sparc[TMP_REG_1];
 		const u8 tmp2 = bpf2sparc[TMP_REG_2];
 		const u8 tmp3 = bpf2sparc[TMP_REG_3];
 
+		if (insn->imm != BPF_ADD) {
+			pr_err_once("unknown atomic op %02x\n", insn->imm);
+			return -EINVAL;
+		}
+
 		if (insn->dst_reg == BPF_REG_FP)
 			ctx->saw_frame_pointer = true;
 
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 93f32e0ba0ef..b1829a534da1 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -795,6 +795,33 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
 	*pprog = prog;
 }
 
+static int emit_atomic(u8 **pprog, u8 atomic_op,
+		       u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	EMIT1(0xF0); /* lock prefix */
+
+	maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW);
+
+	/* emit opcode */
+	switch (atomic_op) {
+	case BPF_ADD:
+		/* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */
+		EMIT1(simple_alu_opcodes[atomic_op]);
+		break;
+	default:
+		pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
+		return -EFAULT;
+	}
+
+	emit_insn_suffix(&prog, dst_reg, src_reg, off);
+
+	*pprog = prog;
+	return 0;
+}
+
 static bool ex_handler_bpf(const struct exception_table_entry *x,
 			   struct pt_regs *regs, int trapnr,
 			   unsigned long error_code, unsigned long fault_addr)
@@ -839,6 +866,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	int i, cnt = 0, excnt = 0;
 	int proglen = 0;
 	u8 *prog = temp;
+	int err;
 
 	detect_reg_usage(insn, insn_cnt, callee_regs_used,
 			 &tail_call_seen);
@@ -1250,18 +1278,12 @@ st:			if (is_imm8(insn->off))
 			}
 			break;
 
-			/* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
-		case BPF_STX | BPF_XADD | BPF_W:
-			/* Emit 'lock add dword ptr [rax + off], eax' */
-			if (is_ereg(dst_reg) || is_ereg(src_reg))
-				EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
-			else
-				EMIT2(0xF0, 0x01);
-			goto xadd;
-		case BPF_STX | BPF_XADD | BPF_DW:
-			EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01);
-xadd:
-			emit_modrm_dstoff(&prog, dst_reg, src_reg, insn->off);
+		case BPF_STX | BPF_ATOMIC | BPF_W:
+		case BPF_STX | BPF_ATOMIC | BPF_DW:
+			err = emit_atomic(&prog, insn->imm, dst_reg, src_reg,
+					  insn->off, BPF_SIZE(insn->code));
+			if (err)
+				return err;
 			break;
 
 			/* call */
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 96fde03aa987..d17b67c69f89 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2243,10 +2243,8 @@ emit_jmp:
 				return -EFAULT;
 			}
 			break;
-		/* STX XADD: lock *(u32 *)(dst + off) += src */
-		case BPF_STX | BPF_XADD | BPF_W:
-		/* STX XADD: lock *(u64 *)(dst + off) += src */
-		case BPF_STX | BPF_XADD | BPF_DW:
+		case BPF_STX | BPF_ATOMIC | BPF_W:
+		case BPF_STX | BPF_ATOMIC | BPF_DW:
 			goto notyet;
 		case BPF_JMP | BPF_EXIT:
 			if (seen_exit) {
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 0a721f6e8676..e31f8fbbc696 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -3109,13 +3109,19 @@ mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
 	return 0;
 }
 
-static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int mem_atomic4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
+	if (meta->insn.imm != BPF_ADD)
+		return -EOPNOTSUPP;
+
 	return mem_xadd(nfp_prog, meta, false);
 }
 
-static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int mem_atomic8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
+	if (meta->insn.imm != BPF_ADD)
+		return -EOPNOTSUPP;
+
 	return mem_xadd(nfp_prog, meta, true);
 }
 
@@ -3475,8 +3481,8 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
 	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
 	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
-	[BPF_STX | BPF_XADD | BPF_W] =	mem_xadd4,
-	[BPF_STX | BPF_XADD | BPF_DW] =	mem_xadd8,
+	[BPF_STX | BPF_ATOMIC | BPF_W] =	mem_atomic4,
+	[BPF_STX | BPF_ATOMIC | BPF_DW] =	mem_atomic8,
 	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
 	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
 	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index fac9c6f9e197..d0e17eebddd9 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -428,9 +428,9 @@ static inline bool is_mbpf_classic_store_pkt(const struct nfp_insn_meta *meta)
 	return is_mbpf_classic_store(meta) && meta->ptr.type == PTR_TO_PACKET;
 }
 
-static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta)
+static inline bool is_mbpf_atomic(const struct nfp_insn_meta *meta)
 {
-	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD);
+	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_ATOMIC);
 }
 
 static inline bool is_mbpf_mul(const struct nfp_insn_meta *meta)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index e92ee510fd52..9d235c0ce46a 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -479,7 +479,7 @@ nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 			pr_vlog(env, "map writes not supported\n");
 			return -EOPNOTSUPP;
 		}
-		if (is_mbpf_xadd(meta)) {
+		if (is_mbpf_atomic(meta)) {
 			err = nfp_bpf_map_mark_used(env, meta, reg,
 						    NFP_MAP_USE_ATOMIC_CNT);
 			if (err)
@@ -523,12 +523,17 @@ exit_check_ptr:
 }
 
 static int
-nfp_bpf_check_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
-		   struct bpf_verifier_env *env)
+nfp_bpf_check_atomic(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
+		     struct bpf_verifier_env *env)
 {
 	const struct bpf_reg_state *sreg = cur_regs(env) + meta->insn.src_reg;
 	const struct bpf_reg_state *dreg = cur_regs(env) + meta->insn.dst_reg;
 
+	if (meta->insn.imm != BPF_ADD) {
+		pr_vlog(env, "atomic op not implemented: %d\n", meta->insn.imm);
+		return -EOPNOTSUPP;
+	}
+
 	if (dreg->type != PTR_TO_MAP_VALUE) {
 		pr_vlog(env, "atomic add not to a map value pointer: %d\n",
 			dreg->type);
@@ -655,8 +660,8 @@ int nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx,
 	if (is_mbpf_store(meta))
 		return nfp_bpf_check_store(nfp_prog, meta, env);
 
-	if (is_mbpf_xadd(meta))
-		return nfp_bpf_check_xadd(nfp_prog, meta, env);
+	if (is_mbpf_atomic(meta))
+		return nfp_bpf_check_atomic(nfp_prog, meta, env);
 
 	if (is_mbpf_alu(meta))
 		return nfp_bpf_check_alu(nfp_prog, meta, env);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5edf2b660881..392e94b79668 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -259,15 +259,23 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
 		.off   = OFF,					\
 		.imm   = 0 })
 
-/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */
 
-#define BPF_STX_XADD(SIZE, DST, SRC, OFF)			\
+/*
+ * Atomic operations:
+ *
+ *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
+ */
+
+#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)			\
 	((struct bpf_insn) {					\
-		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,	\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
 		.dst_reg = DST,					\
 		.src_reg = SRC,					\
 		.off   = OFF,					\
-		.imm   = 0 })
+		.imm   = OP })
+
+/* Legacy alias */
+#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF)
 
 /* Memory store, *(uint *) (dst_reg + off16) = imm32 */
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a1ad32456f89..6b3996343e63 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -19,7 +19,8 @@
 
 /* ld/ldx fields */
 #define BPF_DW		0x18	/* double word (64-bit) */
-#define BPF_XADD	0xc0	/* exclusive add */
+#define BPF_ATOMIC	0xc0	/* atomic memory ops - op type in immediate */
+#define BPF_XADD	0xc0	/* exclusive add - legacy name */
 
 /* alu/jmp fields */
 #define BPF_MOV		0xb0	/* mov reg to reg */
@@ -2448,7 +2449,7 @@ union bpf_attr {
  *		running simultaneously.
  *
  *		A user should care about the synchronization by himself.
- *		For example, by using the **BPF_STX_XADD** instruction to alter
+ *		For example, by using the **BPF_ATOMIC** instructions to alter
  *		the shared data.
  *	Return
  *		A pointer to the local storage area.
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 69c3c308de5e..4836ebf459cf 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1309,8 +1309,8 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
 	INSN_3(STX, MEM,  H),			\
 	INSN_3(STX, MEM,  W),			\
 	INSN_3(STX, MEM,  DW),			\
-	INSN_3(STX, XADD, W),			\
-	INSN_3(STX, XADD, DW),			\
+	INSN_3(STX, ATOMIC, W),			\
+	INSN_3(STX, ATOMIC, DW),		\
 	/*   Immediate based. */		\
 	INSN_3(ST, MEM, B),			\
 	INSN_3(ST, MEM, H),			\
@@ -1618,13 +1618,25 @@ out:
 	LDX_PROBE(DW, 8)
 #undef LDX_PROBE
 
-	STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
-		atomic_add((u32) SRC, (atomic_t *)(unsigned long)
-			   (DST + insn->off));
+	STX_ATOMIC_W:
+		switch (IMM) {
+		case BPF_ADD:
+			/* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
+			atomic_add((u32) SRC, (atomic_t *)(unsigned long)
+				   (DST + insn->off));
+		default:
+			goto default_label;
+		}
 		CONT;
-	STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
-		atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
-			     (DST + insn->off));
+	STX_ATOMIC_DW:
+		switch (IMM) {
+		case BPF_ADD:
+			/* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
+			atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
+				     (DST + insn->off));
+		default:
+			goto default_label;
+		}
 		CONT;
 
 	default_label:
@@ -1634,7 +1646,8 @@ out:
 		 *
 		 * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
 		 */
-		pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
+		pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
+			insn->code, insn->imm);
 		BUG_ON(1);
 		return 0;
 }
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index b44d8c447afd..37c8d6e9b4cc 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -153,14 +153,16 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg,
 				insn->off, insn->src_reg);
-		else if (BPF_MODE(insn->code) == BPF_XADD)
+		else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+			 insn->imm == BPF_ADD) {
 			verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off,
 				insn->src_reg);
-		else
+		} else {
 			verbose(cbs->private_data, "BUG_%02x\n", insn->code);
+		}
 	} else if (class == BPF_ST) {
 		if (BPF_MODE(insn->code) != BPF_MEM) {
 			verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ae2aee48cf82..cfc137b81ac6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3604,13 +3604,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	return err;
 }
 
-static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
+static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
 {
 	int err;
 
-	if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
-	    insn->imm != 0) {
-		verbose(env, "BPF_XADD uses reserved fields\n");
+	if (insn->imm != BPF_ADD) {
+		verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
+		return -EINVAL;
+	}
+
+	if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) {
+		verbose(env, "invalid atomic operand size\n");
 		return -EINVAL;
 	}
 
@@ -3633,19 +3637,19 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 	    is_pkt_reg(env, insn->dst_reg) ||
 	    is_flow_key_reg(env, insn->dst_reg) ||
 	    is_sk_reg(env, insn->dst_reg)) {
-		verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
+		verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
 			insn->dst_reg,
 			reg_type_str[reg_state(env, insn->dst_reg)->type]);
 		return -EACCES;
 	}
 
-	/* check whether atomic_add can read the memory */
+	/* check whether we can read the memory */
 	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 			       BPF_SIZE(insn->code), BPF_READ, -1, true);
 	if (err)
 		return err;
 
-	/* check whether atomic_add can write into the same memory */
+	/* check whether we can write into the same memory */
 	return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 				BPF_SIZE(insn->code), BPF_WRITE, -1, true);
 }
@@ -9524,8 +9528,8 @@ static int do_check(struct bpf_verifier_env *env)
 		} else if (class == BPF_STX) {
 			enum bpf_reg_type *prev_dst_type, dst_reg_type;
 
-			if (BPF_MODE(insn->code) == BPF_XADD) {
-				err = check_xadd(env, env->insn_idx, insn);
+			if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+				err = check_atomic(env, env->insn_idx, insn);
 				if (err)
 					return err;
 				env->insn_idx++;
@@ -10010,7 +10014,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 
 		if (BPF_CLASS(insn->code) == BPF_STX &&
 		    ((BPF_MODE(insn->code) != BPF_MEM &&
-		      BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
+		      BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) {
 			verbose(env, "BPF_STX uses reserved fields\n");
 			return -EINVAL;
 		}
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index ca7d635bccd9..49ec9e8d8aed 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -4295,13 +4295,13 @@ static struct bpf_test tests[] = {
 		{ { 0, 0xffffffff } },
 		.stack_depth = 40,
 	},
-	/* BPF_STX | BPF_XADD | BPF_W/DW */
+	/* BPF_STX | BPF_ATOMIC | BPF_W/DW */
 	{
 		"STX_XADD_W: Test: 0x12 + 0x10 = 0x22",
 		.u.insns_int = {
 			BPF_ALU32_IMM(BPF_MOV, R0, 0x12),
 			BPF_ST_MEM(BPF_W, R10, -40, 0x10),
-			BPF_STX_XADD(BPF_W, R10, R0, -40),
+			BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40),
 			BPF_LDX_MEM(BPF_W, R0, R10, -40),
 			BPF_EXIT_INSN(),
 		},
@@ -4316,7 +4316,7 @@ static struct bpf_test tests[] = {
 			BPF_ALU64_REG(BPF_MOV, R1, R10),
 			BPF_ALU32_IMM(BPF_MOV, R0, 0x12),
 			BPF_ST_MEM(BPF_W, R10, -40, 0x10),
-			BPF_STX_XADD(BPF_W, R10, R0, -40),
+			BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40),
 			BPF_ALU64_REG(BPF_MOV, R0, R10),
 			BPF_ALU64_REG(BPF_SUB, R0, R1),
 			BPF_EXIT_INSN(),
@@ -4331,7 +4331,7 @@ static struct bpf_test tests[] = {
 		.u.insns_int = {
 			BPF_ALU32_IMM(BPF_MOV, R0, 0x12),
 			BPF_ST_MEM(BPF_W, R10, -40, 0x10),
-			BPF_STX_XADD(BPF_W, R10, R0, -40),
+			BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40),
 			BPF_EXIT_INSN(),
 		},
 		INTERNAL,
@@ -4352,7 +4352,7 @@ static struct bpf_test tests[] = {
 		.u.insns_int = {
 			BPF_ALU32_IMM(BPF_MOV, R0, 0x12),
 			BPF_ST_MEM(BPF_DW, R10, -40, 0x10),
-			BPF_STX_XADD(BPF_DW, R10, R0, -40),
+			BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40),
 			BPF_LDX_MEM(BPF_DW, R0, R10, -40),
 			BPF_EXIT_INSN(),
 		},
@@ -4367,7 +4367,7 @@ static struct bpf_test tests[] = {
 			BPF_ALU64_REG(BPF_MOV, R1, R10),
 			BPF_ALU32_IMM(BPF_MOV, R0, 0x12),
 			BPF_ST_MEM(BPF_DW, R10, -40, 0x10),
-			BPF_STX_XADD(BPF_DW, R10, R0, -40),
+			BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40),
 			BPF_ALU64_REG(BPF_MOV, R0, R10),
 			BPF_ALU64_REG(BPF_SUB, R0, R1),
 			BPF_EXIT_INSN(),
@@ -4382,7 +4382,7 @@ static struct bpf_test tests[] = {
 		.u.insns_int = {
 			BPF_ALU32_IMM(BPF_MOV, R0, 0x12),
 			BPF_ST_MEM(BPF_DW, R10, -40, 0x10),
-			BPF_STX_XADD(BPF_DW, R10, R0, -40),
+			BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40),
 			BPF_EXIT_INSN(),
 		},
 		INTERNAL,
diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h
index 544237980582..db67a2847395 100644
--- a/samples/bpf/bpf_insn.h
+++ b/samples/bpf/bpf_insn.h
@@ -138,11 +138,11 @@ struct bpf_insn;
 
 #define BPF_STX_XADD(SIZE, DST, SRC, OFF)			\
 	((struct bpf_insn) {					\
-		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,	\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
 		.dst_reg = DST,					\
 		.src_reg = SRC,					\
 		.off   = OFF,					\
-		.imm   = 0 })
+		.imm   = BPF_ADD })
 
 /* Memory store, *(uint *) (dst_reg + off16) = imm32 */
 
diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c
index deb0e3e0324d..c5ff7a13918c 100644
--- a/samples/bpf/cookie_uid_helper_example.c
+++ b/samples/bpf/cookie_uid_helper_example.c
@@ -147,12 +147,12 @@ static void prog_load(void)
 		 */
 		BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
 		BPF_MOV64_IMM(BPF_REG_1, 1),
-		BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1,
-				offsetof(struct stats, packets)),
+		BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1,
+			      offsetof(struct stats, packets)),
 		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
 				offsetof(struct __sk_buff, len)),
-		BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1,
-				offsetof(struct stats, bytes)),
+		BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1,
+			      offsetof(struct stats, bytes)),
 		BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6,
 				offsetof(struct __sk_buff, len)),
 		BPF_EXIT_INSN(),
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index 00aae1d33fca..23d1930e1927 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -54,7 +54,7 @@ static int test_sock(void)
 		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 		BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
-		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+		BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0),
 		BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
 		BPF_EXIT_INSN(),
 	};
diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c
index 20fbd1241db3..390ff38d2ac6 100644
--- a/samples/bpf/test_cgrp2_attach.c
+++ b/samples/bpf/test_cgrp2_attach.c
@@ -53,7 +53,7 @@ static int prog_load(int map_fd, int verdict)
 		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 		BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
-		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+		BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0),
 
 		/* Count bytes */
 		BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
@@ -64,7 +64,8 @@ static int prog_load(int map_fd, int verdict)
 		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
-		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+		BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0),
 
 		BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
 		BPF_EXIT_INSN(),
diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
index ca28b6ab8db7..e870c9039f0d 100644
--- a/tools/include/linux/filter.h
+++ b/tools/include/linux/filter.h
@@ -169,15 +169,22 @@
 		.off   = OFF,					\
 		.imm   = 0 })
 
-/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */
+/*
+ * Atomic operations:
+ *
+ *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
+ */
 
-#define BPF_STX_XADD(SIZE, DST, SRC, OFF)			\
+#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)			\
 	((struct bpf_insn) {					\
-		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,	\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC,	\
 		.dst_reg = DST,					\
 		.src_reg = SRC,					\
 		.off   = OFF,					\
-		.imm   = 0 })
+		.imm   = OP })
+
+/* Legacy alias */
+#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF)
 
 /* Memory store, *(uint *) (dst_reg + off16) = imm32 */
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a1ad32456f89..6b3996343e63 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -19,7 +19,8 @@
 
 /* ld/ldx fields */
 #define BPF_DW		0x18	/* double word (64-bit) */
-#define BPF_XADD	0xc0	/* exclusive add */
+#define BPF_ATOMIC	0xc0	/* atomic memory ops - op type in immediate */
+#define BPF_XADD	0xc0	/* exclusive add - legacy name */
 
 /* alu/jmp fields */
 #define BPF_MOV		0xb0	/* mov reg to reg */
@@ -2448,7 +2449,7 @@ union bpf_attr {
  *		running simultaneously.
  *
  *		A user should care about the synchronization by himself.
- *		For example, by using the **BPF_STX_XADD** instruction to alter
+ *		For example, by using the **BPF_ATOMIC** instructions to alter
  *		the shared data.
  *	Return
  *		A pointer to the local storage area.
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
index b549fcfacc0b..0a1fc9816cef 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
@@ -45,13 +45,13 @@ static int prog_load_cnt(int verdict, int val)
 		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 		BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */
-		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+		BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0),
 
 		BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd),
 		BPF_MOV64_IMM(BPF_REG_2, 0),
 		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
 		BPF_MOV64_IMM(BPF_REG_1, val),
-		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0),
+		BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_0, BPF_REG_1, 0),
 
 		BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd),
 		BPF_MOV64_IMM(BPF_REG_2, 0),
diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c
index d946252a25bb..0cda61da5d39 100644
--- a/tools/testing/selftests/bpf/test_cgroup_storage.c
+++ b/tools/testing/selftests/bpf/test_cgroup_storage.c
@@ -29,7 +29,7 @@ int main(int argc, char **argv)
 		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
 			     BPF_FUNC_get_local_storage),
 		BPF_MOV64_IMM(BPF_REG_1, 1),
-		BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
+		BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0),
 		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
 		BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x1),
 		BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
diff --git a/tools/testing/selftests/bpf/verifier/ctx.c b/tools/testing/selftests/bpf/verifier/ctx.c
index 93d6b1641481..23080862aafd 100644
--- a/tools/testing/selftests/bpf/verifier/ctx.c
+++ b/tools/testing/selftests/bpf/verifier/ctx.c
@@ -10,14 +10,13 @@
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 },
 {
-	"context stores via XADD",
+	"context stores via BPF_ATOMIC",
 	.insns = {
 	BPF_MOV64_IMM(BPF_REG_0, 0),
-	BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_1,
-		     BPF_REG_0, offsetof(struct __sk_buff, mark), 0),
+	BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, mark)),
 	BPF_EXIT_INSN(),
 	},
-	.errstr = "BPF_XADD stores into R1 ctx is not allowed",
+	.errstr = "BPF_ATOMIC stores into R1 ctx is not allowed",
 	.result = REJECT,
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 },
diff --git a/tools/testing/selftests/bpf/verifier/direct_packet_access.c b/tools/testing/selftests/bpf/verifier/direct_packet_access.c
index ae72536603fe..ac1e19d0f520 100644
--- a/tools/testing/selftests/bpf/verifier/direct_packet_access.c
+++ b/tools/testing/selftests/bpf/verifier/direct_packet_access.c
@@ -333,7 +333,7 @@
 	BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
 	BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
-	BPF_STX_XADD(BPF_DW, BPF_REG_4, BPF_REG_5, 0),
+	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_4, BPF_REG_5, 0),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0),
 	BPF_STX_MEM(BPF_W, BPF_REG_2, BPF_REG_5, 0),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
@@ -488,7 +488,7 @@
 	BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 11),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8),
 	BPF_MOV64_IMM(BPF_REG_4, 0xffffffff),
-	BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_4, -8),
+	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_4, -8),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
 	BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 49),
 	BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2),
diff --git a/tools/testing/selftests/bpf/verifier/leak_ptr.c b/tools/testing/selftests/bpf/verifier/leak_ptr.c
index d6eec17f2cd2..73f0dea95546 100644
--- a/tools/testing/selftests/bpf/verifier/leak_ptr.c
+++ b/tools/testing/selftests/bpf/verifier/leak_ptr.c
@@ -5,7 +5,7 @@
 	BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
 		    offsetof(struct __sk_buff, cb[0])),
 	BPF_LD_MAP_FD(BPF_REG_2, 0),
-	BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_2,
+	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_1, BPF_REG_2,
 		      offsetof(struct __sk_buff, cb[0])),
 	BPF_EXIT_INSN(),
 	},
@@ -13,7 +13,7 @@
 	.errstr_unpriv = "R2 leaks addr into mem",
 	.result_unpriv = REJECT,
 	.result = REJECT,
-	.errstr = "BPF_XADD stores into R1 ctx is not allowed",
+	.errstr = "BPF_ATOMIC stores into R1 ctx is not allowed",
 },
 {
 	"leak pointer into ctx 2",
@@ -21,14 +21,14 @@
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
 		    offsetof(struct __sk_buff, cb[0])),
-	BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_10,
+	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_1, BPF_REG_10,
 		      offsetof(struct __sk_buff, cb[0])),
 	BPF_EXIT_INSN(),
 	},
 	.errstr_unpriv = "R10 leaks addr into mem",
 	.result_unpriv = REJECT,
 	.result = REJECT,
-	.errstr = "BPF_XADD stores into R1 ctx is not allowed",
+	.errstr = "BPF_ATOMIC stores into R1 ctx is not allowed",
 },
 {
 	"leak pointer into ctx 3",
@@ -56,7 +56,7 @@
 	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
 	BPF_MOV64_IMM(BPF_REG_3, 0),
 	BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
-	BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_6, 0),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
diff --git a/tools/testing/selftests/bpf/verifier/meta_access.c b/tools/testing/selftests/bpf/verifier/meta_access.c
index 205292b8dd65..b45e8af41420 100644
--- a/tools/testing/selftests/bpf/verifier/meta_access.c
+++ b/tools/testing/selftests/bpf/verifier/meta_access.c
@@ -171,7 +171,7 @@
 	BPF_MOV64_IMM(BPF_REG_5, 42),
 	BPF_MOV64_IMM(BPF_REG_6, 24),
 	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8),
-	BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_6, -8),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8),
 	BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6),
 	BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_5),
@@ -196,7 +196,7 @@
 	BPF_MOV64_IMM(BPF_REG_5, 42),
 	BPF_MOV64_IMM(BPF_REG_6, 24),
 	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8),
-	BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_6, -8),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8),
 	BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6),
 	BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_5),
diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c
index a3fe0fbaed41..ee298627abae 100644
--- a/tools/testing/selftests/bpf/verifier/unpriv.c
+++ b/tools/testing/selftests/bpf/verifier/unpriv.c
@@ -207,7 +207,8 @@
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
 	BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
 	BPF_MOV64_IMM(BPF_REG_0, 1),
-	BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_10, BPF_REG_0, -8, 0),
+	BPF_RAW_INSN(BPF_STX | BPF_ATOMIC | BPF_DW,
+		     BPF_REG_10, BPF_REG_0, -8, BPF_ADD),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0),
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_hash_recalc),
 	BPF_EXIT_INSN(),
diff --git a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c
index ed1c2cea1dea..489062867218 100644
--- a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c
+++ b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c
@@ -82,7 +82,7 @@
 	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 	BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
-	BPF_STX_XADD(BPF_DW, BPF_REG_2, BPF_REG_3, 0),
+	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_2, BPF_REG_3, 0),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0),
 	BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
 	BPF_EXIT_INSN(),
diff --git a/tools/testing/selftests/bpf/verifier/xadd.c b/tools/testing/selftests/bpf/verifier/xadd.c
index c5de2e62cc8b..b96ef3526815 100644
--- a/tools/testing/selftests/bpf/verifier/xadd.c
+++ b/tools/testing/selftests/bpf/verifier/xadd.c
@@ -3,7 +3,7 @@
 	.insns = {
 	BPF_MOV64_IMM(BPF_REG_0, 1),
 	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
-	BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -7),
+	BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -7),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
 	BPF_EXIT_INSN(),
 	},
@@ -22,7 +22,7 @@
 	BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
 	BPF_EXIT_INSN(),
 	BPF_MOV64_IMM(BPF_REG_1, 1),
-	BPF_STX_XADD(BPF_W, BPF_REG_0, BPF_REG_1, 3),
+	BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_0, BPF_REG_1, 3),
 	BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 3),
 	BPF_EXIT_INSN(),
 	},
@@ -45,13 +45,13 @@
 	BPF_MOV64_IMM(BPF_REG_0, 1),
 	BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
 	BPF_ST_MEM(BPF_W, BPF_REG_2, 3, 0),
-	BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 1),
-	BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 2),
+	BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_2, BPF_REG_0, 1),
+	BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_2, BPF_REG_0, 2),
 	BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 1),
 	BPF_EXIT_INSN(),
 	},
 	.result = REJECT,
-	.errstr = "BPF_XADD stores into R2 pkt is not allowed",
+	.errstr = "BPF_ATOMIC stores into R2 pkt is not allowed",
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
@@ -62,8 +62,8 @@
 	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
 	BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
 	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
-	BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
-	BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_0, -8),
+	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_0, -8),
 	BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_0, 3),
 	BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_10, 2),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
@@ -82,8 +82,8 @@
 	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
 	BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
 	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -8),
-	BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -8),
-	BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -8),
+	BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -8),
+	BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -8),
 	BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_0, 3),
 	BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_10, 2),
 	BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8),
-- 
cgit v1.2.3


From 5ca419f2864a2c60940dcf4bbaeb69546200e36f Mon Sep 17 00:00:00 2001
From: Brendan Jackman <jackmanb@google.com>
Date: Thu, 14 Jan 2021 18:17:46 +0000
Subject: bpf: Add BPF_FETCH field / create atomic_fetch_add instruction

The BPF_FETCH field can be set in bpf_insn.imm, for BPF_ATOMIC
instructions, in order to have the previous value of the
atomically-modified memory location loaded into the src register
after an atomic op is carried out.

Suggested-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210114181751.768687-7-jackmanb@google.com
---
 arch/x86/net/bpf_jit_comp.c    |  4 ++++
 include/linux/filter.h         |  1 +
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/core.c              | 13 +++++++++++++
 kernel/bpf/disasm.c            |  7 +++++++
 kernel/bpf/verifier.c          | 33 ++++++++++++++++++++++++---------
 tools/include/linux/filter.h   |  1 +
 tools/include/uapi/linux/bpf.h |  3 +++
 8 files changed, 56 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b1829a534da1..eea7d8b0bb12 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -811,6 +811,10 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
 		/* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */
 		EMIT1(simple_alu_opcodes[atomic_op]);
 		break;
+	case BPF_ADD | BPF_FETCH:
+		/* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */
+		EMIT2(0x0F, 0xC1);
+		break;
 	default:
 		pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
 		return -EFAULT;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 392e94b79668..23fca41b8540 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -264,6 +264,7 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
  * Atomic operations:
  *
  *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
+ *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
  */
 
 #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)			\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6b3996343e63..ea262b009049 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -44,6 +44,9 @@
 #define BPF_CALL	0x80	/* function call */
 #define BPF_EXIT	0x90	/* function return */
 
+/* atomic op type fields (stored in immediate) */
+#define BPF_FETCH	0x01	/* fetch previous value into src reg */
+
 /* Register numbers */
 enum {
 	BPF_REG_0 = 0,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4836ebf459cf..28d6000463e4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1624,16 +1624,29 @@ out:
 			/* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
 			atomic_add((u32) SRC, (atomic_t *)(unsigned long)
 				   (DST + insn->off));
+			break;
+		case BPF_ADD | BPF_FETCH:
+			SRC = (u32) atomic_fetch_add(
+				(u32) SRC,
+				(atomic_t *)(unsigned long) (DST + insn->off));
+			break;
 		default:
 			goto default_label;
 		}
 		CONT;
+
 	STX_ATOMIC_DW:
 		switch (IMM) {
 		case BPF_ADD:
 			/* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
 			atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
 				     (DST + insn->off));
+			break;
+		case BPF_ADD | BPF_FETCH:
+			SRC = (u64) atomic64_fetch_add(
+				(u64) SRC,
+				(atomic64_t *)(unsigned long) (DST + insn->off));
+			break;
 		default:
 			goto default_label;
 		}
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 37c8d6e9b4cc..d2e20f6d0516 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -160,6 +160,13 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off,
 				insn->src_reg);
+		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+			   insn->imm == (BPF_ADD | BPF_FETCH)) {
+			verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_add((%s *)(r%d %+d), r%d)\n",
+				insn->code, insn->src_reg,
+				BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off, insn->src_reg);
 		} else {
 			verbose(cbs->private_data, "BUG_%02x\n", insn->code);
 		}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d8a85f4e5b95..6aa1fc919761 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3608,7 +3608,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 {
 	int err;
 
-	if (insn->imm != BPF_ADD) {
+	switch (insn->imm) {
+	case BPF_ADD:
+	case BPF_ADD | BPF_FETCH:
+		break;
+	default:
 		verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
 		return -EINVAL;
 	}
@@ -3650,8 +3654,20 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 		return err;
 
 	/* check whether we can write into the same memory */
-	return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
-				BPF_SIZE(insn->code), BPF_WRITE, -1, true);
+	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+			       BPF_SIZE(insn->code), BPF_WRITE, -1, true);
+	if (err)
+		return err;
+
+	if (!(insn->imm & BPF_FETCH))
+		return 0;
+
+	/* check and record load of old value into src reg  */
+	err = check_reg_arg(env, insn->src_reg, DST_OP);
+	if (err)
+		return err;
+
+	return 0;
 }
 
 static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
@@ -9528,12 +9544,6 @@ static int do_check(struct bpf_verifier_env *env)
 		} else if (class == BPF_STX) {
 			enum bpf_reg_type *prev_dst_type, dst_reg_type;
 
-			if (((BPF_MODE(insn->code) != BPF_MEM &&
-			      BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) {
-				verbose(env, "BPF_STX uses reserved fields\n");
-				return -EINVAL;
-			}
-
 			if (BPF_MODE(insn->code) == BPF_ATOMIC) {
 				err = check_atomic(env, env->insn_idx, insn);
 				if (err)
@@ -9542,6 +9552,11 @@ static int do_check(struct bpf_verifier_env *env)
 				continue;
 			}
 
+			if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
+				verbose(env, "BPF_STX uses reserved fields\n");
+				return -EINVAL;
+			}
+
 			/* check src1 operand */
 			err = check_reg_arg(env, insn->src_reg, SRC_OP);
 			if (err)
diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
index e870c9039f0d..7211ce9fba53 100644
--- a/tools/include/linux/filter.h
+++ b/tools/include/linux/filter.h
@@ -173,6 +173,7 @@
  * Atomic operations:
  *
  *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
+ *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
  */
 
 #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)			\
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6b3996343e63..ea262b009049 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -44,6 +44,9 @@
 #define BPF_CALL	0x80	/* function call */
 #define BPF_EXIT	0x90	/* function return */
 
+/* atomic op type fields (stored in immediate) */
+#define BPF_FETCH	0x01	/* fetch previous value into src reg */
+
 /* Register numbers */
 enum {
 	BPF_REG_0 = 0,
-- 
cgit v1.2.3


From 5ffa25502b5ab3d639829a2d1e316cff7f59a41e Mon Sep 17 00:00:00 2001
From: Brendan Jackman <jackmanb@google.com>
Date: Thu, 14 Jan 2021 18:17:47 +0000
Subject: bpf: Add instructions for atomic_[cmp]xchg

This adds two atomic opcodes, both of which include the BPF_FETCH
flag. XCHG without the BPF_FETCH flag would naturally encode
atomic_set. This is not supported because it would be of limited
value to userspace (it doesn't imply any barriers). CMPXCHG without
BPF_FETCH woulud be an atomic compare-and-write. We don't have such
an operation in the kernel so it isn't provided to BPF either.

There are two significant design decisions made for the CMPXCHG
instruction:

 - To solve the issue that this operation fundamentally has 3
   operands, but we only have two register fields. Therefore the
   operand we compare against (the kernel's API calls it 'old') is
   hard-coded to be R0. x86 has similar design (and A64 doesn't
   have this problem).

   A potential alternative might be to encode the other operand's
   register number in the immediate field.

 - The kernel's atomic_cmpxchg returns the old value, while the C11
   userspace APIs return a boolean indicating the comparison
   result. Which should BPF do? A64 returns the old value. x86 returns
   the old value in the hard-coded register (and also sets a
   flag). That means return-old-value is easier to JIT, so that's
   what we use.

Signed-off-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210114181751.768687-8-jackmanb@google.com
---
 arch/x86/net/bpf_jit_comp.c    |  8 ++++++++
 include/linux/filter.h         |  2 ++
 include/uapi/linux/bpf.h       |  4 +++-
 kernel/bpf/core.c              | 20 ++++++++++++++++++++
 kernel/bpf/disasm.c            | 15 +++++++++++++++
 kernel/bpf/verifier.c          | 19 +++++++++++++++++--
 tools/include/linux/filter.h   |  2 ++
 tools/include/uapi/linux/bpf.h |  4 +++-
 8 files changed, 70 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index eea7d8b0bb12..308241187582 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -815,6 +815,14 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
 		/* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */
 		EMIT2(0x0F, 0xC1);
 		break;
+	case BPF_XCHG:
+		/* src_reg = atomic_xchg(dst_reg + off, src_reg); */
+		EMIT1(0x87);
+		break;
+	case BPF_CMPXCHG:
+		/* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */
+		EMIT2(0x0F, 0xB1);
+		break;
 	default:
 		pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
 		return -EFAULT;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 23fca41b8540..d563820f197d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -265,6 +265,8 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
  *
  *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
  *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
+ *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
+ *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
  */
 
 #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)			\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ea262b009049..c001766adcbc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -45,7 +45,9 @@
 #define BPF_EXIT	0x90	/* function return */
 
 /* atomic op type fields (stored in immediate) */
-#define BPF_FETCH	0x01	/* fetch previous value into src reg */
+#define BPF_FETCH	0x01	/* not an opcode on its own, used to build others */
+#define BPF_XCHG	(0xe0 | BPF_FETCH)	/* atomic exchange */
+#define BPF_CMPXCHG	(0xf0 | BPF_FETCH)	/* atomic compare-and-write */
 
 /* Register numbers */
 enum {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 28d6000463e4..4df6daba43ef 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1630,6 +1630,16 @@ out:
 				(u32) SRC,
 				(atomic_t *)(unsigned long) (DST + insn->off));
 			break;
+		case BPF_XCHG:
+			SRC = (u32) atomic_xchg(
+				(atomic_t *)(unsigned long) (DST + insn->off),
+				(u32) SRC);
+			break;
+		case BPF_CMPXCHG:
+			BPF_R0 = (u32) atomic_cmpxchg(
+				(atomic_t *)(unsigned long) (DST + insn->off),
+				(u32) BPF_R0, (u32) SRC);
+			break;
 		default:
 			goto default_label;
 		}
@@ -1647,6 +1657,16 @@ out:
 				(u64) SRC,
 				(atomic64_t *)(unsigned long) (DST + insn->off));
 			break;
+		case BPF_XCHG:
+			SRC = (u64) atomic64_xchg(
+				(atomic64_t *)(unsigned long) (DST + insn->off),
+				(u64) SRC);
+			break;
+		case BPF_CMPXCHG:
+			BPF_R0 = (u64) atomic64_cmpxchg(
+				(atomic64_t *)(unsigned long) (DST + insn->off),
+				(u64) BPF_R0, (u64) SRC);
+			break;
 		default:
 			goto default_label;
 		}
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index d2e20f6d0516..ee8d1132767b 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -167,6 +167,21 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 				BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off, insn->src_reg);
+		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+			   insn->imm == BPF_CMPXCHG) {
+			verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n",
+				insn->code,
+				BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off,
+				insn->src_reg);
+		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+			   insn->imm == BPF_XCHG) {
+			verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n",
+				insn->code, insn->src_reg,
+				BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off, insn->src_reg);
 		} else {
 			verbose(cbs->private_data, "BUG_%02x\n", insn->code);
 		}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6aa1fc919761..89a4d154ab37 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3606,11 +3606,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
 {
+	int load_reg;
 	int err;
 
 	switch (insn->imm) {
 	case BPF_ADD:
 	case BPF_ADD | BPF_FETCH:
+	case BPF_XCHG:
+	case BPF_CMPXCHG:
 		break;
 	default:
 		verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
@@ -3632,6 +3635,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 	if (err)
 		return err;
 
+	if (insn->imm == BPF_CMPXCHG) {
+		/* Check comparison of R0 with memory location */
+		err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+		if (err)
+			return err;
+	}
+
 	if (is_pointer_value(env, insn->src_reg)) {
 		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
 		return -EACCES;
@@ -3662,8 +3672,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 	if (!(insn->imm & BPF_FETCH))
 		return 0;
 
-	/* check and record load of old value into src reg  */
-	err = check_reg_arg(env, insn->src_reg, DST_OP);
+	if (insn->imm == BPF_CMPXCHG)
+		load_reg = BPF_REG_0;
+	else
+		load_reg = insn->src_reg;
+
+	/* check and record load of old value */
+	err = check_reg_arg(env, load_reg, DST_OP);
 	if (err)
 		return err;
 
diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
index 7211ce9fba53..d75998b0d5ac 100644
--- a/tools/include/linux/filter.h
+++ b/tools/include/linux/filter.h
@@ -174,6 +174,8 @@
  *
  *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
  *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
+ *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
+ *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
  */
 
 #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)			\
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ea262b009049..c001766adcbc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -45,7 +45,9 @@
 #define BPF_EXIT	0x90	/* function return */
 
 /* atomic op type fields (stored in immediate) */
-#define BPF_FETCH	0x01	/* fetch previous value into src reg */
+#define BPF_FETCH	0x01	/* not an opcode on its own, used to build others */
+#define BPF_XCHG	(0xe0 | BPF_FETCH)	/* atomic exchange */
+#define BPF_CMPXCHG	(0xf0 | BPF_FETCH)	/* atomic compare-and-write */
 
 /* Register numbers */
 enum {
-- 
cgit v1.2.3


From 981f94c3e92146705baf97fb417a5ed1ab1a79a5 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <jackmanb@google.com>
Date: Thu, 14 Jan 2021 18:17:49 +0000
Subject: bpf: Add bitwise atomic instructions

This adds instructions for

atomic[64]_[fetch_]and
atomic[64]_[fetch_]or
atomic[64]_[fetch_]xor

All these operations are isomorphic enough to implement with the same
verifier, interpreter, and x86 JIT code, hence being a single commit.

The main interesting thing here is that x86 doesn't directly support
the fetch_ version these operations, so we need to generate a CMPXCHG
loop in the JIT. This requires the use of two temporary registers,
IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose.

Signed-off-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210114181751.768687-10-jackmanb@google.com
---
 arch/x86/net/bpf_jit_comp.c  | 50 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/filter.h       |  6 ++++++
 kernel/bpf/core.c            |  3 +++
 kernel/bpf/disasm.c          | 21 +++++++++++++++----
 kernel/bpf/verifier.c        |  6 ++++++
 tools/include/linux/filter.h |  6 ++++++
 6 files changed, 87 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 308241187582..1d4d50199293 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -808,6 +808,10 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
 	/* emit opcode */
 	switch (atomic_op) {
 	case BPF_ADD:
+	case BPF_SUB:
+	case BPF_AND:
+	case BPF_OR:
+	case BPF_XOR:
 		/* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */
 		EMIT1(simple_alu_opcodes[atomic_op]);
 		break;
@@ -1292,8 +1296,52 @@ st:			if (is_imm8(insn->off))
 
 		case BPF_STX | BPF_ATOMIC | BPF_W:
 		case BPF_STX | BPF_ATOMIC | BPF_DW:
+			if (insn->imm == (BPF_AND | BPF_FETCH) ||
+			    insn->imm == (BPF_OR | BPF_FETCH) ||
+			    insn->imm == (BPF_XOR | BPF_FETCH)) {
+				u8 *branch_target;
+				bool is64 = BPF_SIZE(insn->code) == BPF_DW;
+
+				/*
+				 * Can't be implemented with a single x86 insn.
+				 * Need to do a CMPXCHG loop.
+				 */
+
+				/* Will need RAX as a CMPXCHG operand so save R0 */
+				emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0);
+				branch_target = prog;
+				/* Load old value */
+				emit_ldx(&prog, BPF_SIZE(insn->code),
+					 BPF_REG_0, dst_reg, insn->off);
+				/*
+				 * Perform the (commutative) operation locally,
+				 * put the result in the AUX_REG.
+				 */
+				emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0);
+				maybe_emit_mod(&prog, AUX_REG, src_reg, is64);
+				EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)],
+				      add_2reg(0xC0, AUX_REG, src_reg));
+				/* Attempt to swap in new value */
+				err = emit_atomic(&prog, BPF_CMPXCHG,
+						  dst_reg, AUX_REG, insn->off,
+						  BPF_SIZE(insn->code));
+				if (WARN_ON(err))
+					return err;
+				/*
+				 * ZF tells us whether we won the race. If it's
+				 * cleared we need to try again.
+				 */
+				EMIT2(X86_JNE, -(prog - branch_target) - 2);
+				/* Return the pre-modification value */
+				emit_mov_reg(&prog, is64, src_reg, BPF_REG_0);
+				/* Restore R0 after clobbering RAX */
+				emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
+				break;
+
+			}
+
 			err = emit_atomic(&prog, insn->imm, dst_reg, src_reg,
-					  insn->off, BPF_SIZE(insn->code));
+						  insn->off, BPF_SIZE(insn->code));
 			if (err)
 				return err;
 			break;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d563820f197d..7fdce5407214 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -264,7 +264,13 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
  * Atomic operations:
  *
  *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
+ *   BPF_AND                  *(uint *) (dst_reg + off16) &= src_reg
+ *   BPF_OR                   *(uint *) (dst_reg + off16) |= src_reg
+ *   BPF_XOR                  *(uint *) (dst_reg + off16) ^= src_reg
  *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
+ *   BPF_AND | BPF_FETCH      src_reg = atomic_fetch_and(dst_reg + off16, src_reg);
+ *   BPF_OR | BPF_FETCH       src_reg = atomic_fetch_or(dst_reg + off16, src_reg);
+ *   BPF_XOR | BPF_FETCH      src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
  *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
  *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
  */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8669e685825f..5bbd4884ff7a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1642,6 +1642,9 @@ out:
 	STX_ATOMIC_W:
 		switch (IMM) {
 		ATOMIC_ALU_OP(BPF_ADD, add)
+		ATOMIC_ALU_OP(BPF_AND, and)
+		ATOMIC_ALU_OP(BPF_OR, or)
+		ATOMIC_ALU_OP(BPF_XOR, xor)
 #undef ATOMIC_ALU_OP
 
 		case BPF_XCHG:
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index ee8d1132767b..19ff8fed7f4b 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -80,6 +80,13 @@ const char *const bpf_alu_string[16] = {
 	[BPF_END >> 4]  = "endian",
 };
 
+static const char *const bpf_atomic_alu_string[16] = {
+	[BPF_ADD >> 4]  = "add",
+	[BPF_AND >> 4]  = "and",
+	[BPF_OR >> 4]  = "or",
+	[BPF_XOR >> 4]  = "or",
+};
+
 static const char *const bpf_ldst_string[] = {
 	[BPF_W >> 3]  = "u32",
 	[BPF_H >> 3]  = "u16",
@@ -154,17 +161,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 				insn->dst_reg,
 				insn->off, insn->src_reg);
 		else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
-			 insn->imm == BPF_ADD) {
-			verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+			 (insn->imm == BPF_ADD || insn->imm == BPF_ADD ||
+			  insn->imm == BPF_OR || insn->imm == BPF_XOR)) {
+			verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off,
+				bpf_alu_string[BPF_OP(insn->imm) >> 4],
 				insn->src_reg);
 		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
-			   insn->imm == (BPF_ADD | BPF_FETCH)) {
-			verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_add((%s *)(r%d %+d), r%d)\n",
+			   (insn->imm == (BPF_ADD | BPF_FETCH) ||
+			    insn->imm == (BPF_AND | BPF_FETCH) ||
+			    insn->imm == (BPF_OR | BPF_FETCH) ||
+			    insn->imm == (BPF_XOR | BPF_FETCH))) {
+			verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n",
 				insn->code, insn->src_reg,
 				BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+				bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4],
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off, insn->src_reg);
 		} else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 89a4d154ab37..0f82d5d46e2c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3612,6 +3612,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 	switch (insn->imm) {
 	case BPF_ADD:
 	case BPF_ADD | BPF_FETCH:
+	case BPF_AND:
+	case BPF_AND | BPF_FETCH:
+	case BPF_OR:
+	case BPF_OR | BPF_FETCH:
+	case BPF_XOR:
+	case BPF_XOR | BPF_FETCH:
 	case BPF_XCHG:
 	case BPF_CMPXCHG:
 		break;
diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
index d75998b0d5ac..736bdeccdfe4 100644
--- a/tools/include/linux/filter.h
+++ b/tools/include/linux/filter.h
@@ -173,7 +173,13 @@
  * Atomic operations:
  *
  *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
+ *   BPF_AND                  *(uint *) (dst_reg + off16) &= src_reg
+ *   BPF_OR                   *(uint *) (dst_reg + off16) |= src_reg
+ *   BPF_XOR                  *(uint *) (dst_reg + off16) ^= src_reg
  *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
+ *   BPF_AND | BPF_FETCH      src_reg = atomic_fetch_and(dst_reg + off16, src_reg);
+ *   BPF_OR | BPF_FETCH       src_reg = atomic_fetch_or(dst_reg + off16, src_reg);
+ *   BPF_XOR | BPF_FETCH      src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
  *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
  *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
  */
-- 
cgit v1.2.3


From bd7525dacd7e204f8cae061941fb9001c89d6988 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 14 Jan 2021 14:40:42 +0100
Subject: bpf: Move stack_map_get_build_id into lib

Moving stack_map_get_build_id into lib with
declaration in linux/buildid.h header:

  int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id);

This function returns build id for given struct vm_area_struct.
There is no functional change to stack_map_get_build_id function.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210114134044.1418404-2-jolsa@kernel.org
---
 include/linux/buildid.h |  11 ++++
 kernel/bpf/stackmap.c   | 143 ++----------------------------------------------
 lib/Makefile            |   3 +-
 lib/buildid.c           | 136 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+), 140 deletions(-)
 create mode 100644 include/linux/buildid.h
 create mode 100644 lib/buildid.c

(limited to 'include/linux')

diff --git a/include/linux/buildid.h b/include/linux/buildid.h
new file mode 100644
index 000000000000..08028a212589
--- /dev/null
+++ b/include/linux/buildid.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BUILDID_H
+#define _LINUX_BUILDID_H
+
+#include <linux/mm_types.h>
+
+#define BUILD_ID_SIZE_MAX 20
+
+int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id);
+
+#endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index aea96b638473..55d254a59f07 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,10 +7,9 @@
 #include <linux/kernel.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
-#include <linux/elf.h>
-#include <linux/pagemap.h>
 #include <linux/irq_work.h>
 #include <linux/btf_ids.h>
+#include <linux/buildid.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK					\
@@ -143,140 +142,6 @@ free_smap:
 	return ERR_PTR(err);
 }
 
-#define BPF_BUILD_ID 3
-/*
- * Parse build id from the note segment. This logic can be shared between
- * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
- * identical.
- */
-static inline int stack_map_parse_build_id(void *page_addr,
-					   unsigned char *build_id,
-					   void *note_start,
-					   Elf32_Word note_size)
-{
-	Elf32_Word note_offs = 0, new_offs;
-
-	/* check for overflow */
-	if (note_start < page_addr || note_start + note_size < note_start)
-		return -EINVAL;
-
-	/* only supports note that fits in the first page */
-	if (note_start + note_size > page_addr + PAGE_SIZE)
-		return -EINVAL;
-
-	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
-		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
-
-		if (nhdr->n_type == BPF_BUILD_ID &&
-		    nhdr->n_namesz == sizeof("GNU") &&
-		    nhdr->n_descsz > 0 &&
-		    nhdr->n_descsz <= BPF_BUILD_ID_SIZE) {
-			memcpy(build_id,
-			       note_start + note_offs +
-			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
-			       nhdr->n_descsz);
-			memset(build_id + nhdr->n_descsz, 0,
-			       BPF_BUILD_ID_SIZE - nhdr->n_descsz);
-			return 0;
-		}
-		new_offs = note_offs + sizeof(Elf32_Nhdr) +
-			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
-		if (new_offs <= note_offs)  /* overflow */
-			break;
-		note_offs = new_offs;
-	}
-	return -EINVAL;
-}
-
-/* Parse build ID from 32-bit ELF */
-static int stack_map_get_build_id_32(void *page_addr,
-				     unsigned char *build_id)
-{
-	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
-	Elf32_Phdr *phdr;
-	int i;
-
-	/* only supports phdr that fits in one page */
-	if (ehdr->e_phnum >
-	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
-		return -EINVAL;
-
-	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
-
-	for (i = 0; i < ehdr->e_phnum; ++i) {
-		if (phdr[i].p_type == PT_NOTE &&
-		    !stack_map_parse_build_id(page_addr, build_id,
-					      page_addr + phdr[i].p_offset,
-					      phdr[i].p_filesz))
-			return 0;
-	}
-	return -EINVAL;
-}
-
-/* Parse build ID from 64-bit ELF */
-static int stack_map_get_build_id_64(void *page_addr,
-				     unsigned char *build_id)
-{
-	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
-	Elf64_Phdr *phdr;
-	int i;
-
-	/* only supports phdr that fits in one page */
-	if (ehdr->e_phnum >
-	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
-		return -EINVAL;
-
-	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
-
-	for (i = 0; i < ehdr->e_phnum; ++i) {
-		if (phdr[i].p_type == PT_NOTE &&
-		    !stack_map_parse_build_id(page_addr, build_id,
-					      page_addr + phdr[i].p_offset,
-					      phdr[i].p_filesz))
-			return 0;
-	}
-	return -EINVAL;
-}
-
-/* Parse build ID of ELF file mapped to vma */
-static int stack_map_get_build_id(struct vm_area_struct *vma,
-				  unsigned char *build_id)
-{
-	Elf32_Ehdr *ehdr;
-	struct page *page;
-	void *page_addr;
-	int ret;
-
-	/* only works for page backed storage  */
-	if (!vma->vm_file)
-		return -EINVAL;
-
-	page = find_get_page(vma->vm_file->f_mapping, 0);
-	if (!page)
-		return -EFAULT;	/* page not mapped */
-
-	ret = -EINVAL;
-	page_addr = kmap_atomic(page);
-	ehdr = (Elf32_Ehdr *)page_addr;
-
-	/* compare magic x7f "ELF" */
-	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
-		goto out;
-
-	/* only support executable file and shared object file */
-	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
-		goto out;
-
-	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
-		ret = stack_map_get_build_id_32(page_addr, build_id);
-	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
-		ret = stack_map_get_build_id_64(page_addr, build_id);
-out:
-	kunmap_atomic(page_addr);
-	put_page(page);
-	return ret;
-}
-
 static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
@@ -317,18 +182,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 		for (i = 0; i < trace_nr; i++) {
 			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
 			id_offs[i].ip = ips[i];
-			memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
+			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
 		}
 		return;
 	}
 
 	for (i = 0; i < trace_nr; i++) {
 		vma = find_vma(current->mm, ips[i]);
-		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+		if (!vma || build_id_parse(vma, id_offs[i].build_id)) {
 			/* per entry fall back to ips */
 			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
 			id_offs[i].ip = ips[i];
-			memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
+			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
 			continue;
 		}
 		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
diff --git a/lib/Makefile b/lib/Makefile
index afeff05fa8c5..a6b160c3a4fa 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -36,7 +36,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
-	 nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o
+	 nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \
+	 buildid.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/buildid.c b/lib/buildid.c
new file mode 100644
index 000000000000..4a4f520c0e29
--- /dev/null
+++ b/lib/buildid.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/buildid.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
+
+#define BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ */
+static inline int parse_build_id(void *page_addr,
+				 unsigned char *build_id,
+				 void *note_start,
+				 Elf32_Word note_size)
+{
+	Elf32_Word note_offs = 0, new_offs;
+
+	/* check for overflow */
+	if (note_start < page_addr || note_start + note_size < note_start)
+		return -EINVAL;
+
+	/* only supports note that fits in the first page */
+	if (note_start + note_size > page_addr + PAGE_SIZE)
+		return -EINVAL;
+
+	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+		if (nhdr->n_type == BUILD_ID &&
+		    nhdr->n_namesz == sizeof("GNU") &&
+		    nhdr->n_descsz > 0 &&
+		    nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
+			memcpy(build_id,
+			       note_start + note_offs +
+			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+			       nhdr->n_descsz);
+			memset(build_id + nhdr->n_descsz, 0,
+			       BUILD_ID_SIZE_MAX - nhdr->n_descsz);
+			return 0;
+		}
+		new_offs = note_offs + sizeof(Elf32_Nhdr) +
+			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+		if (new_offs <= note_offs)  /* overflow */
+			break;
+		note_offs = new_offs;
+	}
+	return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF */
+static int get_build_id_32(void *page_addr, unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+	Elf32_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i) {
+		if (phdr[i].p_type == PT_NOTE &&
+		    !parse_build_id(page_addr, build_id,
+				    page_addr + phdr[i].p_offset,
+				    phdr[i].p_filesz))
+			return 0;
+	}
+	return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF */
+static int get_build_id_64(void *page_addr, unsigned char *build_id)
+{
+	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+	Elf64_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i) {
+		if (phdr[i].p_type == PT_NOTE &&
+		    !parse_build_id(page_addr, build_id,
+				    page_addr + phdr[i].p_offset,
+				    phdr[i].p_filesz))
+			return 0;
+	}
+	return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma */
+int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr;
+	struct page *page;
+	void *page_addr;
+	int ret;
+
+	/* only works for page backed storage  */
+	if (!vma->vm_file)
+		return -EINVAL;
+
+	page = find_get_page(vma->vm_file->f_mapping, 0);
+	if (!page)
+		return -EFAULT;	/* page not mapped */
+
+	ret = -EINVAL;
+	page_addr = kmap_atomic(page);
+	ehdr = (Elf32_Ehdr *)page_addr;
+
+	/* compare magic x7f "ELF" */
+	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+		goto out;
+
+	/* only support executable file and shared object file */
+	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+		goto out;
+
+	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+		ret = get_build_id_32(page_addr, build_id);
+	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+		ret = get_build_id_64(page_addr, build_id);
+out:
+	kunmap_atomic(page_addr);
+	put_page(page);
+	return ret;
+}
-- 
cgit v1.2.3


From 921f88fc891922b325b3668cd026a386571ed602 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 14 Jan 2021 14:40:43 +0100
Subject: bpf: Add size arg to build_id_parse function

It's possible to have other build id types (other than default SHA1).
Currently there's also ld support for MD5 build id.

Adding size argument to build_id_parse function, that returns (if defined)
size of the parsed build id, so we can recognize the build id type.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210114134044.1418404-3-jolsa@kernel.org
---
 include/linux/buildid.h |  3 ++-
 kernel/bpf/stackmap.c   |  2 +-
 lib/buildid.c           | 29 +++++++++++++++++++++--------
 3 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 08028a212589..40232f90db6e 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -6,6 +6,7 @@
 
 #define BUILD_ID_SIZE_MAX 20
 
-int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id);
+int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
+		   __u32 *size);
 
 #endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 55d254a59f07..cabaf7db8efc 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -189,7 +189,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 
 	for (i = 0; i < trace_nr; i++) {
 		vma = find_vma(current->mm, ips[i]);
-		if (!vma || build_id_parse(vma, id_offs[i].build_id)) {
+		if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
 			/* per entry fall back to ips */
 			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
 			id_offs[i].ip = ips[i];
diff --git a/lib/buildid.c b/lib/buildid.c
index 4a4f520c0e29..6156997c3895 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -12,6 +12,7 @@
  */
 static inline int parse_build_id(void *page_addr,
 				 unsigned char *build_id,
+				 __u32 *size,
 				 void *note_start,
 				 Elf32_Word note_size)
 {
@@ -38,6 +39,8 @@ static inline int parse_build_id(void *page_addr,
 			       nhdr->n_descsz);
 			memset(build_id + nhdr->n_descsz, 0,
 			       BUILD_ID_SIZE_MAX - nhdr->n_descsz);
+			if (size)
+				*size = nhdr->n_descsz;
 			return 0;
 		}
 		new_offs = note_offs + sizeof(Elf32_Nhdr) +
@@ -50,7 +53,8 @@ static inline int parse_build_id(void *page_addr,
 }
 
 /* Parse build ID from 32-bit ELF */
-static int get_build_id_32(void *page_addr, unsigned char *build_id)
+static int get_build_id_32(void *page_addr, unsigned char *build_id,
+			   __u32 *size)
 {
 	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
 	Elf32_Phdr *phdr;
@@ -65,7 +69,7 @@ static int get_build_id_32(void *page_addr, unsigned char *build_id)
 
 	for (i = 0; i < ehdr->e_phnum; ++i) {
 		if (phdr[i].p_type == PT_NOTE &&
-		    !parse_build_id(page_addr, build_id,
+		    !parse_build_id(page_addr, build_id, size,
 				    page_addr + phdr[i].p_offset,
 				    phdr[i].p_filesz))
 			return 0;
@@ -74,7 +78,8 @@ static int get_build_id_32(void *page_addr, unsigned char *build_id)
 }
 
 /* Parse build ID from 64-bit ELF */
-static int get_build_id_64(void *page_addr, unsigned char *build_id)
+static int get_build_id_64(void *page_addr, unsigned char *build_id,
+			   __u32 *size)
 {
 	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
 	Elf64_Phdr *phdr;
@@ -89,7 +94,7 @@ static int get_build_id_64(void *page_addr, unsigned char *build_id)
 
 	for (i = 0; i < ehdr->e_phnum; ++i) {
 		if (phdr[i].p_type == PT_NOTE &&
-		    !parse_build_id(page_addr, build_id,
+		    !parse_build_id(page_addr, build_id, size,
 				    page_addr + phdr[i].p_offset,
 				    phdr[i].p_filesz))
 			return 0;
@@ -97,8 +102,16 @@ static int get_build_id_64(void *page_addr, unsigned char *build_id)
 	return -EINVAL;
 }
 
-/* Parse build ID of ELF file mapped to vma */
-int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id)
+/*
+ * Parse build ID of ELF file mapped to vma
+ * @vma:      vma object
+ * @build_id: buffer to store build id, at least BUILD_ID_SIZE long
+ * @size:     returns actual build id size in case of success
+ *
+ * Returns 0 on success, otherwise error (< 0).
+ */
+int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
+		   __u32 *size)
 {
 	Elf32_Ehdr *ehdr;
 	struct page *page;
@@ -126,9 +139,9 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id)
 		goto out;
 
 	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
-		ret = get_build_id_32(page_addr, build_id);
+		ret = get_build_id_32(page_addr, build_id, size);
 	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
-		ret = get_build_id_64(page_addr, build_id);
+		ret = get_build_id_64(page_addr, build_id, size);
 out:
 	kunmap_atomic(page_addr);
 	put_page(page);
-- 
cgit v1.2.3


From d6e645012d97164609260ac567b304681734c5e2 Mon Sep 17 00:00:00 2001
From: Tushar Sugandhi <tusharsu@linux.microsoft.com>
Date: Thu, 7 Jan 2021 20:07:03 -0800
Subject: IMA: define a hook to measure kernel integrity critical data

IMA provides capabilities to measure file and buffer data.  However,
various data structures, policies, and states stored in kernel memory
also impact the integrity of the system.  Several kernel subsystems
contain such integrity critical data.  These kernel subsystems help
protect the integrity of the system.  Currently, IMA does not provide a
generic function for measuring kernel integrity critical data.

Define ima_measure_critical_data, a new IMA hook, to measure kernel
integrity critical data.

Signed-off-by: Tushar Sugandhi <tusharsu@linux.microsoft.com>
Reviewed-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/ima.h               |  7 +++++++
 security/integrity/ima/ima.h      |  1 +
 security/integrity/ima/ima_api.c  |  2 +-
 security/integrity/ima/ima_main.c | 24 ++++++++++++++++++++++++
 4 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ima.h b/include/linux/ima.h
index 7db9cca1af34..59bd90ac3c35 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -31,6 +31,9 @@ extern void ima_post_path_mknod(struct dentry *dentry);
 extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
 extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size);
 extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
+extern void ima_measure_critical_data(const char *event_name,
+				      const void *buf, size_t buf_len,
+				      bool hash);
 
 #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM
 extern void ima_appraise_parse_cmdline(void);
@@ -128,6 +131,10 @@ static inline int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size
 }
 
 static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {}
+
+static inline void ima_measure_critical_data(const char *event_name,
+					     const void *buf, size_t buf_len,
+					     bool hash) {}
 #endif /* CONFIG_IMA */
 
 #ifndef CONFIG_IMA_KEXEC
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 0b4634515839..aa312472c7c5 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -201,6 +201,7 @@ static inline unsigned int ima_hash_key(u8 *digest)
 	hook(POLICY_CHECK, policy)			\
 	hook(KEXEC_CMDLINE, kexec_cmdline)		\
 	hook(KEY_CHECK, key)				\
+	hook(CRITICAL_DATA, critical_data)		\
 	hook(MAX_CHECK, none)
 
 #define __ima_hook_enumify(ENUM, str)	ENUM,
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index e76499b1ce78..1dd70dc68ffd 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -176,7 +176,7 @@ err_out:
  *		subj=, obj=, type=, func=, mask=, fsmagic=
  *	subj,obj, and type: are LSM specific.
  *	func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK
- *	| KEXEC_CMDLINE | KEY_CHECK
+ *	| KEXEC_CMDLINE | KEY_CHECK | CRITICAL_DATA
  *	mask: contains the permission mask
  *	fsmagic: hex value
  *
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 250e52114230..251e7b4006f4 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -943,6 +943,30 @@ void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
 	fdput(f);
 }
 
+/**
+ * ima_measure_critical_data - measure kernel integrity critical data
+ * @event_name: event name for the record in the IMA measurement list
+ * @buf: pointer to buffer data
+ * @buf_len: length of buffer data (in bytes)
+ * @hash: measure buffer data hash
+ *
+ * Measure data critical to the integrity of the kernel into the IMA log
+ * and extend the pcr.  Examples of critical data could be various data
+ * structures, policies, and states stored in kernel memory that can
+ * impact the integrity of the system.
+ */
+void ima_measure_critical_data(const char *event_name,
+			       const void *buf, size_t buf_len,
+			       bool hash)
+{
+	if (!event_name || !buf || !buf_len)
+		return;
+
+	process_buffer_measurement(NULL, buf, buf_len, event_name,
+				   CRITICAL_DATA, 0, NULL,
+				   hash);
+}
+
 static int __init init_ima(void)
 {
 	int error;
-- 
cgit v1.2.3


From 9f5d7d23cc5ec61a92076b73665fcb9aaa5bb5a0 Mon Sep 17 00:00:00 2001
From: Tushar Sugandhi <tusharsu@linux.microsoft.com>
Date: Thu, 7 Jan 2021 20:07:06 -0800
Subject: IMA: extend critical data hook to limit the measurement based on a
 label

The IMA hook ima_measure_critical_data() does not support a way to
specify the source of the critical data provider.  Thus, the data
measurement cannot be constrained based on the data source label
in the IMA policy.

Extend the IMA hook ima_measure_critical_data() to support passing
the data source label as an input parameter, so that the policy rule can
be used to limit the measurements based on the label.

Signed-off-by: Tushar Sugandhi <tusharsu@linux.microsoft.com>
Reviewed-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/ima.h               | 7 +++++--
 security/integrity/ima/ima_main.c | 8 +++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ima.h b/include/linux/ima.h
index 59bd90ac3c35..2ac834badbbe 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -31,7 +31,8 @@ extern void ima_post_path_mknod(struct dentry *dentry);
 extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
 extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size);
 extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
-extern void ima_measure_critical_data(const char *event_name,
+extern void ima_measure_critical_data(const char *event_label,
+				      const char *event_name,
 				      const void *buf, size_t buf_len,
 				      bool hash);
 
@@ -132,9 +133,11 @@ static inline int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size
 
 static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {}
 
-static inline void ima_measure_critical_data(const char *event_name,
+static inline void ima_measure_critical_data(const char *event_label,
+					     const char *event_name,
 					     const void *buf, size_t buf_len,
 					     bool hash) {}
+
 #endif /* CONFIG_IMA */
 
 #ifndef CONFIG_IMA_KEXEC
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 251e7b4006f4..6a429846f90a 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -945,6 +945,7 @@ void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
 
 /**
  * ima_measure_critical_data - measure kernel integrity critical data
+ * @event_label: unique event label for grouping and limiting critical data
  * @event_name: event name for the record in the IMA measurement list
  * @buf: pointer to buffer data
  * @buf_len: length of buffer data (in bytes)
@@ -955,15 +956,16 @@ void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
  * structures, policies, and states stored in kernel memory that can
  * impact the integrity of the system.
  */
-void ima_measure_critical_data(const char *event_name,
+void ima_measure_critical_data(const char *event_label,
+			       const char *event_name,
 			       const void *buf, size_t buf_len,
 			       bool hash)
 {
-	if (!event_name || !buf || !buf_len)
+	if (!event_name || !event_label || !buf || !buf_len)
 		return;
 
 	process_buffer_measurement(NULL, buf, buf_len, event_name,
-				   CRITICAL_DATA, 0, NULL,
+				   CRITICAL_DATA, 0, event_label,
 				   hash);
 }
 
-- 
cgit v1.2.3


From 3c15e00e7b58bc2b37e53d2612f0a0163281be77 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Thu, 26 Nov 2020 11:41:42 +0100
Subject: mfd/bus: sunxi-rsb: Make .remove() callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver core ignores the return value of struct device_driver::remove
because there is only little that can be done. To simplify the quest to
make this function return void, let struct sunxi_rsb_driver::remove
return void, too. All users already unconditionally return 0, this
commit makes this obvious and ensures future users don't behave
differently. To simplify even further, make axp20x_device_remove()
return void instead of returning 0 unconditionally, too.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Reviewed-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/bus/sunxi-rsb.c    | 4 +++-
 drivers/mfd/axp20x-i2c.c   | 4 +++-
 drivers/mfd/axp20x-rsb.c   | 4 ++--
 drivers/mfd/axp20x.c       | 4 +---
 include/linux/mfd/axp20x.h | 2 +-
 include/linux/sunxi-rsb.h  | 2 +-
 6 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/sunxi-rsb.c b/drivers/bus/sunxi-rsb.c
index 1bb00a959c67..117716e23ffb 100644
--- a/drivers/bus/sunxi-rsb.c
+++ b/drivers/bus/sunxi-rsb.c
@@ -170,7 +170,9 @@ static int sunxi_rsb_device_remove(struct device *dev)
 {
 	const struct sunxi_rsb_driver *drv = to_sunxi_rsb_driver(dev->driver);
 
-	return drv->remove(to_sunxi_rsb_device(dev));
+	drv->remove(to_sunxi_rsb_device(dev));
+
+	return 0;
 }
 
 static struct bus_type sunxi_rsb_bus = {
diff --git a/drivers/mfd/axp20x-i2c.c b/drivers/mfd/axp20x-i2c.c
index 2cfde81f5fbf..00ab48018d8d 100644
--- a/drivers/mfd/axp20x-i2c.c
+++ b/drivers/mfd/axp20x-i2c.c
@@ -54,7 +54,9 @@ static int axp20x_i2c_remove(struct i2c_client *i2c)
 {
 	struct axp20x_dev *axp20x = i2c_get_clientdata(i2c);
 
-	return axp20x_device_remove(axp20x);
+	axp20x_device_remove(axp20x);
+
+	return 0;
 }
 
 #ifdef CONFIG_OF
diff --git a/drivers/mfd/axp20x-rsb.c b/drivers/mfd/axp20x-rsb.c
index 4cdc79f5cc48..214bc0d84d44 100644
--- a/drivers/mfd/axp20x-rsb.c
+++ b/drivers/mfd/axp20x-rsb.c
@@ -49,11 +49,11 @@ static int axp20x_rsb_probe(struct sunxi_rsb_device *rdev)
 	return axp20x_device_probe(axp20x);
 }
 
-static int axp20x_rsb_remove(struct sunxi_rsb_device *rdev)
+static void axp20x_rsb_remove(struct sunxi_rsb_device *rdev)
 {
 	struct axp20x_dev *axp20x = sunxi_rsb_device_get_drvdata(rdev);
 
-	return axp20x_device_remove(axp20x);
+	axp20x_device_remove(axp20x);
 }
 
 static const struct of_device_id axp20x_rsb_of_match[] = {
diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index aa59496e4376..3eae04e24ac8 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -987,7 +987,7 @@ int axp20x_device_probe(struct axp20x_dev *axp20x)
 }
 EXPORT_SYMBOL(axp20x_device_probe);
 
-int axp20x_device_remove(struct axp20x_dev *axp20x)
+void axp20x_device_remove(struct axp20x_dev *axp20x)
 {
 	if (axp20x == axp20x_pm_power_off) {
 		axp20x_pm_power_off = NULL;
@@ -996,8 +996,6 @@ int axp20x_device_remove(struct axp20x_dev *axp20x)
 
 	mfd_remove_devices(axp20x->dev);
 	regmap_del_irq_chip(axp20x->irq, axp20x->regmap_irqc);
-
-	return 0;
 }
 EXPORT_SYMBOL(axp20x_device_remove);
 
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index fd5957c042da..9ab0e2fca7ea 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -696,6 +696,6 @@ int axp20x_device_probe(struct axp20x_dev *axp20x);
  *
  * This tells the axp20x core to remove the associated mfd devices
  */
-int axp20x_device_remove(struct axp20x_dev *axp20x);
+void axp20x_device_remove(struct axp20x_dev *axp20x);
 
 #endif /* __LINUX_MFD_AXP20X_H */
diff --git a/include/linux/sunxi-rsb.h b/include/linux/sunxi-rsb.h
index 7e75bb0346d0..bf0d365f471c 100644
--- a/include/linux/sunxi-rsb.h
+++ b/include/linux/sunxi-rsb.h
@@ -59,7 +59,7 @@ static inline void sunxi_rsb_device_set_drvdata(struct sunxi_rsb_device *rdev,
 struct sunxi_rsb_driver {
 	struct device_driver driver;
 	int (*probe)(struct sunxi_rsb_device *rdev);
-	int (*remove)(struct sunxi_rsb_device *rdev);
+	void (*remove)(struct sunxi_rsb_device *rdev);
 };
 
 static inline struct sunxi_rsb_driver *to_sunxi_rsb_driver(struct device_driver *d)
-- 
cgit v1.2.3


From e68d0119e3284334de5650a1ac42ef4e179f895e Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Fri, 15 Jan 2021 12:49:11 +0300
Subject: software node: Introduce device_add_software_node()

This helper will register a software node and then assign
it to device at the same time. The function will also make
sure that the device can't have more than one software node.

Acked-by: Felipe Balbi <balbi@kernel.org>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20210115094914.88401-2-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/swnode.c    | 71 ++++++++++++++++++++++++++++++++++++++++++------
 include/linux/property.h |  3 ++
 2 files changed, 65 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c
index 4a4b2008fbc2..4842dd678200 100644
--- a/drivers/base/swnode.c
+++ b/drivers/base/swnode.c
@@ -48,6 +48,19 @@ EXPORT_SYMBOL_GPL(is_software_node);
 				     struct swnode, fwnode) : NULL;	\
 	})
 
+static inline struct swnode *dev_to_swnode(struct device *dev)
+{
+	struct fwnode_handle *fwnode = dev_fwnode(dev);
+
+	if (!fwnode)
+		return NULL;
+
+	if (!is_software_node(fwnode))
+		fwnode = fwnode->secondary;
+
+	return to_swnode(fwnode);
+}
+
 static struct swnode *
 software_node_to_swnode(const struct software_node *node)
 {
@@ -843,22 +856,62 @@ void fwnode_remove_software_node(struct fwnode_handle *fwnode)
 }
 EXPORT_SYMBOL_GPL(fwnode_remove_software_node);
 
+/**
+ * device_add_software_node - Assign software node to a device
+ * @dev: The device the software node is meant for.
+ * @swnode: The software node.
+ *
+ * This function will register @swnode and make it the secondary firmware node
+ * pointer of @dev. If @dev has no primary node, then @swnode will become the primary
+ * node.
+ */
+int device_add_software_node(struct device *dev, const struct software_node *swnode)
+{
+	int ret;
+
+	/* Only one software node per device. */
+	if (dev_to_swnode(dev))
+		return -EBUSY;
+
+	ret = software_node_register(swnode);
+	if (ret)
+		return ret;
+
+	set_secondary_fwnode(dev, software_node_fwnode(swnode));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(device_add_software_node);
+
+/**
+ * device_remove_software_node - Remove device's software node
+ * @dev: The device with the software node.
+ *
+ * This function will unregister the software node of @dev.
+ */
+void device_remove_software_node(struct device *dev)
+{
+	struct swnode *swnode;
+
+	swnode = dev_to_swnode(dev);
+	if (!swnode)
+		return;
+
+	software_node_notify(dev, KOBJ_REMOVE);
+	set_secondary_fwnode(dev, NULL);
+	kobject_put(&swnode->kobj);
+}
+EXPORT_SYMBOL_GPL(device_remove_software_node);
+
 int software_node_notify(struct device *dev, unsigned long action)
 {
-	struct fwnode_handle *fwnode = dev_fwnode(dev);
 	struct swnode *swnode;
 	int ret;
 
-	if (!fwnode)
-		return 0;
-
-	if (!is_software_node(fwnode))
-		fwnode = fwnode->secondary;
-	if (!is_software_node(fwnode))
+	swnode = dev_to_swnode(dev);
+	if (!swnode)
 		return 0;
 
-	swnode = to_swnode(fwnode);
-
 	switch (action) {
 	case KOBJ_ADD:
 		ret = sysfs_create_link(&dev->kobj, &swnode->kobj,
diff --git a/include/linux/property.h b/include/linux/property.h
index 0a9001fe7aea..b0e413dc5927 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -488,4 +488,7 @@ fwnode_create_software_node(const struct property_entry *properties,
 			    const struct fwnode_handle *parent);
 void fwnode_remove_software_node(struct fwnode_handle *fwnode);
 
+int device_add_software_node(struct device *dev, const struct software_node *swnode);
+void device_remove_software_node(struct device *dev);
+
 #endif /* _LINUX_PROPERTY_H_ */
-- 
cgit v1.2.3


From 429b29aef7f841086949c7359f9c3ccb051e7ea3 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 15 Jan 2021 16:51:29 +0100
Subject: tty: serial: Drop unused efm32 serial driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Support for this machine was just removed, so drop the now unused UART
driver, too.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20210115155130.185010-7-u.kleine-koenig@pengutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/Kconfig               |  13 -
 drivers/tty/serial/Makefile              |   1 -
 drivers/tty/serial/efm32-uart.c          | 852 -------------------------------
 include/linux/platform_data/efm32-uart.h |  19 -
 include/uapi/linux/serial_core.h         |   3 -
 5 files changed, 888 deletions(-)
 delete mode 100644 drivers/tty/serial/efm32-uart.c
 delete mode 100644 include/linux/platform_data/efm32-uart.h

(limited to 'include/linux')

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index 34a2899e69c0..83f6ca4bf210 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1295,14 +1295,6 @@ config SERIAL_AR933X_NR_UARTS
 	  Set this to the number of serial ports you want the driver
 	  to support.
 
-config SERIAL_EFM32_UART
-	tristate "EFM32 UART/USART port"
-	depends on ARM && (ARCH_EFM32 || COMPILE_TEST)
-	select SERIAL_CORE
-	help
-	  This driver support the USART and UART ports on
-	  Energy Micro's efm32 SoCs.
-
 config SERIAL_MPS2_UART_CONSOLE
 	bool "MPS2 UART console support"
 	depends on SERIAL_MPS2_UART
@@ -1316,11 +1308,6 @@ config SERIAL_MPS2_UART
 	help
 	  This driver support the UART ports on ARM MPS2.
 
-config SERIAL_EFM32_UART_CONSOLE
-	bool "EFM32 UART/USART console support"
-	depends on SERIAL_EFM32_UART=y
-	select SERIAL_CORE_CONSOLE
-
 config SERIAL_ARC
 	tristate "ARC UART driver support"
 	select SERIAL_CORE
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index b85d53f9e9ff..ec2b74091f0c 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -73,7 +73,6 @@ obj-$(CONFIG_SERIAL_SIRFSOC) += sirfsoc_uart.o
 obj-$(CONFIG_SERIAL_TEGRA) += serial-tegra.o
 obj-$(CONFIG_SERIAL_TEGRA_TCU) += tegra-tcu.o
 obj-$(CONFIG_SERIAL_AR933X)   += ar933x_uart.o
-obj-$(CONFIG_SERIAL_EFM32_UART) += efm32-uart.o
 obj-$(CONFIG_SERIAL_ARC)	+= arc_uart.o
 obj-$(CONFIG_SERIAL_RP2)	+= rp2.o
 obj-$(CONFIG_SERIAL_FSL_LPUART)	+= fsl_lpuart.o
diff --git a/drivers/tty/serial/efm32-uart.c b/drivers/tty/serial/efm32-uart.c
deleted file mode 100644
index f12f29cf4f31..000000000000
--- a/drivers/tty/serial/efm32-uart.c
+++ /dev/null
@@ -1,852 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/io.h>
-#include <linux/platform_device.h>
-#include <linux/console.h>
-#include <linux/sysrq.h>
-#include <linux/serial_core.h>
-#include <linux/tty_flip.h>
-#include <linux/slab.h>
-#include <linux/clk.h>
-#include <linux/of.h>
-#include <linux/of_device.h>
-
-#include <linux/platform_data/efm32-uart.h>
-
-#define DRIVER_NAME "efm32-uart"
-#define DEV_NAME "ttyefm"
-
-#define UARTn_CTRL		0x00
-#define UARTn_CTRL_SYNC		0x0001
-#define UARTn_CTRL_TXBIL		0x1000
-
-#define UARTn_FRAME		0x04
-#define UARTn_FRAME_DATABITS__MASK	0x000f
-#define UARTn_FRAME_DATABITS(n)		((n) - 3)
-#define UARTn_FRAME_PARITY__MASK	0x0300
-#define UARTn_FRAME_PARITY_NONE		0x0000
-#define UARTn_FRAME_PARITY_EVEN		0x0200
-#define UARTn_FRAME_PARITY_ODD		0x0300
-#define UARTn_FRAME_STOPBITS_HALF	0x0000
-#define UARTn_FRAME_STOPBITS_ONE	0x1000
-#define UARTn_FRAME_STOPBITS_TWO	0x3000
-
-#define UARTn_CMD		0x0c
-#define UARTn_CMD_RXEN			0x0001
-#define UARTn_CMD_RXDIS		0x0002
-#define UARTn_CMD_TXEN			0x0004
-#define UARTn_CMD_TXDIS		0x0008
-
-#define UARTn_STATUS		0x10
-#define UARTn_STATUS_TXENS		0x0002
-#define UARTn_STATUS_TXC		0x0020
-#define UARTn_STATUS_TXBL		0x0040
-#define UARTn_STATUS_RXDATAV		0x0080
-
-#define UARTn_CLKDIV		0x14
-
-#define UARTn_RXDATAX		0x18
-#define UARTn_RXDATAX_RXDATA__MASK	0x01ff
-#define UARTn_RXDATAX_PERR		0x4000
-#define UARTn_RXDATAX_FERR		0x8000
-/*
- * This is a software only flag used for ignore_status_mask and
- * read_status_mask! It's used for breaks that the hardware doesn't report
- * explicitly.
- */
-#define SW_UARTn_RXDATAX_BERR		0x2000
-
-#define UARTn_TXDATA		0x34
-
-#define UARTn_IF		0x40
-#define UARTn_IF_TXC			0x0001
-#define UARTn_IF_TXBL			0x0002
-#define UARTn_IF_RXDATAV		0x0004
-#define UARTn_IF_RXOF			0x0010
-
-#define UARTn_IFS		0x44
-#define UARTn_IFC		0x48
-#define UARTn_IEN		0x4c
-
-#define UARTn_ROUTE		0x54
-#define UARTn_ROUTE_LOCATION__MASK	0x0700
-#define UARTn_ROUTE_LOCATION(n)		(((n) << 8) & UARTn_ROUTE_LOCATION__MASK)
-#define UARTn_ROUTE_RXPEN		0x0001
-#define UARTn_ROUTE_TXPEN		0x0002
-
-struct efm32_uart_port {
-	struct uart_port port;
-	unsigned int txirq;
-	struct clk *clk;
-	struct efm32_uart_pdata pdata;
-};
-#define to_efm_port(_port) container_of(_port, struct efm32_uart_port, port)
-#define efm_debug(efm_port, format, arg...)			\
-	dev_dbg(efm_port->port.dev, format, ##arg)
-
-static void efm32_uart_write32(struct efm32_uart_port *efm_port,
-		u32 value, unsigned offset)
-{
-	writel_relaxed(value, efm_port->port.membase + offset);
-}
-
-static u32 efm32_uart_read32(struct efm32_uart_port *efm_port,
-		unsigned offset)
-{
-	return readl_relaxed(efm_port->port.membase + offset);
-}
-
-static unsigned int efm32_uart_tx_empty(struct uart_port *port)
-{
-	struct efm32_uart_port *efm_port = to_efm_port(port);
-	u32 status = efm32_uart_read32(efm_port, UARTn_STATUS);
-
-	if (status & UARTn_STATUS_TXC)
-		return TIOCSER_TEMT;
-	else
-		return 0;
-}
-
-static void efm32_uart_set_mctrl(struct uart_port *port, unsigned int mctrl)
-{
-	/* sorry, neither handshaking lines nor loop functionallity */
-}
-
-static unsigned int efm32_uart_get_mctrl(struct uart_port *port)
-{
-	/* sorry, no handshaking lines available */
-	return TIOCM_CAR | TIOCM_CTS | TIOCM_DSR;
-}
-
-static void efm32_uart_stop_tx(struct uart_port *port)
-{
-	struct efm32_uart_port *efm_port = to_efm_port(port);
-	u32 ien = efm32_uart_read32(efm_port,  UARTn_IEN);
-
-	efm32_uart_write32(efm_port, UARTn_CMD_TXDIS, UARTn_CMD);
-	ien &= ~(UARTn_IF_TXC | UARTn_IF_TXBL);
-	efm32_uart_write32(efm_port, ien, UARTn_IEN);
-}
-
-static void efm32_uart_tx_chars(struct efm32_uart_port *efm_port)
-{
-	struct uart_port *port = &efm_port->port;
-	struct circ_buf *xmit = &port->state->xmit;
-
-	while (efm32_uart_read32(efm_port, UARTn_STATUS) &
-			UARTn_STATUS_TXBL) {
-		if (port->x_char) {
-			port->icount.tx++;
-			efm32_uart_write32(efm_port, port->x_char,
-					UARTn_TXDATA);
-			port->x_char = 0;
-			continue;
-		}
-		if (!uart_circ_empty(xmit) && !uart_tx_stopped(port)) {
-			port->icount.tx++;
-			efm32_uart_write32(efm_port, xmit->buf[xmit->tail],
-					UARTn_TXDATA);
-			xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
-		} else
-			break;
-	}
-
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
-		uart_write_wakeup(port);
-
-	if (!port->x_char && uart_circ_empty(xmit) &&
-			efm32_uart_read32(efm_port, UARTn_STATUS) &
-				UARTn_STATUS_TXC)
-		efm32_uart_stop_tx(port);
-}
-
-static void efm32_uart_start_tx(struct uart_port *port)
-{
-	struct efm32_uart_port *efm_port = to_efm_port(port);
-	u32 ien;
-
-	efm32_uart_write32(efm_port,
-			UARTn_IF_TXBL | UARTn_IF_TXC, UARTn_IFC);
-	ien = efm32_uart_read32(efm_port, UARTn_IEN);
-	efm32_uart_write32(efm_port,
-			ien | UARTn_IF_TXBL | UARTn_IF_TXC, UARTn_IEN);
-	efm32_uart_write32(efm_port, UARTn_CMD_TXEN, UARTn_CMD);
-
-	efm32_uart_tx_chars(efm_port);
-}
-
-static void efm32_uart_stop_rx(struct uart_port *port)
-{
-	struct efm32_uart_port *efm_port = to_efm_port(port);
-
-	efm32_uart_write32(efm_port, UARTn_CMD_RXDIS, UARTn_CMD);
-}
-
-static void efm32_uart_break_ctl(struct uart_port *port, int ctl)
-{
-	/* not possible without fiddling with gpios */
-}
-
-static void efm32_uart_rx_chars(struct efm32_uart_port *efm_port)
-{
-	struct uart_port *port = &efm_port->port;
-
-	while (efm32_uart_read32(efm_port, UARTn_STATUS) &
-			UARTn_STATUS_RXDATAV) {
-		u32 rxdata = efm32_uart_read32(efm_port, UARTn_RXDATAX);
-		int flag = 0;
-
-		/*
-		 * This is a reserved bit and I only saw it read as 0. But to be
-		 * sure not to be confused too much by new devices adhere to the
-		 * warning in the reference manual that reserved bits might
-		 * read as 1 in the future.
-		 */
-		rxdata &= ~SW_UARTn_RXDATAX_BERR;
-
-		port->icount.rx++;
-
-		if ((rxdata & UARTn_RXDATAX_FERR) &&
-				!(rxdata & UARTn_RXDATAX_RXDATA__MASK)) {
-			rxdata |= SW_UARTn_RXDATAX_BERR;
-			port->icount.brk++;
-			if (uart_handle_break(port))
-				continue;
-		} else if (rxdata & UARTn_RXDATAX_PERR)
-			port->icount.parity++;
-		else if (rxdata & UARTn_RXDATAX_FERR)
-			port->icount.frame++;
-
-		rxdata &= port->read_status_mask;
-
-		if (rxdata & SW_UARTn_RXDATAX_BERR)
-			flag = TTY_BREAK;
-		else if (rxdata & UARTn_RXDATAX_PERR)
-			flag = TTY_PARITY;
-		else if (rxdata & UARTn_RXDATAX_FERR)
-			flag = TTY_FRAME;
-		else if (uart_handle_sysrq_char(port,
-					rxdata & UARTn_RXDATAX_RXDATA__MASK))
-			continue;
-
-		if ((rxdata & port->ignore_status_mask) == 0)
-			tty_insert_flip_char(&port->state->port,
-					rxdata & UARTn_RXDATAX_RXDATA__MASK, flag);
-	}
-}
-
-static irqreturn_t efm32_uart_rxirq(int irq, void *data)
-{
-	struct efm32_uart_port *efm_port = data;
-	u32 irqflag = efm32_uart_read32(efm_port, UARTn_IF);
-	int handled = IRQ_NONE;
-	struct uart_port *port = &efm_port->port;
-	struct tty_port *tport = &port->state->port;
-
-	spin_lock(&port->lock);
-
-	if (irqflag & UARTn_IF_RXDATAV) {
-		efm32_uart_write32(efm_port, UARTn_IF_RXDATAV, UARTn_IFC);
-		efm32_uart_rx_chars(efm_port);
-
-		handled = IRQ_HANDLED;
-	}
-
-	if (irqflag & UARTn_IF_RXOF) {
-		efm32_uart_write32(efm_port, UARTn_IF_RXOF, UARTn_IFC);
-		port->icount.overrun++;
-		tty_insert_flip_char(tport, 0, TTY_OVERRUN);
-
-		handled = IRQ_HANDLED;
-	}
-
-	spin_unlock(&port->lock);
-
-	tty_flip_buffer_push(tport);
-
-	return handled;
-}
-
-static irqreturn_t efm32_uart_txirq(int irq, void *data)
-{
-	struct efm32_uart_port *efm_port = data;
-	u32 irqflag = efm32_uart_read32(efm_port, UARTn_IF);
-
-	/* TXBL doesn't need to be cleared */
-	if (irqflag & UARTn_IF_TXC)
-		efm32_uart_write32(efm_port, UARTn_IF_TXC, UARTn_IFC);
-
-	if (irqflag & (UARTn_IF_TXC | UARTn_IF_TXBL)) {
-		efm32_uart_tx_chars(efm_port);
-		return IRQ_HANDLED;
-	} else
-		return IRQ_NONE;
-}
-
-static int efm32_uart_startup(struct uart_port *port)
-{
-	struct efm32_uart_port *efm_port = to_efm_port(port);
-	int ret;
-
-	ret = clk_enable(efm_port->clk);
-	if (ret) {
-		efm_debug(efm_port, "failed to enable clk\n");
-		goto err_clk_enable;
-	}
-	port->uartclk = clk_get_rate(efm_port->clk);
-
-	/* Enable pins at configured location */
-	efm32_uart_write32(efm_port,
-			UARTn_ROUTE_LOCATION(efm_port->pdata.location) |
-			UARTn_ROUTE_RXPEN | UARTn_ROUTE_TXPEN,
-			UARTn_ROUTE);
-
-	ret = request_irq(port->irq, efm32_uart_rxirq, 0,
-			DRIVER_NAME, efm_port);
-	if (ret) {
-		efm_debug(efm_port, "failed to register rxirq\n");
-		goto err_request_irq_rx;
-	}
-
-	/* disable all irqs */
-	efm32_uart_write32(efm_port, 0, UARTn_IEN);
-
-	ret = request_irq(efm_port->txirq, efm32_uart_txirq, 0,
-			DRIVER_NAME, efm_port);
-	if (ret) {
-		efm_debug(efm_port, "failed to register txirq\n");
-		free_irq(port->irq, efm_port);
-err_request_irq_rx:
-
-		clk_disable(efm_port->clk);
-	} else {
-		efm32_uart_write32(efm_port,
-				UARTn_IF_RXDATAV | UARTn_IF_RXOF, UARTn_IEN);
-		efm32_uart_write32(efm_port, UARTn_CMD_RXEN, UARTn_CMD);
-	}
-
-err_clk_enable:
-	return ret;
-}
-
-static void efm32_uart_shutdown(struct uart_port *port)
-{
-	struct efm32_uart_port *efm_port = to_efm_port(port);
-
-	efm32_uart_write32(efm_port, 0, UARTn_IEN);
-	free_irq(port->irq, efm_port);
-
-	clk_disable(efm_port->clk);
-}
-
-static void efm32_uart_set_termios(struct uart_port *port,
-		struct ktermios *new, struct ktermios *old)
-{
-	struct efm32_uart_port *efm_port = to_efm_port(port);
-	unsigned long flags;
-	unsigned baud;
-	u32 clkdiv;
-	u32 frame = 0;
-
-	/* no modem control lines */
-	new->c_cflag &= ~(CRTSCTS | CMSPAR);
-
-	baud = uart_get_baud_rate(port, new, old,
-			DIV_ROUND_CLOSEST(port->uartclk, 16 * 8192),
-			DIV_ROUND_CLOSEST(port->uartclk, 16));
-
-	switch (new->c_cflag & CSIZE) {
-	case CS5:
-		frame |= UARTn_FRAME_DATABITS(5);
-		break;
-	case CS6:
-		frame |= UARTn_FRAME_DATABITS(6);
-		break;
-	case CS7:
-		frame |= UARTn_FRAME_DATABITS(7);
-		break;
-	case CS8:
-		frame |= UARTn_FRAME_DATABITS(8);
-		break;
-	}
-
-	if (new->c_cflag & CSTOPB)
-		/* the receiver only verifies the first stop bit */
-		frame |= UARTn_FRAME_STOPBITS_TWO;
-	else
-		frame |= UARTn_FRAME_STOPBITS_ONE;
-
-	if (new->c_cflag & PARENB) {
-		if (new->c_cflag & PARODD)
-			frame |= UARTn_FRAME_PARITY_ODD;
-		else
-			frame |= UARTn_FRAME_PARITY_EVEN;
-	} else
-		frame |= UARTn_FRAME_PARITY_NONE;
-
-	/*
-	 * the 6 lowest bits of CLKDIV are dc, bit 6 has value 0.25.
-	 * port->uartclk <= 14e6, so 4 * port->uartclk doesn't overflow.
-	 */
-	clkdiv = (DIV_ROUND_CLOSEST(4 * port->uartclk, 16 * baud) - 4) << 6;
-
-	spin_lock_irqsave(&port->lock, flags);
-
-	efm32_uart_write32(efm_port,
-			UARTn_CMD_TXDIS | UARTn_CMD_RXDIS, UARTn_CMD);
-
-	port->read_status_mask = UARTn_RXDATAX_RXDATA__MASK;
-	if (new->c_iflag & INPCK)
-		port->read_status_mask |=
-			UARTn_RXDATAX_FERR | UARTn_RXDATAX_PERR;
-	if (new->c_iflag & (IGNBRK | BRKINT | PARMRK))
-		port->read_status_mask |= SW_UARTn_RXDATAX_BERR;
-
-	port->ignore_status_mask = 0;
-	if (new->c_iflag & IGNPAR)
-		port->ignore_status_mask |=
-			UARTn_RXDATAX_FERR | UARTn_RXDATAX_PERR;
-	if (new->c_iflag & IGNBRK)
-		port->ignore_status_mask |= SW_UARTn_RXDATAX_BERR;
-
-	uart_update_timeout(port, new->c_cflag, baud);
-
-	efm32_uart_write32(efm_port, UARTn_CTRL_TXBIL, UARTn_CTRL);
-	efm32_uart_write32(efm_port, frame, UARTn_FRAME);
-	efm32_uart_write32(efm_port, clkdiv, UARTn_CLKDIV);
-
-	efm32_uart_write32(efm_port, UARTn_CMD_TXEN | UARTn_CMD_RXEN,
-			UARTn_CMD);
-
-	spin_unlock_irqrestore(&port->lock, flags);
-}
-
-static const char *efm32_uart_type(struct uart_port *port)
-{
-	return port->type == PORT_EFMUART ? "efm32-uart" : NULL;
-}
-
-static void efm32_uart_release_port(struct uart_port *port)
-{
-	struct efm32_uart_port *efm_port = to_efm_port(port);
-
-	clk_unprepare(efm_port->clk);
-	clk_put(efm_port->clk);
-	iounmap(port->membase);
-}
-
-static int efm32_uart_request_port(struct uart_port *port)
-{
-	struct efm32_uart_port *efm_port = to_efm_port(port);
-	int ret;
-
-	port->membase = ioremap(port->mapbase, 60);
-	if (!efm_port->port.membase) {
-		ret = -ENOMEM;
-		efm_debug(efm_port, "failed to remap\n");
-		goto err_ioremap;
-	}
-
-	efm_port->clk = clk_get(port->dev, NULL);
-	if (IS_ERR(efm_port->clk)) {
-		ret = PTR_ERR(efm_port->clk);
-		efm_debug(efm_port, "failed to get clock\n");
-		goto err_clk_get;
-	}
-
-	ret = clk_prepare(efm_port->clk);
-	if (ret) {
-		clk_put(efm_port->clk);
-err_clk_get:
-
-		iounmap(port->membase);
-err_ioremap:
-		return ret;
-	}
-	return 0;
-}
-
-static void efm32_uart_config_port(struct uart_port *port, int type)
-{
-	if (type & UART_CONFIG_TYPE &&
-			!efm32_uart_request_port(port))
-		port->type = PORT_EFMUART;
-}
-
-static int efm32_uart_verify_port(struct uart_port *port,
-		struct serial_struct *serinfo)
-{
-	int ret = 0;
-
-	if (serinfo->type != PORT_UNKNOWN && serinfo->type != PORT_EFMUART)
-		ret = -EINVAL;
-
-	return ret;
-}
-
-static const struct uart_ops efm32_uart_pops = {
-	.tx_empty = efm32_uart_tx_empty,
-	.set_mctrl = efm32_uart_set_mctrl,
-	.get_mctrl = efm32_uart_get_mctrl,
-	.stop_tx = efm32_uart_stop_tx,
-	.start_tx = efm32_uart_start_tx,
-	.stop_rx = efm32_uart_stop_rx,
-	.break_ctl = efm32_uart_break_ctl,
-	.startup = efm32_uart_startup,
-	.shutdown = efm32_uart_shutdown,
-	.set_termios = efm32_uart_set_termios,
-	.type = efm32_uart_type,
-	.release_port = efm32_uart_release_port,
-	.request_port = efm32_uart_request_port,
-	.config_port = efm32_uart_config_port,
-	.verify_port = efm32_uart_verify_port,
-};
-
-static struct efm32_uart_port *efm32_uart_ports[5];
-
-#ifdef CONFIG_SERIAL_EFM32_UART_CONSOLE
-static void efm32_uart_console_putchar(struct uart_port *port, int ch)
-{
-	struct efm32_uart_port *efm_port = to_efm_port(port);
-	unsigned int timeout = 0x400;
-	u32 status;
-
-	while (1) {
-		status = efm32_uart_read32(efm_port, UARTn_STATUS);
-
-		if (status & UARTn_STATUS_TXBL)
-			break;
-		if (!timeout--)
-			return;
-	}
-	efm32_uart_write32(efm_port, ch, UARTn_TXDATA);
-}
-
-static void efm32_uart_console_write(struct console *co, const char *s,
-		unsigned int count)
-{
-	struct efm32_uart_port *efm_port = efm32_uart_ports[co->index];
-	u32 status = efm32_uart_read32(efm_port, UARTn_STATUS);
-	unsigned int timeout = 0x400;
-
-	if (!(status & UARTn_STATUS_TXENS))
-		efm32_uart_write32(efm_port, UARTn_CMD_TXEN, UARTn_CMD);
-
-	uart_console_write(&efm_port->port, s, count,
-			efm32_uart_console_putchar);
-
-	/* Wait for the transmitter to become empty */
-	while (1) {
-		u32 status = efm32_uart_read32(efm_port, UARTn_STATUS);
-		if (status & UARTn_STATUS_TXC)
-			break;
-		if (!timeout--)
-			break;
-	}
-
-	if (!(status & UARTn_STATUS_TXENS))
-		efm32_uart_write32(efm_port, UARTn_CMD_TXDIS, UARTn_CMD);
-}
-
-static void efm32_uart_console_get_options(struct efm32_uart_port *efm_port,
-		int *baud, int *parity, int *bits)
-{
-	u32 ctrl = efm32_uart_read32(efm_port, UARTn_CTRL);
-	u32 route, clkdiv, frame;
-
-	if (ctrl & UARTn_CTRL_SYNC)
-		/* not operating in async mode */
-		return;
-
-	route = efm32_uart_read32(efm_port, UARTn_ROUTE);
-	if (!(route & UARTn_ROUTE_TXPEN))
-		/* tx pin not routed */
-		return;
-
-	clkdiv = efm32_uart_read32(efm_port, UARTn_CLKDIV);
-
-	*baud = DIV_ROUND_CLOSEST(4 * efm_port->port.uartclk,
-			16 * (4 + (clkdiv >> 6)));
-
-	frame = efm32_uart_read32(efm_port, UARTn_FRAME);
-	switch (frame & UARTn_FRAME_PARITY__MASK) {
-	case UARTn_FRAME_PARITY_ODD:
-		*parity = 'o';
-		break;
-	case UARTn_FRAME_PARITY_EVEN:
-		*parity = 'e';
-		break;
-	default:
-		*parity = 'n';
-	}
-
-	*bits = (frame & UARTn_FRAME_DATABITS__MASK) -
-			UARTn_FRAME_DATABITS(4) + 4;
-
-	efm_debug(efm_port, "get_opts: options=%d%c%d\n",
-			*baud, *parity, *bits);
-}
-
-static int efm32_uart_console_setup(struct console *co, char *options)
-{
-	struct efm32_uart_port *efm_port;
-	int baud = 115200;
-	int bits = 8;
-	int parity = 'n';
-	int flow = 'n';
-	int ret;
-
-	if (co->index < 0 || co->index >= ARRAY_SIZE(efm32_uart_ports)) {
-		unsigned i;
-		for (i = 0; i < ARRAY_SIZE(efm32_uart_ports); ++i) {
-			if (efm32_uart_ports[i]) {
-				pr_warn("efm32-console: fall back to console index %u (from %hhi)\n",
-						i, co->index);
-				co->index = i;
-				break;
-			}
-		}
-	}
-
-	efm_port = efm32_uart_ports[co->index];
-	if (!efm_port) {
-		pr_warn("efm32-console: No port at %d\n", co->index);
-		return -ENODEV;
-	}
-
-	ret = clk_prepare(efm_port->clk);
-	if (ret) {
-		dev_warn(efm_port->port.dev,
-				"console: clk_prepare failed: %d\n", ret);
-		return ret;
-	}
-
-	efm_port->port.uartclk = clk_get_rate(efm_port->clk);
-
-	if (options)
-		uart_parse_options(options, &baud, &parity, &bits, &flow);
-	else
-		efm32_uart_console_get_options(efm_port,
-				&baud, &parity, &bits);
-
-	return uart_set_options(&efm_port->port, co, baud, parity, bits, flow);
-}
-
-static struct uart_driver efm32_uart_reg;
-
-static struct console efm32_uart_console = {
-	.name = DEV_NAME,
-	.write = efm32_uart_console_write,
-	.device = uart_console_device,
-	.setup = efm32_uart_console_setup,
-	.flags = CON_PRINTBUFFER,
-	.index = -1,
-	.data = &efm32_uart_reg,
-};
-
-#else
-#define efm32_uart_console (*(struct console *)NULL)
-#endif /* ifdef CONFIG_SERIAL_EFM32_UART_CONSOLE / else */
-
-static struct uart_driver efm32_uart_reg = {
-	.owner = THIS_MODULE,
-	.driver_name = DRIVER_NAME,
-	.dev_name = DEV_NAME,
-	.nr = ARRAY_SIZE(efm32_uart_ports),
-	.cons = &efm32_uart_console,
-};
-
-static int efm32_uart_probe_dt(struct platform_device *pdev,
-		struct efm32_uart_port *efm_port)
-{
-	struct device_node *np = pdev->dev.of_node;
-	u32 location;
-	int ret;
-
-	if (!np)
-		return 1;
-
-	ret = of_property_read_u32(np, "energymicro,location", &location);
-
-	if (ret)
-		/* fall back to wrongly namespaced property */
-		ret = of_property_read_u32(np, "efm32,location", &location);
-
-	if (ret)
-		/* fall back to old and (wrongly) generic property "location" */
-		ret = of_property_read_u32(np, "location", &location);
-
-	if (!ret) {
-		if (location > 5) {
-			dev_err(&pdev->dev, "invalid location\n");
-			return -EINVAL;
-		}
-		efm_debug(efm_port, "using location %u\n", location);
-		efm_port->pdata.location = location;
-	} else {
-		efm_debug(efm_port, "fall back to location 0\n");
-	}
-
-	ret = of_alias_get_id(np, "serial");
-	if (ret < 0) {
-		dev_err(&pdev->dev, "failed to get alias id: %d\n", ret);
-		return ret;
-	} else {
-		efm_port->port.line = ret;
-		return 0;
-	}
-
-}
-
-static int efm32_uart_probe(struct platform_device *pdev)
-{
-	struct efm32_uart_port *efm_port;
-	struct resource *res;
-	unsigned int line;
-	int ret;
-
-	efm_port = kzalloc(sizeof(*efm_port), GFP_KERNEL);
-	if (!efm_port) {
-		dev_dbg(&pdev->dev, "failed to allocate private data\n");
-		return -ENOMEM;
-	}
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!res) {
-		ret = -ENODEV;
-		dev_dbg(&pdev->dev, "failed to determine base address\n");
-		goto err_get_base;
-	}
-
-	if (resource_size(res) < 60) {
-		ret = -EINVAL;
-		dev_dbg(&pdev->dev, "memory resource too small\n");
-		goto err_too_small;
-	}
-
-	ret = platform_get_irq(pdev, 0);
-	if (ret <= 0) {
-		dev_dbg(&pdev->dev, "failed to get rx irq\n");
-		goto err_get_rxirq;
-	}
-
-	efm_port->port.irq = ret;
-
-	ret = platform_get_irq(pdev, 1);
-	if (ret <= 0)
-		ret = efm_port->port.irq + 1;
-
-	efm_port->txirq = ret;
-
-	efm_port->port.dev = &pdev->dev;
-	efm_port->port.mapbase = res->start;
-	efm_port->port.type = PORT_EFMUART;
-	efm_port->port.iotype = UPIO_MEM32;
-	efm_port->port.fifosize = 2;
-	efm_port->port.has_sysrq = IS_ENABLED(CONFIG_SERIAL_EFM32_UART_CONSOLE);
-	efm_port->port.ops = &efm32_uart_pops;
-	efm_port->port.flags = UPF_BOOT_AUTOCONF;
-
-	ret = efm32_uart_probe_dt(pdev, efm_port);
-	if (ret > 0) {
-		/* not created by device tree */
-		const struct efm32_uart_pdata *pdata = dev_get_platdata(&pdev->dev);
-
-		efm_port->port.line = pdev->id;
-
-		if (pdata)
-			efm_port->pdata = *pdata;
-	} else if (ret < 0)
-		goto err_probe_dt;
-
-	line = efm_port->port.line;
-
-	if (line >= 0 && line < ARRAY_SIZE(efm32_uart_ports))
-		efm32_uart_ports[line] = efm_port;
-
-	ret = uart_add_one_port(&efm32_uart_reg, &efm_port->port);
-	if (ret) {
-		dev_dbg(&pdev->dev, "failed to add port: %d\n", ret);
-
-		if (line >= 0 && line < ARRAY_SIZE(efm32_uart_ports))
-			efm32_uart_ports[line] = NULL;
-err_probe_dt:
-err_get_rxirq:
-err_too_small:
-err_get_base:
-		kfree(efm_port);
-	} else {
-		platform_set_drvdata(pdev, efm_port);
-		dev_dbg(&pdev->dev, "\\o/\n");
-	}
-
-	return ret;
-}
-
-static int efm32_uart_remove(struct platform_device *pdev)
-{
-	struct efm32_uart_port *efm_port = platform_get_drvdata(pdev);
-	unsigned int line = efm_port->port.line;
-
-	uart_remove_one_port(&efm32_uart_reg, &efm_port->port);
-
-	if (line >= 0 && line < ARRAY_SIZE(efm32_uart_ports))
-		efm32_uart_ports[line] = NULL;
-
-	kfree(efm_port);
-
-	return 0;
-}
-
-static const struct of_device_id efm32_uart_dt_ids[] = {
-	{
-		.compatible = "energymicro,efm32-uart",
-	}, {
-		/* doesn't follow the "vendor,device" scheme, don't use */
-		.compatible = "efm32,uart",
-	}, {
-		/* sentinel */
-	}
-};
-MODULE_DEVICE_TABLE(of, efm32_uart_dt_ids);
-
-static struct platform_driver efm32_uart_driver = {
-	.probe = efm32_uart_probe,
-	.remove = efm32_uart_remove,
-
-	.driver = {
-		.name = DRIVER_NAME,
-		.of_match_table = efm32_uart_dt_ids,
-	},
-};
-
-static int __init efm32_uart_init(void)
-{
-	int ret;
-
-	ret = uart_register_driver(&efm32_uart_reg);
-	if (ret)
-		return ret;
-
-	ret = platform_driver_register(&efm32_uart_driver);
-	if (ret)
-		uart_unregister_driver(&efm32_uart_reg);
-
-	pr_info("EFM32 UART/USART driver\n");
-
-	return ret;
-}
-module_init(efm32_uart_init);
-
-static void __exit efm32_uart_exit(void)
-{
-	platform_driver_unregister(&efm32_uart_driver);
-	uart_unregister_driver(&efm32_uart_reg);
-}
-module_exit(efm32_uart_exit);
-
-MODULE_AUTHOR("Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>");
-MODULE_DESCRIPTION("EFM32 UART/USART driver");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("platform:" DRIVER_NAME);
diff --git a/include/linux/platform_data/efm32-uart.h b/include/linux/platform_data/efm32-uart.h
deleted file mode 100644
index ccbb8f11db75..000000000000
--- a/include/linux/platform_data/efm32-uart.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- *
- */
-#ifndef __LINUX_PLATFORM_DATA_EFM32_UART_H__
-#define __LINUX_PLATFORM_DATA_EFM32_UART_H__
-
-#include <linux/types.h>
-
-/**
- * struct efm32_uart_pdata
- * @location: pinmux location for the I/O pins (to be written to the ROUTE
- * 	register)
- */
-struct efm32_uart_pdata {
-	u8 location;
-};
-#endif /* ifndef __LINUX_PLATFORM_DATA_EFM32_UART_H__ */
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index 62c22045fe65..c4042dcfdc0c 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -208,9 +208,6 @@
 /* Atheros AR933X SoC */
 #define PORT_AR933X	99
 
-/* Energy Micro efm32 SoC */
-#define PORT_EFMUART   100
-
 /* ARC (Synopsys) on-chip UART */
 #define PORT_ARC       101
 
-- 
cgit v1.2.3


From 0ba882ae2818193487b70ad39622973538711d9a Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Thu, 14 Jan 2021 16:16:27 +0100
Subject: spi: Drop unused efm32 bus driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Support for this machine was just removed, so drop the now unused spi
bus driver, too.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20210114151630.128830-5-u.kleine-koenig@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/Kconfig                     |   7 -
 drivers/spi/Makefile                    |   1 -
 drivers/spi/spi-efm32.c                 | 462 --------------------------------
 include/linux/platform_data/efm32-spi.h |  15 --
 4 files changed, 485 deletions(-)
 delete mode 100644 drivers/spi/spi-efm32.c
 delete mode 100644 include/linux/platform_data/efm32-spi.h

(limited to 'include/linux')

diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
index aadaea052f51..d3375aa6e292 100644
--- a/drivers/spi/Kconfig
+++ b/drivers/spi/Kconfig
@@ -292,13 +292,6 @@ config SPI_DLN2
 	 This driver can also be built as a module.  If so, the module
 	 will be called spi-dln2.
 
-config SPI_EFM32
-	tristate "EFM32 SPI controller"
-	depends on OF && ARM && (ARCH_EFM32 || COMPILE_TEST)
-	select SPI_BITBANG
-	help
-	  Driver for the spi controller found on Energy Micro's EFM32 SoCs.
-
 config SPI_EP93XX
 	tristate "Cirrus Logic EP93xx SPI controller"
 	depends on ARCH_EP93XX || COMPILE_TEST
diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile
index 6fea5821662e..5a21b5cc8015 100644
--- a/drivers/spi/Makefile
+++ b/drivers/spi/Makefile
@@ -42,7 +42,6 @@ spi-dw-$(CONFIG_SPI_DW_DMA)		+= spi-dw-dma.o
 obj-$(CONFIG_SPI_DW_BT1)		+= spi-dw-bt1.o
 obj-$(CONFIG_SPI_DW_MMIO)		+= spi-dw-mmio.o
 obj-$(CONFIG_SPI_DW_PCI)		+= spi-dw-pci.o
-obj-$(CONFIG_SPI_EFM32)			+= spi-efm32.o
 obj-$(CONFIG_SPI_EP93XX)		+= spi-ep93xx.o
 obj-$(CONFIG_SPI_FALCON)		+= spi-falcon.o
 obj-$(CONFIG_SPI_FSI)			+= spi-fsi.o
diff --git a/drivers/spi/spi-efm32.c b/drivers/spi/spi-efm32.c
deleted file mode 100644
index ea6e4a7b3feb..000000000000
--- a/drivers/spi/spi-efm32.c
+++ /dev/null
@@ -1,462 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012-2013 Uwe Kleine-Koenig for Pengutronix
- */
-#include <linux/kernel.h>
-#include <linux/io.h>
-#include <linux/spi/spi.h>
-#include <linux/spi/spi_bitbang.h>
-#include <linux/interrupt.h>
-#include <linux/platform_device.h>
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/platform_data/efm32-spi.h>
-#include <linux/of.h>
-
-#define DRIVER_NAME "efm32-spi"
-
-#define MASK_VAL(mask, val)		((val << __ffs(mask)) & mask)
-
-#define REG_CTRL		0x00
-#define REG_CTRL_SYNC			0x0001
-#define REG_CTRL_CLKPOL			0x0100
-#define REG_CTRL_CLKPHA			0x0200
-#define REG_CTRL_MSBF			0x0400
-#define REG_CTRL_TXBIL			0x1000
-
-#define REG_FRAME		0x04
-#define REG_FRAME_DATABITS__MASK	0x000f
-#define REG_FRAME_DATABITS(n)		((n) - 3)
-
-#define REG_CMD			0x0c
-#define REG_CMD_RXEN			0x0001
-#define REG_CMD_RXDIS			0x0002
-#define REG_CMD_TXEN			0x0004
-#define REG_CMD_TXDIS			0x0008
-#define REG_CMD_MASTEREN		0x0010
-
-#define REG_STATUS		0x10
-#define REG_STATUS_TXENS		0x0002
-#define REG_STATUS_TXC			0x0020
-#define REG_STATUS_TXBL			0x0040
-#define REG_STATUS_RXDATAV		0x0080
-
-#define REG_CLKDIV		0x14
-
-#define REG_RXDATAX		0x18
-#define REG_RXDATAX_RXDATA__MASK	0x01ff
-#define REG_RXDATAX_PERR		0x4000
-#define REG_RXDATAX_FERR		0x8000
-
-#define REG_TXDATA		0x34
-
-#define REG_IF		0x40
-#define REG_IF_TXBL			0x0002
-#define REG_IF_RXDATAV			0x0004
-
-#define REG_IFS		0x44
-#define REG_IFC		0x48
-#define REG_IEN		0x4c
-
-#define REG_ROUTE		0x54
-#define REG_ROUTE_RXPEN			0x0001
-#define REG_ROUTE_TXPEN			0x0002
-#define REG_ROUTE_CLKPEN		0x0008
-#define REG_ROUTE_LOCATION__MASK	0x0700
-#define REG_ROUTE_LOCATION(n)		MASK_VAL(REG_ROUTE_LOCATION__MASK, (n))
-
-struct efm32_spi_ddata {
-	struct spi_bitbang bitbang;
-
-	spinlock_t lock;
-
-	struct clk *clk;
-	void __iomem *base;
-	unsigned int rxirq, txirq;
-	struct efm32_spi_pdata pdata;
-
-	/* irq data */
-	struct completion done;
-	const u8 *tx_buf;
-	u8 *rx_buf;
-	unsigned tx_len, rx_len;
-};
-
-#define ddata_to_dev(ddata)	(&(ddata->bitbang.master->dev))
-#define efm32_spi_vdbg(ddata, format, arg...)	\
-	dev_vdbg(ddata_to_dev(ddata), format, ##arg)
-
-static void efm32_spi_write32(struct efm32_spi_ddata *ddata,
-		u32 value, unsigned offset)
-{
-	writel_relaxed(value, ddata->base + offset);
-}
-
-static u32 efm32_spi_read32(struct efm32_spi_ddata *ddata, unsigned offset)
-{
-	return readl_relaxed(ddata->base + offset);
-}
-
-static int efm32_spi_setup_transfer(struct spi_device *spi,
-		struct spi_transfer *t)
-{
-	struct efm32_spi_ddata *ddata = spi_master_get_devdata(spi->master);
-
-	unsigned bpw = t->bits_per_word ?: spi->bits_per_word;
-	unsigned speed = t->speed_hz ?: spi->max_speed_hz;
-	unsigned long clkfreq = clk_get_rate(ddata->clk);
-	u32 clkdiv;
-
-	efm32_spi_write32(ddata, REG_CTRL_SYNC | REG_CTRL_MSBF |
-			(spi->mode & SPI_CPHA ? REG_CTRL_CLKPHA : 0) |
-			(spi->mode & SPI_CPOL ? REG_CTRL_CLKPOL : 0), REG_CTRL);
-
-	efm32_spi_write32(ddata,
-			REG_FRAME_DATABITS(bpw), REG_FRAME);
-
-	if (2 * speed >= clkfreq)
-		clkdiv = 0;
-	else
-		clkdiv = 64 * (DIV_ROUND_UP(2 * clkfreq, speed) - 4);
-
-	if (clkdiv > (1U << 21))
-		return -EINVAL;
-
-	efm32_spi_write32(ddata, clkdiv, REG_CLKDIV);
-	efm32_spi_write32(ddata, REG_CMD_MASTEREN, REG_CMD);
-	efm32_spi_write32(ddata, REG_CMD_RXEN | REG_CMD_TXEN, REG_CMD);
-
-	return 0;
-}
-
-static void efm32_spi_tx_u8(struct efm32_spi_ddata *ddata)
-{
-	u8 val = 0;
-
-	if (ddata->tx_buf) {
-		val = *ddata->tx_buf;
-		ddata->tx_buf++;
-	}
-
-	ddata->tx_len--;
-	efm32_spi_write32(ddata, val, REG_TXDATA);
-	efm32_spi_vdbg(ddata, "%s: tx 0x%x\n", __func__, val);
-}
-
-static void efm32_spi_rx_u8(struct efm32_spi_ddata *ddata)
-{
-	u32 rxdata = efm32_spi_read32(ddata, REG_RXDATAX);
-	efm32_spi_vdbg(ddata, "%s: rx 0x%x\n", __func__, rxdata);
-
-	if (ddata->rx_buf) {
-		*ddata->rx_buf = rxdata;
-		ddata->rx_buf++;
-	}
-
-	ddata->rx_len--;
-}
-
-static void efm32_spi_filltx(struct efm32_spi_ddata *ddata)
-{
-	while (ddata->tx_len &&
-			ddata->tx_len + 2 > ddata->rx_len &&
-			efm32_spi_read32(ddata, REG_STATUS) & REG_STATUS_TXBL) {
-		efm32_spi_tx_u8(ddata);
-	}
-}
-
-static int efm32_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
-{
-	struct efm32_spi_ddata *ddata = spi_master_get_devdata(spi->master);
-	int ret = -EBUSY;
-
-	spin_lock_irq(&ddata->lock);
-
-	if (ddata->tx_buf || ddata->rx_buf)
-		goto out_unlock;
-
-	ddata->tx_buf = t->tx_buf;
-	ddata->rx_buf = t->rx_buf;
-	ddata->tx_len = ddata->rx_len =
-		t->len * DIV_ROUND_UP(t->bits_per_word, 8);
-
-	efm32_spi_filltx(ddata);
-
-	reinit_completion(&ddata->done);
-
-	efm32_spi_write32(ddata, REG_IF_TXBL | REG_IF_RXDATAV, REG_IEN);
-
-	spin_unlock_irq(&ddata->lock);
-
-	wait_for_completion(&ddata->done);
-
-	spin_lock_irq(&ddata->lock);
-
-	ret = t->len - max(ddata->tx_len, ddata->rx_len);
-
-	efm32_spi_write32(ddata, 0, REG_IEN);
-	ddata->tx_buf = ddata->rx_buf = NULL;
-
-out_unlock:
-	spin_unlock_irq(&ddata->lock);
-
-	return ret;
-}
-
-static irqreturn_t efm32_spi_rxirq(int irq, void *data)
-{
-	struct efm32_spi_ddata *ddata = data;
-	irqreturn_t ret = IRQ_NONE;
-
-	spin_lock(&ddata->lock);
-
-	while (ddata->rx_len > 0 &&
-			efm32_spi_read32(ddata, REG_STATUS) &
-			REG_STATUS_RXDATAV) {
-		efm32_spi_rx_u8(ddata);
-
-		ret = IRQ_HANDLED;
-	}
-
-	if (!ddata->rx_len) {
-		u32 ien = efm32_spi_read32(ddata, REG_IEN);
-
-		ien &= ~REG_IF_RXDATAV;
-
-		efm32_spi_write32(ddata, ien, REG_IEN);
-
-		complete(&ddata->done);
-	}
-
-	spin_unlock(&ddata->lock);
-
-	return ret;
-}
-
-static irqreturn_t efm32_spi_txirq(int irq, void *data)
-{
-	struct efm32_spi_ddata *ddata = data;
-
-	efm32_spi_vdbg(ddata,
-			"%s: txlen = %u, rxlen = %u, if=0x%08x, stat=0x%08x\n",
-			__func__, ddata->tx_len, ddata->rx_len,
-			efm32_spi_read32(ddata, REG_IF),
-			efm32_spi_read32(ddata, REG_STATUS));
-
-	spin_lock(&ddata->lock);
-
-	efm32_spi_filltx(ddata);
-
-	efm32_spi_vdbg(ddata, "%s: txlen = %u, rxlen = %u\n",
-			__func__, ddata->tx_len, ddata->rx_len);
-
-	if (!ddata->tx_len) {
-		u32 ien = efm32_spi_read32(ddata, REG_IEN);
-
-		ien &= ~REG_IF_TXBL;
-
-		efm32_spi_write32(ddata, ien, REG_IEN);
-		efm32_spi_vdbg(ddata, "disable TXBL\n");
-	}
-
-	spin_unlock(&ddata->lock);
-
-	return IRQ_HANDLED;
-}
-
-static u32 efm32_spi_get_configured_location(struct efm32_spi_ddata *ddata)
-{
-	u32 reg = efm32_spi_read32(ddata, REG_ROUTE);
-
-	return (reg & REG_ROUTE_LOCATION__MASK) >> __ffs(REG_ROUTE_LOCATION__MASK);
-}
-
-static void efm32_spi_probe_dt(struct platform_device *pdev,
-		struct spi_master *master, struct efm32_spi_ddata *ddata)
-{
-	struct device_node *np = pdev->dev.of_node;
-	u32 location;
-	int ret;
-
-	ret = of_property_read_u32(np, "energymicro,location", &location);
-
-	if (ret)
-		/* fall back to wrongly namespaced property */
-		ret = of_property_read_u32(np, "efm32,location", &location);
-
-	if (ret)
-		/* fall back to old and (wrongly) generic property "location" */
-		ret = of_property_read_u32(np, "location", &location);
-
-	if (!ret) {
-		dev_dbg(&pdev->dev, "using location %u\n", location);
-	} else {
-		/* default to location configured in hardware */
-		location = efm32_spi_get_configured_location(ddata);
-
-		dev_info(&pdev->dev, "fall back to location %u\n", location);
-	}
-
-	ddata->pdata.location = location;
-}
-
-static int efm32_spi_probe(struct platform_device *pdev)
-{
-	struct efm32_spi_ddata *ddata;
-	struct resource *res;
-	int ret;
-	struct spi_master *master;
-	struct device_node *np = pdev->dev.of_node;
-
-	if (!np)
-		return -EINVAL;
-
-	master = spi_alloc_master(&pdev->dev, sizeof(*ddata));
-	if (!master) {
-		dev_dbg(&pdev->dev,
-				"failed to allocate spi master controller\n");
-		return -ENOMEM;
-	}
-	platform_set_drvdata(pdev, master);
-
-	master->dev.of_node = pdev->dev.of_node;
-
-	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
-	master->bits_per_word_mask = SPI_BPW_RANGE_MASK(4, 16);
-	master->use_gpio_descriptors = true;
-
-	ddata = spi_master_get_devdata(master);
-
-	ddata->bitbang.master = master;
-	ddata->bitbang.setup_transfer = efm32_spi_setup_transfer;
-	ddata->bitbang.txrx_bufs = efm32_spi_txrx_bufs;
-
-	spin_lock_init(&ddata->lock);
-	init_completion(&ddata->done);
-
-	ddata->clk = devm_clk_get(&pdev->dev, NULL);
-	if (IS_ERR(ddata->clk)) {
-		ret = PTR_ERR(ddata->clk);
-		dev_err(&pdev->dev, "failed to get clock: %d\n", ret);
-		goto err;
-	}
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!res) {
-		ret = -ENODEV;
-		dev_err(&pdev->dev, "failed to determine base address\n");
-		goto err;
-	}
-
-	if (resource_size(res) < 0x60) {
-		ret = -EINVAL;
-		dev_err(&pdev->dev, "memory resource too small\n");
-		goto err;
-	}
-
-	ddata->base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(ddata->base)) {
-		ret = PTR_ERR(ddata->base);
-		goto err;
-	}
-
-	ret = platform_get_irq(pdev, 0);
-	if (ret <= 0)
-		goto err;
-
-	ddata->rxirq = ret;
-
-	ret = platform_get_irq(pdev, 1);
-	if (ret <= 0)
-		ret = ddata->rxirq + 1;
-
-	ddata->txirq = ret;
-
-	ret = clk_prepare_enable(ddata->clk);
-	if (ret < 0) {
-		dev_err(&pdev->dev, "failed to enable clock (%d)\n", ret);
-		goto err;
-	}
-
-	efm32_spi_probe_dt(pdev, master, ddata);
-
-	efm32_spi_write32(ddata, 0, REG_IEN);
-	efm32_spi_write32(ddata, REG_ROUTE_TXPEN | REG_ROUTE_RXPEN |
-			REG_ROUTE_CLKPEN |
-			REG_ROUTE_LOCATION(ddata->pdata.location), REG_ROUTE);
-
-	ret = request_irq(ddata->rxirq, efm32_spi_rxirq,
-			0, DRIVER_NAME " rx", ddata);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to register rxirq (%d)\n", ret);
-		goto err_disable_clk;
-	}
-
-	ret = request_irq(ddata->txirq, efm32_spi_txirq,
-			0, DRIVER_NAME " tx", ddata);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to register txirq (%d)\n", ret);
-		goto err_free_rx_irq;
-	}
-
-	ret = spi_bitbang_start(&ddata->bitbang);
-	if (ret) {
-		dev_err(&pdev->dev, "spi_bitbang_start failed (%d)\n", ret);
-
-		free_irq(ddata->txirq, ddata);
-err_free_rx_irq:
-		free_irq(ddata->rxirq, ddata);
-err_disable_clk:
-		clk_disable_unprepare(ddata->clk);
-err:
-		spi_master_put(master);
-	}
-
-	return ret;
-}
-
-static int efm32_spi_remove(struct platform_device *pdev)
-{
-	struct spi_master *master = platform_get_drvdata(pdev);
-	struct efm32_spi_ddata *ddata = spi_master_get_devdata(master);
-
-	spi_bitbang_stop(&ddata->bitbang);
-
-	efm32_spi_write32(ddata, 0, REG_IEN);
-
-	free_irq(ddata->txirq, ddata);
-	free_irq(ddata->rxirq, ddata);
-	clk_disable_unprepare(ddata->clk);
-	spi_master_put(master);
-
-	return 0;
-}
-
-static const struct of_device_id efm32_spi_dt_ids[] = {
-	{
-		.compatible = "energymicro,efm32-spi",
-	}, {
-		/* doesn't follow the "vendor,device" scheme, don't use */
-		.compatible = "efm32,spi",
-	}, {
-		/* sentinel */
-	}
-};
-MODULE_DEVICE_TABLE(of, efm32_spi_dt_ids);
-
-static struct platform_driver efm32_spi_driver = {
-	.probe = efm32_spi_probe,
-	.remove = efm32_spi_remove,
-
-	.driver = {
-		.name = DRIVER_NAME,
-		.of_match_table = efm32_spi_dt_ids,
-	},
-};
-module_platform_driver(efm32_spi_driver);
-
-MODULE_AUTHOR("Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>");
-MODULE_DESCRIPTION("EFM32 SPI driver");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("platform:" DRIVER_NAME);
diff --git a/include/linux/platform_data/efm32-spi.h b/include/linux/platform_data/efm32-spi.h
deleted file mode 100644
index a2c56fcd0534..000000000000
--- a/include/linux/platform_data/efm32-spi.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __LINUX_PLATFORM_DATA_EFM32_SPI_H__
-#define __LINUX_PLATFORM_DATA_EFM32_SPI_H__
-
-#include <linux/types.h>
-
-/**
- * struct efm32_spi_pdata
- * @location: pinmux location for the I/O pins (to be written to the ROUTE
- * 	register)
- */
-struct efm32_spi_pdata {
-	u8 location;
-};
-#endif /* ifndef __LINUX_PLATFORM_DATA_EFM32_SPI_H__ */
-- 
cgit v1.2.3


From 8d502ef682fdbe0ddf2274dae903b07baf1140e0 Mon Sep 17 00:00:00 2001
From: Craig Tatlor <ctatlor97@gmail.com>
Date: Fri, 4 Dec 2020 05:54:56 +0300
Subject: fixp-arith: add a linear interpolation function

Adds a function to interpolate against two points,
this is carried arount as a helper function by tons of drivers.

Signed-off-by: Craig Tatlor <ctatlor97@gmail.com>
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20201204025509.1075506-3-dmitry.baryshkov@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/fixp-arith.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fixp-arith.h b/include/linux/fixp-arith.h
index 8396013785ef..281cb4f83dbe 100644
--- a/include/linux/fixp-arith.h
+++ b/include/linux/fixp-arith.h
@@ -141,4 +141,23 @@ static inline s32 fixp_sin32_rad(u32 radians, u32 twopi)
 #define fixp_cos32_rad(rad, twopi)	\
 	fixp_sin32_rad(rad + twopi / 4, twopi)
 
+/**
+ * fixp_linear_interpolate() - interpolates a value from two known points
+ *
+ * @x0: x value of point 0
+ * @y0: y value of point 0
+ * @x1: x value of point 1
+ * @y1: y value of point 1
+ * @x: the linear interpolant
+ */
+static inline int fixp_linear_interpolate(int x0, int y0, int x1, int y1, int x)
+{
+	if (y0 == y1 || x == x0)
+		return y0;
+	if (x1 == x0 || x == x1)
+		return y1;
+
+	return y0 + ((y1 - y0) * (x - x0) / (x1 - x0));
+}
+
 #endif
-- 
cgit v1.2.3


From ec82edb258bbf0233306cc45f65d4b6092243ee9 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Fri, 4 Dec 2020 05:54:59 +0300
Subject: iio: adc: move qcom-vadc-common.h to include dir

qcom-vadc-common module will be used by ADC thermal monitoring driver,
so move it to global include dir.

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20201204025509.1075506-6-dmitry.baryshkov@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/qcom-pm8xxx-xoadc.c      |   3 +-
 drivers/iio/adc/qcom-spmi-adc5.c         |   2 +-
 drivers/iio/adc/qcom-spmi-vadc.c         |   3 +-
 drivers/iio/adc/qcom-vadc-common.c       |   3 +-
 drivers/iio/adc/qcom-vadc-common.h       | 185 ------------------------------
 include/linux/iio/adc/qcom-vadc-common.h | 187 +++++++++++++++++++++++++++++++
 6 files changed, 191 insertions(+), 192 deletions(-)
 delete mode 100644 drivers/iio/adc/qcom-vadc-common.h
 create mode 100644 include/linux/iio/adc/qcom-vadc-common.h

(limited to 'include/linux')

diff --git a/drivers/iio/adc/qcom-pm8xxx-xoadc.c b/drivers/iio/adc/qcom-pm8xxx-xoadc.c
index 7e108da7d255..0610bf254771 100644
--- a/drivers/iio/adc/qcom-pm8xxx-xoadc.c
+++ b/drivers/iio/adc/qcom-pm8xxx-xoadc.c
@@ -10,6 +10,7 @@
  * Author: Linus Walleij <linus.walleij@linaro.org>
  */
 
+#include <linux/iio/adc/qcom-vadc-common.h>
 #include <linux/iio/iio.h>
 #include <linux/iio/sysfs.h>
 #include <linux/module.h>
@@ -21,8 +22,6 @@
 #include <linux/interrupt.h>
 #include <linux/regulator/consumer.h>
 
-#include "qcom-vadc-common.h"
-
 /*
  * Definitions for the "user processor" registers lifted from the v3.4
  * Qualcomm tree. Their kernel has two out-of-tree drivers for the ADC:
diff --git a/drivers/iio/adc/qcom-spmi-adc5.c b/drivers/iio/adc/qcom-spmi-adc5.c
index c2da8f068b87..b10a0fcf09dc 100644
--- a/drivers/iio/adc/qcom-spmi-adc5.c
+++ b/drivers/iio/adc/qcom-spmi-adc5.c
@@ -7,6 +7,7 @@
 #include <linux/completion.h>
 #include <linux/delay.h>
 #include <linux/err.h>
+#include <linux/iio/adc/qcom-vadc-common.h>
 #include <linux/iio/iio.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
@@ -19,7 +20,6 @@
 #include <linux/slab.h>
 
 #include <dt-bindings/iio/qcom,spmi-vadc.h>
-#include "qcom-vadc-common.h"
 
 #define ADC5_USR_REVISION1			0x0
 #define ADC5_USR_STATUS1			0x8
diff --git a/drivers/iio/adc/qcom-spmi-vadc.c b/drivers/iio/adc/qcom-spmi-vadc.c
index b0388f8a69f4..05ff948372b3 100644
--- a/drivers/iio/adc/qcom-spmi-vadc.c
+++ b/drivers/iio/adc/qcom-spmi-vadc.c
@@ -7,6 +7,7 @@
 #include <linux/completion.h>
 #include <linux/delay.h>
 #include <linux/err.h>
+#include <linux/iio/adc/qcom-vadc-common.h>
 #include <linux/iio/iio.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
@@ -20,8 +21,6 @@
 
 #include <dt-bindings/iio/qcom,spmi-vadc.h>
 
-#include "qcom-vadc-common.h"
-
 /* VADC register and bit definitions */
 #define VADC_REVISION2				0x1
 #define VADC_REVISION2_SUPPORTED_VADC		1
diff --git a/drivers/iio/adc/qcom-vadc-common.c b/drivers/iio/adc/qcom-vadc-common.c
index 40d77b3af1bb..ee94774b72e6 100644
--- a/drivers/iio/adc/qcom-vadc-common.c
+++ b/drivers/iio/adc/qcom-vadc-common.c
@@ -3,14 +3,13 @@
 #include <linux/kernel.h>
 #include <linux/bitops.h>
 #include <linux/fixp-arith.h>
+#include <linux/iio/adc/qcom-vadc-common.h>
 #include <linux/math64.h>
 #include <linux/log2.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/units.h>
 
-#include "qcom-vadc-common.h"
-
 /* Voltage to temperature */
 static const struct vadc_map_pt adcmap_100k_104ef_104fb[] = {
 	{1758,	-40},
diff --git a/drivers/iio/adc/qcom-vadc-common.h b/drivers/iio/adc/qcom-vadc-common.h
deleted file mode 100644
index 7e5f6428e311..000000000000
--- a/drivers/iio/adc/qcom-vadc-common.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Code shared between the different Qualcomm PMIC voltage ADCs
- */
-
-#ifndef QCOM_VADC_COMMON_H
-#define QCOM_VADC_COMMON_H
-
-#define VADC_CONV_TIME_MIN_US			2000
-#define VADC_CONV_TIME_MAX_US			2100
-
-/* Min ADC code represents 0V */
-#define VADC_MIN_ADC_CODE			0x6000
-/* Max ADC code represents full-scale range of 1.8V */
-#define VADC_MAX_ADC_CODE			0xa800
-
-#define VADC_ABSOLUTE_RANGE_UV			625000
-#define VADC_RATIOMETRIC_RANGE			1800
-
-#define VADC_DEF_PRESCALING			0 /* 1:1 */
-#define VADC_DEF_DECIMATION			0 /* 512 */
-#define VADC_DEF_HW_SETTLE_TIME			0 /* 0 us */
-#define VADC_DEF_AVG_SAMPLES			0 /* 1 sample */
-#define VADC_DEF_CALIB_TYPE			VADC_CALIB_ABSOLUTE
-
-#define VADC_DECIMATION_MIN			512
-#define VADC_DECIMATION_MAX			4096
-#define ADC5_DEF_VBAT_PRESCALING		1 /* 1:3 */
-#define ADC5_DECIMATION_SHORT			250
-#define ADC5_DECIMATION_MEDIUM			420
-#define ADC5_DECIMATION_LONG			840
-/* Default decimation - 1024 for rev2, 840 for pmic5 */
-#define ADC5_DECIMATION_DEFAULT			2
-#define ADC5_DECIMATION_SAMPLES_MAX		3
-
-#define VADC_HW_SETTLE_DELAY_MAX		10000
-#define VADC_HW_SETTLE_SAMPLES_MAX		16
-#define VADC_AVG_SAMPLES_MAX			512
-#define ADC5_AVG_SAMPLES_MAX			16
-
-#define PMIC5_CHG_TEMP_SCALE_FACTOR		377500
-#define PMIC5_SMB_TEMP_CONSTANT			419400
-#define PMIC5_SMB_TEMP_SCALE_FACTOR		356
-
-#define PMI_CHG_SCALE_1				-138890
-#define PMI_CHG_SCALE_2				391750000000LL
-
-#define VADC5_MAX_CODE				0x7fff
-#define ADC5_FULL_SCALE_CODE			0x70e4
-#define ADC5_USR_DATA_CHECK			0x8000
-
-#define R_PU_100K				100000
-#define RATIO_MAX_ADC7				BIT(14)
-
-#define DIE_TEMP_ADC7_SCALE_1			-60000
-#define DIE_TEMP_ADC7_SCALE_2			20000
-#define DIE_TEMP_ADC7_SCALE_FACTOR		1000
-#define DIE_TEMP_ADC7_MAX			160000
-
-/**
- * struct vadc_map_pt - Map the graph representation for ADC channel
- * @x: Represent the ADC digitized code.
- * @y: Represent the physical data which can be temperature, voltage,
- *     resistance.
- */
-struct vadc_map_pt {
-	s32 x;
-	s32 y;
-};
-
-/*
- * VADC_CALIB_ABSOLUTE: uses the 625mV and 1.25V as reference channels.
- * VADC_CALIB_RATIOMETRIC: uses the reference voltage (1.8V) and GND for
- * calibration.
- */
-enum vadc_calibration {
-	VADC_CALIB_ABSOLUTE = 0,
-	VADC_CALIB_RATIOMETRIC
-};
-
-/**
- * struct vadc_linear_graph - Represent ADC characteristics.
- * @dy: numerator slope to calculate the gain.
- * @dx: denominator slope to calculate the gain.
- * @gnd: A/D word of the ground reference used for the channel.
- *
- * Each ADC device has different offset and gain parameters which are
- * computed to calibrate the device.
- */
-struct vadc_linear_graph {
-	s32 dy;
-	s32 dx;
-	s32 gnd;
-};
-
-/**
- * struct vadc_prescale_ratio - Represent scaling ratio for ADC input.
- * @num: the inverse numerator of the gain applied to the input channel.
- * @den: the inverse denominator of the gain applied to the input channel.
- */
-struct vadc_prescale_ratio {
-	u32 num;
-	u32 den;
-};
-
-/**
- * enum vadc_scale_fn_type - Scaling function to convert ADC code to
- *				physical scaled units for the channel.
- * SCALE_DEFAULT: Default scaling to convert raw adc code to voltage (uV).
- * SCALE_THERM_100K_PULLUP: Returns temperature in millidegC.
- *				 Uses a mapping table with 100K pullup.
- * SCALE_PMIC_THERM: Returns result in milli degree's Centigrade.
- * SCALE_XOTHERM: Returns XO thermistor voltage in millidegC.
- * SCALE_PMI_CHG_TEMP: Conversion for PMI CHG temp
- * SCALE_HW_CALIB_DEFAULT: Default scaling to convert raw adc code to
- *	voltage (uV) with hardware applied offset/slope values to adc code.
- * SCALE_HW_CALIB_THERM_100K_PULLUP: Returns temperature in millidegC using
- *	lookup table. The hardware applies offset/slope to adc code.
- * SCALE_HW_CALIB_XOTHERM: Returns XO thermistor voltage in millidegC using
- *	100k pullup. The hardware applies offset/slope to adc code.
- * SCALE_HW_CALIB_THERM_100K_PU_PM7: Returns temperature in millidegC using
- *	lookup table for PMIC7. The hardware applies offset/slope to adc code.
- * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade.
- *	The hardware applies offset/slope to adc code.
- * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade.
- *	The hardware applies offset/slope to adc code. This is for PMIC7.
- * SCALE_HW_CALIB_PM5_CHG_TEMP: Returns result in millidegrees for PMIC5
- *	charger temperature.
- * SCALE_HW_CALIB_PM5_SMB_TEMP: Returns result in millidegrees for PMIC5
- *	SMB1390 temperature.
- */
-enum vadc_scale_fn_type {
-	SCALE_DEFAULT = 0,
-	SCALE_THERM_100K_PULLUP,
-	SCALE_PMIC_THERM,
-	SCALE_XOTHERM,
-	SCALE_PMI_CHG_TEMP,
-	SCALE_HW_CALIB_DEFAULT,
-	SCALE_HW_CALIB_THERM_100K_PULLUP,
-	SCALE_HW_CALIB_XOTHERM,
-	SCALE_HW_CALIB_THERM_100K_PU_PM7,
-	SCALE_HW_CALIB_PMIC_THERM,
-	SCALE_HW_CALIB_PMIC_THERM_PM7,
-	SCALE_HW_CALIB_PM5_CHG_TEMP,
-	SCALE_HW_CALIB_PM5_SMB_TEMP,
-	SCALE_HW_CALIB_INVALID,
-};
-
-struct adc5_data {
-	const u32	full_scale_code_volt;
-	const u32	full_scale_code_cur;
-	const struct adc5_channels *adc_chans;
-	const struct iio_info *info;
-	unsigned int	*decimation;
-	unsigned int	*hw_settle_1;
-	unsigned int	*hw_settle_2;
-};
-
-int qcom_vadc_scale(enum vadc_scale_fn_type scaletype,
-		    const struct vadc_linear_graph *calib_graph,
-		    const struct vadc_prescale_ratio *prescale,
-		    bool absolute,
-		    u16 adc_code, int *result_mdec);
-
-struct qcom_adc5_scale_type {
-	int (*scale_fn)(const struct vadc_prescale_ratio *prescale,
-		const struct adc5_data *data, u16 adc_code, int *result);
-};
-
-int qcom_adc5_hw_scale(enum vadc_scale_fn_type scaletype,
-		    unsigned int prescale_ratio,
-		    const struct adc5_data *data,
-		    u16 adc_code, int *result_mdec);
-
-int qcom_adc5_prescaling_from_dt(u32 num, u32 den);
-
-int qcom_adc5_hw_settle_time_from_dt(u32 value, const unsigned int *hw_settle);
-
-int qcom_adc5_avg_samples_from_dt(u32 value);
-
-int qcom_adc5_decimation_from_dt(u32 value, const unsigned int *decimation);
-
-int qcom_vadc_decimation_from_dt(u32 value);
-
-#endif /* QCOM_VADC_COMMON_H */
diff --git a/include/linux/iio/adc/qcom-vadc-common.h b/include/linux/iio/adc/qcom-vadc-common.h
new file mode 100644
index 000000000000..03a9119edc71
--- /dev/null
+++ b/include/linux/iio/adc/qcom-vadc-common.h
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Code shared between the different Qualcomm PMIC voltage ADCs
+ */
+
+#ifndef QCOM_VADC_COMMON_H
+#define QCOM_VADC_COMMON_H
+
+#include <linux/types.h>
+
+#define VADC_CONV_TIME_MIN_US			2000
+#define VADC_CONV_TIME_MAX_US			2100
+
+/* Min ADC code represents 0V */
+#define VADC_MIN_ADC_CODE			0x6000
+/* Max ADC code represents full-scale range of 1.8V */
+#define VADC_MAX_ADC_CODE			0xa800
+
+#define VADC_ABSOLUTE_RANGE_UV			625000
+#define VADC_RATIOMETRIC_RANGE			1800
+
+#define VADC_DEF_PRESCALING			0 /* 1:1 */
+#define VADC_DEF_DECIMATION			0 /* 512 */
+#define VADC_DEF_HW_SETTLE_TIME			0 /* 0 us */
+#define VADC_DEF_AVG_SAMPLES			0 /* 1 sample */
+#define VADC_DEF_CALIB_TYPE			VADC_CALIB_ABSOLUTE
+
+#define VADC_DECIMATION_MIN			512
+#define VADC_DECIMATION_MAX			4096
+#define ADC5_DEF_VBAT_PRESCALING		1 /* 1:3 */
+#define ADC5_DECIMATION_SHORT			250
+#define ADC5_DECIMATION_MEDIUM			420
+#define ADC5_DECIMATION_LONG			840
+/* Default decimation - 1024 for rev2, 840 for pmic5 */
+#define ADC5_DECIMATION_DEFAULT			2
+#define ADC5_DECIMATION_SAMPLES_MAX		3
+
+#define VADC_HW_SETTLE_DELAY_MAX		10000
+#define VADC_HW_SETTLE_SAMPLES_MAX		16
+#define VADC_AVG_SAMPLES_MAX			512
+#define ADC5_AVG_SAMPLES_MAX			16
+
+#define PMIC5_CHG_TEMP_SCALE_FACTOR		377500
+#define PMIC5_SMB_TEMP_CONSTANT			419400
+#define PMIC5_SMB_TEMP_SCALE_FACTOR		356
+
+#define PMI_CHG_SCALE_1				-138890
+#define PMI_CHG_SCALE_2				391750000000LL
+
+#define VADC5_MAX_CODE				0x7fff
+#define ADC5_FULL_SCALE_CODE			0x70e4
+#define ADC5_USR_DATA_CHECK			0x8000
+
+#define R_PU_100K				100000
+#define RATIO_MAX_ADC7				BIT(14)
+
+#define DIE_TEMP_ADC7_SCALE_1			-60000
+#define DIE_TEMP_ADC7_SCALE_2			20000
+#define DIE_TEMP_ADC7_SCALE_FACTOR		1000
+#define DIE_TEMP_ADC7_MAX			160000
+
+/**
+ * struct vadc_map_pt - Map the graph representation for ADC channel
+ * @x: Represent the ADC digitized code.
+ * @y: Represent the physical data which can be temperature, voltage,
+ *     resistance.
+ */
+struct vadc_map_pt {
+	s32 x;
+	s32 y;
+};
+
+/*
+ * VADC_CALIB_ABSOLUTE: uses the 625mV and 1.25V as reference channels.
+ * VADC_CALIB_RATIOMETRIC: uses the reference voltage (1.8V) and GND for
+ * calibration.
+ */
+enum vadc_calibration {
+	VADC_CALIB_ABSOLUTE = 0,
+	VADC_CALIB_RATIOMETRIC
+};
+
+/**
+ * struct vadc_linear_graph - Represent ADC characteristics.
+ * @dy: numerator slope to calculate the gain.
+ * @dx: denominator slope to calculate the gain.
+ * @gnd: A/D word of the ground reference used for the channel.
+ *
+ * Each ADC device has different offset and gain parameters which are
+ * computed to calibrate the device.
+ */
+struct vadc_linear_graph {
+	s32 dy;
+	s32 dx;
+	s32 gnd;
+};
+
+/**
+ * struct vadc_prescale_ratio - Represent scaling ratio for ADC input.
+ * @num: the inverse numerator of the gain applied to the input channel.
+ * @den: the inverse denominator of the gain applied to the input channel.
+ */
+struct vadc_prescale_ratio {
+	u32 num;
+	u32 den;
+};
+
+/**
+ * enum vadc_scale_fn_type - Scaling function to convert ADC code to
+ *				physical scaled units for the channel.
+ * SCALE_DEFAULT: Default scaling to convert raw adc code to voltage (uV).
+ * SCALE_THERM_100K_PULLUP: Returns temperature in millidegC.
+ *				 Uses a mapping table with 100K pullup.
+ * SCALE_PMIC_THERM: Returns result in milli degree's Centigrade.
+ * SCALE_XOTHERM: Returns XO thermistor voltage in millidegC.
+ * SCALE_PMI_CHG_TEMP: Conversion for PMI CHG temp
+ * SCALE_HW_CALIB_DEFAULT: Default scaling to convert raw adc code to
+ *	voltage (uV) with hardware applied offset/slope values to adc code.
+ * SCALE_HW_CALIB_THERM_100K_PULLUP: Returns temperature in millidegC using
+ *	lookup table. The hardware applies offset/slope to adc code.
+ * SCALE_HW_CALIB_XOTHERM: Returns XO thermistor voltage in millidegC using
+ *	100k pullup. The hardware applies offset/slope to adc code.
+ * SCALE_HW_CALIB_THERM_100K_PU_PM7: Returns temperature in millidegC using
+ *	lookup table for PMIC7. The hardware applies offset/slope to adc code.
+ * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade.
+ *	The hardware applies offset/slope to adc code.
+ * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade.
+ *	The hardware applies offset/slope to adc code. This is for PMIC7.
+ * SCALE_HW_CALIB_PM5_CHG_TEMP: Returns result in millidegrees for PMIC5
+ *	charger temperature.
+ * SCALE_HW_CALIB_PM5_SMB_TEMP: Returns result in millidegrees for PMIC5
+ *	SMB1390 temperature.
+ */
+enum vadc_scale_fn_type {
+	SCALE_DEFAULT = 0,
+	SCALE_THERM_100K_PULLUP,
+	SCALE_PMIC_THERM,
+	SCALE_XOTHERM,
+	SCALE_PMI_CHG_TEMP,
+	SCALE_HW_CALIB_DEFAULT,
+	SCALE_HW_CALIB_THERM_100K_PULLUP,
+	SCALE_HW_CALIB_XOTHERM,
+	SCALE_HW_CALIB_THERM_100K_PU_PM7,
+	SCALE_HW_CALIB_PMIC_THERM,
+	SCALE_HW_CALIB_PMIC_THERM_PM7,
+	SCALE_HW_CALIB_PM5_CHG_TEMP,
+	SCALE_HW_CALIB_PM5_SMB_TEMP,
+	SCALE_HW_CALIB_INVALID,
+};
+
+struct adc5_data {
+	const u32	full_scale_code_volt;
+	const u32	full_scale_code_cur;
+	const struct adc5_channels *adc_chans;
+	const struct iio_info *info;
+	unsigned int	*decimation;
+	unsigned int	*hw_settle_1;
+	unsigned int	*hw_settle_2;
+};
+
+int qcom_vadc_scale(enum vadc_scale_fn_type scaletype,
+		    const struct vadc_linear_graph *calib_graph,
+		    const struct vadc_prescale_ratio *prescale,
+		    bool absolute,
+		    u16 adc_code, int *result_mdec);
+
+struct qcom_adc5_scale_type {
+	int (*scale_fn)(const struct vadc_prescale_ratio *prescale,
+		const struct adc5_data *data, u16 adc_code, int *result);
+};
+
+int qcom_adc5_hw_scale(enum vadc_scale_fn_type scaletype,
+		    unsigned int prescale_ratio,
+		    const struct adc5_data *data,
+		    u16 adc_code, int *result_mdec);
+
+int qcom_adc5_prescaling_from_dt(u32 num, u32 den);
+
+int qcom_adc5_hw_settle_time_from_dt(u32 value, const unsigned int *hw_settle);
+
+int qcom_adc5_avg_samples_from_dt(u32 value);
+
+int qcom_adc5_decimation_from_dt(u32 value, const unsigned int *decimation);
+
+int qcom_vadc_decimation_from_dt(u32 value);
+
+#endif /* QCOM_VADC_COMMON_H */
-- 
cgit v1.2.3


From 6e39b145cef7577cd6b550e633155e5fc1e5838e Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Fri, 4 Dec 2020 05:55:01 +0300
Subject: iio: provide of_iio_channel_get_by_name() and devm_ version it

There might be cases when the IIO channel is attached to the device
subnode instead of being attached to the main device node. Allow drivers
to query IIO channels by using device tree nodes.

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20201204025509.1075506-8-dmitry.baryshkov@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/inkern.c         | 34 ++++++++++++++++++++++++++--------
 include/linux/iio/consumer.h | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c
index fe30bcb6a57b..db77a2d4a56b 100644
--- a/drivers/iio/inkern.c
+++ b/drivers/iio/inkern.c
@@ -191,8 +191,8 @@ err_free_channel:
 	return ERR_PTR(err);
 }
 
-static struct iio_channel *of_iio_channel_get_by_name(struct device_node *np,
-						      const char *name)
+struct iio_channel *of_iio_channel_get_by_name(struct device_node *np,
+					       const char *name)
 {
 	struct iio_channel *chan = NULL;
 
@@ -230,6 +230,7 @@ static struct iio_channel *of_iio_channel_get_by_name(struct device_node *np,
 
 	return chan;
 }
+EXPORT_SYMBOL_GPL(of_iio_channel_get_by_name);
 
 static struct iio_channel *of_iio_channel_get_all(struct device *dev)
 {
@@ -272,12 +273,6 @@ error_free_chans:
 
 #else /* CONFIG_OF */
 
-static inline struct iio_channel *
-of_iio_channel_get_by_name(struct device_node *np, const char *name)
-{
-	return NULL;
-}
-
 static inline struct iio_channel *of_iio_channel_get_all(struct device *dev)
 {
 	return NULL;
@@ -393,6 +388,29 @@ struct iio_channel *devm_iio_channel_get(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_iio_channel_get);
 
+struct iio_channel *devm_of_iio_channel_get_by_name(struct device *dev,
+						    struct device_node *np,
+						    const char *channel_name)
+{
+	struct iio_channel **ptr, *channel;
+
+	ptr = devres_alloc(devm_iio_channel_free, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	channel = of_iio_channel_get_by_name(np, channel_name);
+	if (IS_ERR(channel)) {
+		devres_free(ptr);
+		return channel;
+	}
+
+	*ptr = channel;
+	devres_add(dev, ptr);
+
+	return channel;
+}
+EXPORT_SYMBOL_GPL(devm_of_iio_channel_get_by_name);
+
 struct iio_channel *iio_channel_get_all(struct device *dev)
 {
 	const char *name;
diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h
index c4118dcb8e05..0a90ba8fa1bb 100644
--- a/include/linux/iio/consumer.h
+++ b/include/linux/iio/consumer.h
@@ -13,6 +13,7 @@
 struct iio_dev;
 struct iio_chan_spec;
 struct device;
+struct device_node;
 
 /**
  * struct iio_channel - everything needed for a consumer to use a channel
@@ -97,6 +98,41 @@ void iio_channel_release_all(struct iio_channel *chan);
  */
 struct iio_channel *devm_iio_channel_get_all(struct device *dev);
 
+/**
+ * of_iio_channel_get_by_name() - get description of all that is needed to access channel.
+ * @np:			Pointer to consumer device tree node
+ * @consumer_channel:	Unique name to identify the channel on the consumer
+ *			side. This typically describes the channels use within
+ *			the consumer. E.g. 'battery_voltage'
+ */
+#ifdef CONFIG_OF
+struct iio_channel *of_iio_channel_get_by_name(struct device_node *np, const char *name);
+#else
+static inline struct iio_channel *
+of_iio_channel_get_by_name(struct device_node *np, const char *name)
+{
+	return NULL;
+}
+#endif
+
+/**
+ * devm_of_iio_channel_get_by_name() - Resource managed version of of_iio_channel_get_by_name().
+ * @dev:		Pointer to consumer device.
+ * @np:			Pointer to consumer device tree node
+ * @consumer_channel:	Unique name to identify the channel on the consumer
+ *			side. This typically describes the channels use within
+ *			the consumer. E.g. 'battery_voltage'
+ *
+ * Returns a pointer to negative errno if it is not able to get the iio channel
+ * otherwise returns valid pointer for iio channel.
+ *
+ * The allocated iio channel is automatically released when the device is
+ * unbound.
+ */
+struct iio_channel *devm_of_iio_channel_get_by_name(struct device *dev,
+						    struct device_node *np,
+						    const char *consumer_channel);
+
 struct iio_cb_buffer;
 /**
  * iio_channel_get_all_cb() - register callback for triggered capture
-- 
cgit v1.2.3


From bb01e263743293d37148f1fa00e3ea04dca416a5 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Fri, 4 Dec 2020 05:55:02 +0300
Subject: iio: adc: move vadc_map_pt from header to the source file

struct vadc_map_pt is not used outside of qcom-vadc-common.c, so move it
there from the global header file.

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20201204025509.1075506-9-dmitry.baryshkov@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/qcom-vadc-common.c       | 11 +++++++++++
 include/linux/iio/adc/qcom-vadc-common.h | 11 -----------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/adc/qcom-vadc-common.c b/drivers/iio/adc/qcom-vadc-common.c
index ee94774b72e6..45a38602f66a 100644
--- a/drivers/iio/adc/qcom-vadc-common.c
+++ b/drivers/iio/adc/qcom-vadc-common.c
@@ -10,6 +10,17 @@
 #include <linux/module.h>
 #include <linux/units.h>
 
+/**
+ * struct vadc_map_pt - Map the graph representation for ADC channel
+ * @x: Represent the ADC digitized code.
+ * @y: Represent the physical data which can be temperature, voltage,
+ *     resistance.
+ */
+struct vadc_map_pt {
+	s32 x;
+	s32 y;
+};
+
 /* Voltage to temperature */
 static const struct vadc_map_pt adcmap_100k_104ef_104fb[] = {
 	{1758,	-40},
diff --git a/include/linux/iio/adc/qcom-vadc-common.h b/include/linux/iio/adc/qcom-vadc-common.h
index 03a9119edc71..1d337dd9e3dc 100644
--- a/include/linux/iio/adc/qcom-vadc-common.h
+++ b/include/linux/iio/adc/qcom-vadc-common.h
@@ -59,17 +59,6 @@
 #define DIE_TEMP_ADC7_SCALE_FACTOR		1000
 #define DIE_TEMP_ADC7_MAX			160000
 
-/**
- * struct vadc_map_pt - Map the graph representation for ADC channel
- * @x: Represent the ADC digitized code.
- * @y: Represent the physical data which can be temperature, voltage,
- *     resistance.
- */
-struct vadc_map_pt {
-	s32 x;
-	s32 y;
-};
-
 /*
  * VADC_CALIB_ABSOLUTE: uses the 625mV and 1.25V as reference channels.
  * VADC_CALIB_RATIOMETRIC: uses the reference voltage (1.8V) and GND for
-- 
cgit v1.2.3


From 3bd0ceb566f700adc5d164f431d1da039374aa97 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Fri, 4 Dec 2020 05:55:03 +0300
Subject: iio: adc: qcom-vadc-common: rewrite vadc7 die temp calculation

qcom_vadc7_scale_hw_calib_die_temp() uses a table format different from
the rest of volt/temp conversion functions in this file. Also the
conversion functions results in non-monothonic values conversion, which
seems wrong.

Rewrite qcom_vadc7_scale_hw_calib_die_temp() to use
qcom_vadc_map_voltage_temp() directly, like the rest of conversion
functions do.

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20201204025509.1075506-10-dmitry.baryshkov@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/qcom-vadc-common.c       | 50 ++++++++++----------------------
 include/linux/iio/adc/qcom-vadc-common.h |  5 ----
 2 files changed, 15 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/adc/qcom-vadc-common.c b/drivers/iio/adc/qcom-vadc-common.c
index 45a38602f66a..0c705bb473fe 100644
--- a/drivers/iio/adc/qcom-vadc-common.c
+++ b/drivers/iio/adc/qcom-vadc-common.c
@@ -101,18 +101,18 @@ static const struct vadc_map_pt adcmap_100k_104ef_104fb_1875_vref[] = {
 };
 
 static const struct vadc_map_pt adcmap7_die_temp[] = {
-	{ 433700, 1967},
-	{ 473100, 1964},
-	{ 512400, 1957},
-	{ 551500, 1949},
-	{ 590500, 1940},
-	{ 629300, 1930},
-	{ 667900, 1921},
-	{ 706400, 1910},
-	{ 744600, 1896},
-	{ 782500, 1878},
-	{ 820100, 1859},
-	{ 857300, 0},
+	{ 857300, 160000 },
+	{ 820100, 140000 },
+	{ 782500, 120000 },
+	{ 744600, 100000 },
+	{ 706400, 80000 },
+	{ 667900, 60000 },
+	{ 629300, 40000 },
+	{ 590500, 20000 },
+	{ 551500, 0 },
+	{ 512400, -20000 },
+	{ 473100, -40000 },
+	{ 433700, -60000 },
 };
 
 /*
@@ -585,33 +585,13 @@ static int qcom_vadc7_scale_hw_calib_die_temp(
 				u16 adc_code, int *result_mdec)
 {
 
-	int voltage, vtemp0, temp, i;
+	int voltage;
 
 	voltage = qcom_vadc_scale_code_voltage_factor(adc_code,
 				prescale, data, 1);
 
-	if (adcmap7_die_temp[0].x > voltage) {
-		*result_mdec = DIE_TEMP_ADC7_SCALE_1;
-		return 0;
-	}
-
-	if (adcmap7_die_temp[ARRAY_SIZE(adcmap7_die_temp) - 1].x <= voltage) {
-		*result_mdec = DIE_TEMP_ADC7_MAX;
-		return 0;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(adcmap7_die_temp); i++)
-		if (adcmap7_die_temp[i].x > voltage)
-			break;
-
-	vtemp0 = adcmap7_die_temp[i - 1].x;
-	voltage = voltage - vtemp0;
-	temp = div64_s64(voltage * DIE_TEMP_ADC7_SCALE_FACTOR,
-		adcmap7_die_temp[i - 1].y);
-	temp += DIE_TEMP_ADC7_SCALE_1 + (DIE_TEMP_ADC7_SCALE_2 * (i - 1));
-	*result_mdec = temp;
-
-	return 0;
+	return qcom_vadc_map_voltage_temp(adcmap7_die_temp, ARRAY_SIZE(adcmap7_die_temp),
+			voltage, result_mdec);
 }
 
 static int qcom_vadc_scale_hw_smb_temp(
diff --git a/include/linux/iio/adc/qcom-vadc-common.h b/include/linux/iio/adc/qcom-vadc-common.h
index 1d337dd9e3dc..58216124d89d 100644
--- a/include/linux/iio/adc/qcom-vadc-common.h
+++ b/include/linux/iio/adc/qcom-vadc-common.h
@@ -54,11 +54,6 @@
 #define R_PU_100K				100000
 #define RATIO_MAX_ADC7				BIT(14)
 
-#define DIE_TEMP_ADC7_SCALE_1			-60000
-#define DIE_TEMP_ADC7_SCALE_2			20000
-#define DIE_TEMP_ADC7_SCALE_FACTOR		1000
-#define DIE_TEMP_ADC7_MAX			160000
-
 /*
  * VADC_CALIB_ABSOLUTE: uses the 625mV and 1.25V as reference channels.
  * VADC_CALIB_RATIOMETRIC: uses the reference voltage (1.8V) and GND for
-- 
cgit v1.2.3


From bca585d24a1719d9314d5438b0d2804a33d9bbb6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 5 Jan 2021 14:13:52 -0500
Subject: new helper: d_find_alias_rcu()

similar to d_find_alias(inode), except that
	* the caller must be holding rcu_read_lock()
	* inode must not be freed until matching rcu_read_unlock()
	* result is *NOT* pinned and can only be dereferenced until
the matching rcu_read_unlock().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 25 +++++++++++++++++++++++++
 include/linux/dcache.h |  2 ++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 97e81a844a96..843546633457 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1042,6 +1042,31 @@ struct dentry *d_find_alias(struct inode *inode)
 }
 EXPORT_SYMBOL(d_find_alias);
 
+/*
+ *  Caller MUST be holding rcu_read_lock() and be guaranteed
+ *  that inode won't get freed until rcu_read_unlock().
+ */
+struct dentry *d_find_alias_rcu(struct inode *inode)
+{
+	struct hlist_head *l = &inode->i_dentry;
+	struct dentry *de = NULL;
+
+	spin_lock(&inode->i_lock);
+	// ->i_dentry and ->i_rcu are colocated, but the latter won't be
+	// used without having I_FREEING set, which means no aliases left
+	if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) {
+		if (S_ISDIR(inode->i_mode)) {
+			de = hlist_entry(l->first, struct dentry, d_u.d_alias);
+		} else {
+			hlist_for_each_entry(de, l, d_u.d_alias)
+				if (!d_unhashed(de))
+					break;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	return de;
+}
+
 /*
  *	Try to kill dentries associated with this inode.
  * WARNING: you must own a reference to inode.
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index d7b369fc15d3..c1e48014106f 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -262,6 +262,8 @@ extern void d_tmpfile(struct dentry *, struct inode *);
 extern struct dentry *d_find_alias(struct inode *);
 extern void d_prune_aliases(struct inode *);
 
+extern struct dentry *d_find_alias_rcu(struct inode *);
+
 /* test whether we have any submounts in a subdir tree */
 extern int path_has_submounts(const struct path *);
 
-- 
cgit v1.2.3


From 7ae41220ef5831674f446baef19bfe1b31358260 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Mon, 11 Jan 2021 00:17:36 +0100
Subject: rtc: introduce features bitfield

Introduce a bitfield to allow the drivers to announce the available
features for an RTC.

The main use case would be to better handle alarms, that could be present
or not or have a minute resolution or may need a correct week day to be set.

Use the newly introduced RTC_FEATURE_ALARM bit to then test whether alarms
are available instead of relying on the presence of ops->set_alarm.

Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20210110231752.1418816-2-alexandre.belloni@bootlin.com
---
 drivers/rtc/class.c      |  5 +++++
 drivers/rtc/interface.c  | 12 ++++++------
 include/linux/rtc.h      |  2 ++
 include/uapi/linux/rtc.h |  5 +++++
 4 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index 7e470fbd5e4d..03abd0aa229a 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -231,6 +231,8 @@ static struct rtc_device *rtc_allocate_device(void)
 	rtc->pie_timer.function = rtc_pie_update_irq;
 	rtc->pie_enabled = 0;
 
+	set_bit(RTC_FEATURE_ALARM, rtc->features);
+
 	return rtc;
 }
 
@@ -386,6 +388,9 @@ int __devm_rtc_register_device(struct module *owner, struct rtc_device *rtc)
 		return -EINVAL;
 	}
 
+	if (!rtc->ops->set_alarm)
+		clear_bit(RTC_FEATURE_ALARM, rtc->features);
+
 	rtc->owner = owner;
 	rtc_device_get_offset(rtc);
 
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index 794a4f036b99..dcb34c73319e 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -186,7 +186,7 @@ static int rtc_read_alarm_internal(struct rtc_device *rtc,
 
 	if (!rtc->ops) {
 		err = -ENODEV;
-	} else if (!rtc->ops->read_alarm) {
+	} else if (!test_bit(RTC_FEATURE_ALARM, rtc->features) || !rtc->ops->read_alarm) {
 		err = -EINVAL;
 	} else {
 		alarm->enabled = 0;
@@ -392,7 +392,7 @@ int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
 		return err;
 	if (!rtc->ops) {
 		err = -ENODEV;
-	} else if (!rtc->ops->read_alarm) {
+	} else if (!test_bit(RTC_FEATURE_ALARM, rtc->features) || !rtc->ops->read_alarm) {
 		err = -EINVAL;
 	} else {
 		memset(alarm, 0, sizeof(struct rtc_wkalrm));
@@ -436,7 +436,7 @@ static int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
 
 	if (!rtc->ops)
 		err = -ENODEV;
-	else if (!rtc->ops->set_alarm)
+	else if (!test_bit(RTC_FEATURE_ALARM, rtc->features))
 		err = -EINVAL;
 	else
 		err = rtc->ops->set_alarm(rtc->dev.parent, alarm);
@@ -451,7 +451,7 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
 
 	if (!rtc->ops)
 		return -ENODEV;
-	else if (!rtc->ops->set_alarm)
+	else if (!test_bit(RTC_FEATURE_ALARM, rtc->features))
 		return -EINVAL;
 
 	err = rtc_valid_tm(&alarm->time);
@@ -531,7 +531,7 @@ int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled)
 		/* nothing */;
 	else if (!rtc->ops)
 		err = -ENODEV;
-	else if (!rtc->ops->alarm_irq_enable)
+	else if (!test_bit(RTC_FEATURE_ALARM, rtc->features) || !rtc->ops->alarm_irq_enable)
 		err = -EINVAL;
 	else
 		err = rtc->ops->alarm_irq_enable(rtc->dev.parent, enabled);
@@ -843,7 +843,7 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
 
 static void rtc_alarm_disable(struct rtc_device *rtc)
 {
-	if (!rtc->ops || !rtc->ops->alarm_irq_enable)
+	if (!rtc->ops || !test_bit(RTC_FEATURE_ALARM, rtc->features) || !rtc->ops->alarm_irq_enable)
 		return;
 
 	rtc->ops->alarm_irq_enable(rtc->dev.parent, false);
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 568909449c13..bd611e26291d 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -141,6 +141,8 @@ struct rtc_device {
 	 */
 	unsigned long set_offset_nsec;
 
+	unsigned long features[BITS_TO_LONGS(RTC_FEATURE_CNT)];
+
 	time64_t range_min;
 	timeu64_t range_max;
 	time64_t start_secs;
diff --git a/include/uapi/linux/rtc.h b/include/uapi/linux/rtc.h
index fa9aff91cbf2..f950bff75e97 100644
--- a/include/uapi/linux/rtc.h
+++ b/include/uapi/linux/rtc.h
@@ -110,6 +110,11 @@ struct rtc_pll_info {
 #define RTC_AF 0x20	/* Alarm interrupt */
 #define RTC_UF 0x10	/* Update interrupt for 1Hz RTC */
 
+/* feature list */
+#define RTC_FEATURE_ALARM		0
+#define RTC_FEATURE_ALARM_RES_MINUTE	1
+#define RTC_FEATURE_NEED_WEEK_DAY	2
+#define RTC_FEATURE_CNT			3
 
 #define RTC_MAX_FREQ	8192
 
-- 
cgit v1.2.3


From 61712a5f8271ebdbb9d34d28279b77a452db9d5c Mon Sep 17 00:00:00 2001
From: Yue Zou <zouyue3@huawei.com>
Date: Mon, 18 Jan 2021 01:03:55 +0000
Subject: vgaarb: Remove unneeded semicolons

Remove superfluous semicolons after function definitions.

Signed-off-by: Yue Zou <zouyue3@huawei.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210118010356.214491-1-zouyue3@huawei.com
---
 include/linux/vgaarb.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h
index 977caf96c8d2..fc6dfeba04a5 100644
--- a/include/linux/vgaarb.h
+++ b/include/linux/vgaarb.h
@@ -121,9 +121,9 @@ extern struct pci_dev *vga_default_device(void);
 extern void vga_set_default_device(struct pci_dev *pdev);
 extern int vga_remove_vgacon(struct pci_dev *pdev);
 #else
-static inline struct pci_dev *vga_default_device(void) { return NULL; };
-static inline void vga_set_default_device(struct pci_dev *pdev) { };
-static inline int vga_remove_vgacon(struct pci_dev *pdev) { return 0; };
+static inline struct pci_dev *vga_default_device(void) { return NULL; }
+static inline void vga_set_default_device(struct pci_dev *pdev) { }
+static inline int vga_remove_vgacon(struct pci_dev *pdev) { return 0; }
 #endif
 
 /*
-- 
cgit v1.2.3


From 8ff059b8531f3b98e14f0461859fc7cdd95823e4 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 18 Jan 2021 13:38:42 +0100
Subject: efi: ia64: move IA64-only declarations to new asm/efi.h header

Move some EFI related declarations that are only referenced on IA64 to
a new asm/efi.h arch header.

Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/ia64/include/asm/efi.h      | 13 +++++++++++++
 arch/ia64/kernel/efi.c           |  1 +
 arch/ia64/kernel/machine_kexec.c |  1 +
 arch/ia64/kernel/mca.c           |  1 +
 arch/ia64/kernel/smpboot.c       |  1 +
 arch/ia64/kernel/time.c          |  1 +
 arch/ia64/kernel/uncached.c      |  4 +---
 arch/ia64/mm/contig.c            |  1 +
 arch/ia64/mm/discontig.c         |  1 +
 arch/ia64/mm/init.c              |  1 +
 include/linux/efi.h              |  6 ------
 11 files changed, 22 insertions(+), 9 deletions(-)
 create mode 100644 arch/ia64/include/asm/efi.h

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/efi.h b/arch/ia64/include/asm/efi.h
new file mode 100644
index 000000000000..6a4a50d8f19a
--- /dev/null
+++ b/arch/ia64/include/asm/efi.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_EFI_H
+#define _ASM_EFI_H
+
+typedef int (*efi_freemem_callback_t) (u64 start, u64 end, void *arg);
+
+void *efi_get_pal_addr(void);
+void efi_map_pal_code(void);
+void efi_memmap_walk(efi_freemem_callback_t, void *);
+void efi_memmap_walk_uc(efi_freemem_callback_t, void *);
+void efi_gettimeofday(struct timespec64 *ts);
+
+#endif
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index f932b25fb817..dd7fd750bb93 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -34,6 +34,7 @@
 #include <linux/kexec.h>
 #include <linux/mm.h>
 
+#include <asm/efi.h>
 #include <asm/io.h>
 #include <asm/kregs.h>
 #include <asm/meminit.h>
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index efc9b568401c..af310dc8a356 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -16,6 +16,7 @@
 #include <linux/numa.h>
 #include <linux/mmzone.h>
 
+#include <asm/efi.h>
 #include <asm/numa.h>
 #include <asm/mmu_context.h>
 #include <asm/setup.h>
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 2703f7795672..0fea266b4d39 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -91,6 +91,7 @@
 #include <linux/gfp.h>
 
 #include <asm/delay.h>
+#include <asm/efi.h>
 #include <asm/meminit.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 093040f7e626..49b488580939 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -45,6 +45,7 @@
 #include <asm/cache.h>
 #include <asm/current.h>
 #include <asm/delay.h>
+#include <asm/efi.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/mca.h>
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index ed9fc3d057a6..a37f161a66b1 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -26,6 +26,7 @@
 #include <linux/sched/cputime.h>
 
 #include <asm/delay.h>
+#include <asm/efi.h>
 #include <asm/hw_irq.h>
 #include <asm/ptrace.h>
 #include <asm/sal.h>
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index 0750f367837d..51883a66aeb5 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -20,14 +20,12 @@
 #include <linux/genalloc.h>
 #include <linux/gfp.h>
 #include <linux/pgtable.h>
+#include <asm/efi.h>
 #include <asm/page.h>
 #include <asm/pal.h>
 #include <linux/atomic.h>
 #include <asm/tlbflush.h>
 
-
-extern void __init efi_memmap_walk_uc(efi_freemem_callback_t, void *);
-
 struct uncached_pool {
 	struct gen_pool *pool;
 	struct mutex add_chunk_mutex;	/* serialize adding a converted chunk */
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index bfc4ecd0a2ab..62fe80a16f42 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -21,6 +21,7 @@
 #include <linux/swap.h>
 #include <linux/sizes.h>
 
+#include <asm/efi.h>
 #include <asm/meminit.h>
 #include <asm/sections.h>
 #include <asm/mca.h>
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c7311131156e..03b3a02375ff 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -24,6 +24,7 @@
 #include <linux/efi.h>
 #include <linux/nodemask.h>
 #include <linux/slab.h>
+#include <asm/efi.h>
 #include <asm/tlb.h>
 #include <asm/meminit.h>
 #include <asm/numa.h>
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 9b5acf8fb092..24583a39fa1b 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -27,6 +27,7 @@
 #include <linux/swiotlb.h>
 
 #include <asm/dma.h>
+#include <asm/efi.h>
 #include <asm/io.h>
 #include <asm/numa.h>
 #include <asm/patch.h>
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 763b816ba19c..0c31af36697c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -167,8 +167,6 @@ struct capsule_info {
 
 int __efi_capsule_setup_info(struct capsule_info *cap_info);
 
-typedef int (*efi_freemem_callback_t) (u64 start, u64 end, void *arg);
-
 /*
  * Types and defines for Time Services
  */
@@ -605,10 +603,6 @@ efi_guid_to_str(efi_guid_t *guid, char *out)
 }
 
 extern void efi_init (void);
-extern void *efi_get_pal_addr (void);
-extern void efi_map_pal_code (void);
-extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg);
-extern void efi_gettimeofday (struct timespec64 *ts);
 #ifdef CONFIG_EFI
 extern void efi_enter_virtual_mode (void);	/* switch EFI to virtual mode, if possible */
 #else
-- 
cgit v1.2.3


From 84335675f2223cbd25d0de7d38ecc7d40b95bd4a Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 15 Jan 2021 17:47:39 +0100
Subject: dma-buf: Add debug option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have too many people abusing the struct page they can get at but
really shouldn't in importers. Aside from that the backing page might
simply not exist (for dynamic p2p mappings) looking at it and using it
e.g. for mmap can also wreak the page handling of the exporter
completely. Importers really must go through the proper interface like
dma_buf_mmap for everything.

I'm semi-tempted to enforce this for dynamic importers since those
really have no excuse at all to break the rules.

Unfortuantely we can't store the right pointers somewhere safe to make
sure we oops on something recognizable, so best is to just wrangle
them a bit by flipping all the bits. At least on x86 kernel addresses
have all their high bits sets and the struct page array is fairly low
in the kernel mapping, so flipping all the bits gives us a very high
pointer in userspace and hence excellent chances for an invalid
dereference.

v2: Add a note to the @map_dma_buf hook that exporters shouldn't do
fancy caching tricks, which would blow up with this address scrambling
trick here (Chris)

Enable by default when CONFIG_DMA_API_DEBUG is enabled.

v3: Only one copy of the mangle/unmangle code (Christian)

v4: #ifdef, not #if (0day)

v5: sg_table can also be an ERR_PTR (Chris, Christian)

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: "Christian König" <christian.koenig@amd.com>
Cc: David Stevens <stevensd@chromium.org>
Cc: linux-media@vger.kernel.org
Cc: linaro-mm-sig@lists.linaro.org
Link: https://patchwork.freedesktop.org/patch/msgid/20210115164739.3958206-1-daniel.vetter@ffwll.ch
---
 drivers/dma-buf/Kconfig   |  8 ++++++++
 drivers/dma-buf/dma-buf.c | 46 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/dma-buf.h   |  6 ++++++
 3 files changed, 56 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig
index 4f8224a6ac95..4e16c71c24b7 100644
--- a/drivers/dma-buf/Kconfig
+++ b/drivers/dma-buf/Kconfig
@@ -50,6 +50,14 @@ config DMABUF_MOVE_NOTIFY
 	  This is marked experimental because we don't yet have a consistent
 	  execution context and memory management between drivers.
 
+config DMABUF_DEBUG
+	bool "DMA-BUF debug checks"
+	default y if DMA_API_DEBUG
+	help
+	  This option enables additional checks for DMA-BUF importers and
+	  exporters. Specifically it validates that importers do not peek at the
+	  underlying struct page when they import a buffer.
+
 config DMABUF_SELFTESTS
 	tristate "Selftests for the dma-buf interfaces"
 	default n
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index b8465243eca2..3948b88aa123 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -653,6 +653,34 @@ void dma_buf_put(struct dma_buf *dmabuf)
 }
 EXPORT_SYMBOL_GPL(dma_buf_put);
 
+static void mangle_sg_table(struct sg_table *sg_table)
+{
+#ifdef CONFIG_DMABUF_DEBUG
+	int i;
+	struct scatterlist *sg;
+
+	/* To catch abuse of the underlying struct page by importers mix
+	 * up the bits, but take care to preserve the low SG_ bits to
+	 * not corrupt the sgt. The mixing is undone in __unmap_dma_buf
+	 * before passing the sgt back to the exporter. */
+	for_each_sgtable_sg(sg_table, sg, i)
+		sg->page_link ^= ~0xffUL;
+#endif
+
+}
+static struct sg_table * __map_dma_buf(struct dma_buf_attachment *attach,
+				       enum dma_data_direction direction)
+{
+	struct sg_table *sg_table;
+
+	sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction);
+
+	if (!IS_ERR_OR_NULL(sg_table))
+		mangle_sg_table(sg_table);
+
+	return sg_table;
+}
+
 /**
  * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list
  * @dmabuf:		[in]	buffer to attach device to.
@@ -724,7 +752,7 @@ dma_buf_dynamic_attach(struct dma_buf *dmabuf, struct device *dev,
 				goto err_unlock;
 		}
 
-		sgt = dmabuf->ops->map_dma_buf(attach, DMA_BIDIRECTIONAL);
+		sgt = __map_dma_buf(attach, DMA_BIDIRECTIONAL);
 		if (!sgt)
 			sgt = ERR_PTR(-ENOMEM);
 		if (IS_ERR(sgt)) {
@@ -771,6 +799,16 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf,
 }
 EXPORT_SYMBOL_GPL(dma_buf_attach);
 
+static void __unmap_dma_buf(struct dma_buf_attachment *attach,
+			    struct sg_table *sg_table,
+			    enum dma_data_direction direction)
+{
+	/* uses XOR, hence this unmangles */
+	mangle_sg_table(sg_table);
+
+	attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction);
+}
+
 /**
  * dma_buf_detach - Remove the given attachment from dmabuf's attachments list
  * @dmabuf:	[in]	buffer to detach from.
@@ -789,7 +827,7 @@ void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach)
 		if (dma_buf_is_dynamic(attach->dmabuf))
 			dma_resv_lock(attach->dmabuf->resv, NULL);
 
-		dmabuf->ops->unmap_dma_buf(attach, attach->sgt, attach->dir);
+		__unmap_dma_buf(attach, attach->sgt, attach->dir);
 
 		if (dma_buf_is_dynamic(attach->dmabuf)) {
 			dma_buf_unpin(attach);
@@ -911,7 +949,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach,
 		}
 	}
 
-	sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction);
+	sg_table = __map_dma_buf(attach, direction);
 	if (!sg_table)
 		sg_table = ERR_PTR(-ENOMEM);
 
@@ -974,7 +1012,7 @@ void dma_buf_unmap_attachment(struct dma_buf_attachment *attach,
 	if (dma_buf_is_dynamic(attach->dmabuf))
 		dma_resv_assert_held(attach->dmabuf->resv);
 
-	attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction);
+	__unmap_dma_buf(attach, sg_table, direction);
 
 	if (dma_buf_is_dynamic(attach->dmabuf) &&
 	    !IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY))
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 628681bf6c99..efdc56b9d95f 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -154,6 +154,12 @@ struct dma_buf_ops {
 	 * On failure, returns a negative error value wrapped into a pointer.
 	 * May also return -EINTR when a signal was received while being
 	 * blocked.
+	 *
+	 * Note that exporters should not try to cache the scatter list, or
+	 * return the same one for multiple calls. Caching is done either by the
+	 * DMA-BUF code (for non-dynamic importers) or the importer. Ownership
+	 * of the scatter list is transferred to the caller, and returned by
+	 * @unmap_dma_buf.
 	 */
 	struct sg_table * (*map_dma_buf)(struct dma_buf_attachment *,
 					 enum dma_data_direction);
-- 
cgit v1.2.3


From 1b399bb04837183cecdc1b32ef1cfc7fcfa75d32 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Sun, 13 Dec 2020 23:54:46 +0000
Subject: kconfig.h: Add IF_ENABLED() macro

IF_ENABLED(CONFIG_FOO, ptr) evaluates to (ptr) if CONFIG_FOO is set to 'y'
or 'm', NULL otherwise. The (ptr) argument must be a pointer.

The IF_ENABLED() macro can be very useful to help GCC drop dead code.

For instance, consider the following:

    #ifdef CONFIG_FOO_SUSPEND
    static int foo_suspend(struct device *dev)
    {
       ...
    }
    #endif

    static struct pm_ops foo_ops = {
	#ifdef CONFIG_FOO_SUSPEND
        .suspend = foo_suspend,
	#endif
    };

While this works, the foo_suspend() macro is compiled conditionally,
only when CONFIG_FOO_SUSPEND is set. This is problematic, as there could
be a build bug in this function, we wouldn't have a way to know unless
the config option is set.

An alternative is to declare foo_suspend() always, but mark it as maybe
unused:

    static int __maybe_unused foo_suspend(struct device *dev)
    {
       ...
    }

    static struct pm_ops foo_ops = {
	#ifdef CONFIG_FOO_SUSPEND
        .suspend = foo_suspend,
	#endif
    };

Again, this works, but the __maybe_unused attribute is required to
instruct the compiler that the function may not be referenced anywhere,
and is safe to remove without making a fuss about it. This makes the
programmer responsible for tagging the functions that can be
garbage-collected.

With this patch, it is now possible to write the following:

    static int foo_suspend(struct device *dev)
    {
       ...
    }

    static struct pm_ops foo_ops = {
        .suspend = IF_ENABLED(CONFIG_FOO_SUSPEND, foo_suspend),
    };

The foo_suspend() function will now be automatically dropped by the
compiler, and it does not require any specific attribute.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20201213235447.138271-1-paul@crapouillou.net
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/kconfig.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index 9d12c970f18f..e78e17a76dc9 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -72,4 +72,10 @@
  */
 #define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option))
 
+/*
+ * IF_ENABLED(CONFIG_FOO, ptr) evaluates to (ptr) if CONFIG_FOO is set to 'y'
+ * or 'm', NULL otherwise.
+ */
+#define IF_ENABLED(option, ptr) (IS_ENABLED(option) ? (ptr) : NULL)
+
 #endif /* __LINUX_KCONFIG_H */
-- 
cgit v1.2.3


From b33752c300232d7f95dd9a4353947d0c9e6a0e52 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Fri, 15 Jan 2021 09:06:37 -0800
Subject: HID: i2c-hid: Reorganize so ACPI and OF are separate modules

This patch rejiggers the i2c-hid code so that the OF (Open Firmware
aka Device Tree) and ACPI support is separated out a bit.  The OF and
ACPI drivers are now separate modules that wrap the core module.

Essentially, what we're doing here:
* Make "power up" and "power down" a function that can be (optionally)
  implemented by a given user of the i2c-hid core.
* The OF and ACPI modules are drivers on their own, so they implement
  probe / remove / suspend / resume / shutdown.  The core code
  provides implementations that OF and ACPI can call into.

We'll organize this so that we now have 3 modules: the old i2c-hid
module becomes the "core" module and two new modules will depend on
it, handling probing the specific device.

As part of this work, we'll remove the i2c-hid "platform data"
concept since it's not needed.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/Makefile                  |   2 +-
 drivers/hid/i2c-hid/Kconfig           |  32 ++++-
 drivers/hid/i2c-hid/Makefile          |   5 +-
 drivers/hid/i2c-hid/i2c-hid-acpi.c    | 143 +++++++++++++++++++
 drivers/hid/i2c-hid/i2c-hid-core.c    | 252 ++++++----------------------------
 drivers/hid/i2c-hid/i2c-hid-of.c      | 143 +++++++++++++++++++
 drivers/hid/i2c-hid/i2c-hid.h         |  22 +++
 include/linux/platform_data/i2c-hid.h |  41 ------
 8 files changed, 379 insertions(+), 261 deletions(-)
 create mode 100644 drivers/hid/i2c-hid/i2c-hid-acpi.c
 create mode 100644 drivers/hid/i2c-hid/i2c-hid-of.c
 delete mode 100644 include/linux/platform_data/i2c-hid.h

(limited to 'include/linux')

diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile
index 014d21fe7dac..a0621a4a65cd 100644
--- a/drivers/hid/Makefile
+++ b/drivers/hid/Makefile
@@ -138,7 +138,7 @@ obj-$(CONFIG_USB_HID)		+= usbhid/
 obj-$(CONFIG_USB_MOUSE)		+= usbhid/
 obj-$(CONFIG_USB_KBD)		+= usbhid/
 
-obj-$(CONFIG_I2C_HID)		+= i2c-hid/
+obj-$(CONFIG_I2C_HID_CORE)	+= i2c-hid/
 
 obj-$(CONFIG_INTEL_ISH_HID)	+= intel-ish-hid/
 obj-$(INTEL_ISH_FIRMWARE_DOWNLOADER)	+= intel-ish-hid/
diff --git a/drivers/hid/i2c-hid/Kconfig b/drivers/hid/i2c-hid/Kconfig
index c4e5dfeab2bd..819b7521c182 100644
--- a/drivers/hid/i2c-hid/Kconfig
+++ b/drivers/hid/i2c-hid/Kconfig
@@ -2,18 +2,40 @@
 menu "I2C HID support"
 	depends on I2C
 
-config I2C_HID
-	tristate "HID over I2C transport layer"
+config I2C_HID_ACPI
+	tristate "HID over I2C transport layer ACPI driver"
 	default n
-	depends on I2C && INPUT
-	select HID
+	depends on I2C && INPUT && ACPI
+	help
+	  Say Y here if you use a keyboard, a touchpad, a touchscreen, or any
+	  other HID based devices which is connected to your computer via I2C.
+	  This driver supports ACPI-based systems.
+
+	  If unsure, say N.
+
+	  This support is also available as a module.  If so, the module
+	  will be called i2c-hid-acpi.  It will also build/depend on the
+	  module i2c-hid.
+
+config I2C_HID_OF
+	tristate "HID over I2C transport layer Open Firmware driver"
+	default n
+	depends on I2C && INPUT && OF
 	help
 	  Say Y here if you use a keyboard, a touchpad, a touchscreen, or any
 	  other HID based devices which is connected to your computer via I2C.
+	  This driver supports Open Firmware (Device Tree)-based systems.
 
 	  If unsure, say N.
 
 	  This support is also available as a module.  If so, the module
-	  will be called i2c-hid.
+	  will be called i2c-hid-of.  It will also build/depend on the
+	  module i2c-hid.
 
 endmenu
+
+config I2C_HID_CORE
+	tristate
+	default y if I2C_HID_ACPI=y || I2C_HID_OF=y
+	default m if I2C_HID_ACPI=m || I2C_HID_OF=m
+	select HID
diff --git a/drivers/hid/i2c-hid/Makefile b/drivers/hid/i2c-hid/Makefile
index 681b3896898e..9b4a73446841 100644
--- a/drivers/hid/i2c-hid/Makefile
+++ b/drivers/hid/i2c-hid/Makefile
@@ -3,7 +3,10 @@
 # Makefile for the I2C input drivers
 #
 
-obj-$(CONFIG_I2C_HID)				+= i2c-hid.o
+obj-$(CONFIG_I2C_HID_CORE)			+= i2c-hid.o
 
 i2c-hid-objs					=  i2c-hid-core.o
 i2c-hid-$(CONFIG_DMI)				+= i2c-hid-dmi-quirks.o
+
+obj-$(CONFIG_I2C_HID_ACPI)			+= i2c-hid-acpi.o
+obj-$(CONFIG_I2C_HID_OF)			+= i2c-hid-of.o
diff --git a/drivers/hid/i2c-hid/i2c-hid-acpi.c b/drivers/hid/i2c-hid/i2c-hid-acpi.c
new file mode 100644
index 000000000000..bb8c00e6be78
--- /dev/null
+++ b/drivers/hid/i2c-hid/i2c-hid-acpi.c
@@ -0,0 +1,143 @@
+/*
+ * HID over I2C ACPI Subclass
+ *
+ * Copyright (c) 2012 Benjamin Tissoires <benjamin.tissoires@gmail.com>
+ * Copyright (c) 2012 Ecole Nationale de l'Aviation Civile, France
+ * Copyright (c) 2012 Red Hat, Inc
+ *
+ * This code was forked out of the core code, which was partly based on
+ * "USB HID support for Linux":
+ *
+ *  Copyright (c) 1999 Andreas Gal
+ *  Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz>
+ *  Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc
+ *  Copyright (c) 2007-2008 Oliver Neukum
+ *  Copyright (c) 2006-2010 Jiri Kosina
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive for
+ * more details.
+ */
+
+#include <linux/acpi.h>
+#include <linux/device.h>
+#include <linux/i2c.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pm.h>
+
+#include "i2c-hid.h"
+
+struct i2c_hid_acpi {
+	struct i2chid_ops ops;
+	struct i2c_client *client;
+};
+
+static const struct acpi_device_id i2c_hid_acpi_blacklist[] = {
+	/*
+	 * The CHPN0001 ACPI device, which is used to describe the Chipone
+	 * ICN8505 controller, has a _CID of PNP0C50 but is not HID compatible.
+	 */
+	{"CHPN0001", 0 },
+	{ },
+};
+
+static int i2c_hid_acpi_get_descriptor(struct i2c_client *client)
+{
+	static guid_t i2c_hid_guid =
+		GUID_INIT(0x3CDFF6F7, 0x4267, 0x4555,
+			  0xAD, 0x05, 0xB3, 0x0A, 0x3D, 0x89, 0x38, 0xDE);
+	union acpi_object *obj;
+	struct acpi_device *adev;
+	acpi_handle handle;
+	u16 hid_descriptor_address;
+
+	handle = ACPI_HANDLE(&client->dev);
+	if (!handle || acpi_bus_get_device(handle, &adev)) {
+		dev_err(&client->dev, "Error could not get ACPI device\n");
+		return -ENODEV;
+	}
+
+	if (acpi_match_device_ids(adev, i2c_hid_acpi_blacklist) == 0)
+		return -ENODEV;
+
+	obj = acpi_evaluate_dsm_typed(handle, &i2c_hid_guid, 1, 1, NULL,
+				      ACPI_TYPE_INTEGER);
+	if (!obj) {
+		dev_err(&client->dev, "Error _DSM call to get HID descriptor address failed\n");
+		return -ENODEV;
+	}
+
+	hid_descriptor_address = obj->integer.value;
+	ACPI_FREE(obj);
+
+	return hid_descriptor_address;
+}
+
+static void i2c_hid_acpi_shutdown_tail(struct i2chid_ops *ops)
+{
+	struct i2c_hid_acpi *ihid_acpi =
+		container_of(ops, struct i2c_hid_acpi, ops);
+	struct device *dev = &ihid_acpi->client->dev;
+	acpi_device_set_power(ACPI_COMPANION(dev), ACPI_STATE_D3_COLD);
+}
+
+static int i2c_hid_acpi_probe(struct i2c_client *client,
+			      const struct i2c_device_id *dev_id)
+{
+	struct device *dev = &client->dev;
+	struct i2c_hid_acpi *ihid_acpi;
+	struct acpi_device *adev;
+	u16 hid_descriptor_address;
+	int ret;
+
+	ihid_acpi = devm_kzalloc(&client->dev, sizeof(*ihid_acpi), GFP_KERNEL);
+	if (!ihid_acpi)
+		return -ENOMEM;
+
+	ihid_acpi->client = client;
+	ihid_acpi->ops.shutdown_tail = i2c_hid_acpi_shutdown_tail;
+
+	ret = i2c_hid_acpi_get_descriptor(client);
+	if (ret < 0)
+		return ret;
+	hid_descriptor_address = ret;
+
+	adev = ACPI_COMPANION(dev);
+	if (adev)
+		acpi_device_fix_up_power(adev);
+
+	if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) {
+		device_set_wakeup_capable(dev, true);
+		device_set_wakeup_enable(dev, false);
+	}
+
+	return i2c_hid_core_probe(client, &ihid_acpi->ops,
+				  hid_descriptor_address);
+}
+
+static const struct acpi_device_id i2c_hid_acpi_match[] = {
+	{"ACPI0C50", 0 },
+	{"PNP0C50", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(acpi, i2c_hid_acpi_match);
+
+static struct i2c_driver i2c_hid_acpi_driver = {
+	.driver = {
+		.name	= "i2c_hid_acpi",
+		.pm	= &i2c_hid_core_pm,
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+		.acpi_match_table = ACPI_PTR(i2c_hid_acpi_match),
+	},
+
+	.probe		= i2c_hid_acpi_probe,
+	.remove		= i2c_hid_core_remove,
+	.shutdown	= i2c_hid_core_shutdown,
+};
+
+module_i2c_driver(i2c_hid_acpi_driver);
+
+MODULE_DESCRIPTION("HID over I2C ACPI driver");
+MODULE_AUTHOR("Benjamin Tissoires <benjamin.tissoires@gmail.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c
index bfe716d7ea44..b64441db29a8 100644
--- a/drivers/hid/i2c-hid/i2c-hid-core.c
+++ b/drivers/hid/i2c-hid/i2c-hid-core.c
@@ -35,11 +35,6 @@
 #include <linux/kernel.h>
 #include <linux/hid.h>
 #include <linux/mutex.h>
-#include <linux/acpi.h>
-#include <linux/of.h>
-#include <linux/regulator/consumer.h>
-
-#include <linux/platform_data/i2c-hid.h>
 
 #include "../hid-ids.h"
 #include "i2c-hid.h"
@@ -156,10 +151,10 @@ struct i2c_hid {
 
 	wait_queue_head_t	wait;		/* For waiting the interrupt */
 
-	struct i2c_hid_platform_data pdata;
-
 	bool			irq_wake_enabled;
 	struct mutex		reset_lock;
+
+	struct i2chid_ops	*ops;
 };
 
 static const struct i2c_hid_quirks {
@@ -884,144 +879,36 @@ static int i2c_hid_fetch_hid_descriptor(struct i2c_hid *ihid)
 	return 0;
 }
 
-#ifdef CONFIG_ACPI
-static const struct acpi_device_id i2c_hid_acpi_blacklist[] = {
-	/*
-	 * The CHPN0001 ACPI device, which is used to describe the Chipone
-	 * ICN8505 controller, has a _CID of PNP0C50 but is not HID compatible.
-	 */
-	{"CHPN0001", 0 },
-	{ },
-};
-
-static int i2c_hid_acpi_pdata(struct i2c_client *client,
-		struct i2c_hid_platform_data *pdata)
-{
-	static guid_t i2c_hid_guid =
-		GUID_INIT(0x3CDFF6F7, 0x4267, 0x4555,
-			  0xAD, 0x05, 0xB3, 0x0A, 0x3D, 0x89, 0x38, 0xDE);
-	union acpi_object *obj;
-	struct acpi_device *adev;
-	acpi_handle handle;
-
-	handle = ACPI_HANDLE(&client->dev);
-	if (!handle || acpi_bus_get_device(handle, &adev)) {
-		dev_err(&client->dev, "Error could not get ACPI device\n");
-		return -ENODEV;
-	}
-
-	if (acpi_match_device_ids(adev, i2c_hid_acpi_blacklist) == 0)
-		return -ENODEV;
-
-	obj = acpi_evaluate_dsm_typed(handle, &i2c_hid_guid, 1, 1, NULL,
-				      ACPI_TYPE_INTEGER);
-	if (!obj) {
-		dev_err(&client->dev, "Error _DSM call to get HID descriptor address failed\n");
-		return -ENODEV;
-	}
-
-	pdata->hid_descriptor_address = obj->integer.value;
-	ACPI_FREE(obj);
-
-	return 0;
-}
-
-static void i2c_hid_acpi_fix_up_power(struct device *dev)
-{
-	struct acpi_device *adev;
-
-	adev = ACPI_COMPANION(dev);
-	if (adev)
-		acpi_device_fix_up_power(adev);
-}
-
-static void i2c_hid_acpi_enable_wakeup(struct device *dev)
-{
-	if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) {
-		device_set_wakeup_capable(dev, true);
-		device_set_wakeup_enable(dev, false);
-	}
-}
-
-static void i2c_hid_acpi_shutdown(struct device *dev)
+static int i2c_hid_core_power_up(struct i2c_hid *ihid)
 {
-	acpi_device_set_power(ACPI_COMPANION(dev), ACPI_STATE_D3_COLD);
-}
+	if (!ihid->ops->power_up)
+		return 0;
 
-static const struct acpi_device_id i2c_hid_acpi_match[] = {
-	{"ACPI0C50", 0 },
-	{"PNP0C50", 0 },
-	{ },
-};
-MODULE_DEVICE_TABLE(acpi, i2c_hid_acpi_match);
-#else
-static inline int i2c_hid_acpi_pdata(struct i2c_client *client,
-		struct i2c_hid_platform_data *pdata)
-{
-	return -ENODEV;
+	return ihid->ops->power_up(ihid->ops);
 }
 
-static inline void i2c_hid_acpi_fix_up_power(struct device *dev) {}
-
-static inline void i2c_hid_acpi_enable_wakeup(struct device *dev) {}
-
-static inline void i2c_hid_acpi_shutdown(struct device *dev) {}
-#endif
-
-#ifdef CONFIG_OF
-static int i2c_hid_of_probe(struct i2c_client *client,
-		struct i2c_hid_platform_data *pdata)
+static void i2c_hid_core_power_down(struct i2c_hid *ihid)
 {
-	struct device *dev = &client->dev;
-	u32 val;
-	int ret;
-
-	ret = of_property_read_u32(dev->of_node, "hid-descr-addr", &val);
-	if (ret) {
-		dev_err(&client->dev, "HID register address not provided\n");
-		return -ENODEV;
-	}
-	if (val >> 16) {
-		dev_err(&client->dev, "Bad HID register address: 0x%08x\n",
-			val);
-		return -EINVAL;
-	}
-	pdata->hid_descriptor_address = val;
-
-	return 0;
-}
+	if (!ihid->ops->power_down)
+		return;
 
-static const struct of_device_id i2c_hid_of_match[] = {
-	{ .compatible = "hid-over-i2c" },
-	{},
-};
-MODULE_DEVICE_TABLE(of, i2c_hid_of_match);
-#else
-static inline int i2c_hid_of_probe(struct i2c_client *client,
-		struct i2c_hid_platform_data *pdata)
-{
-	return -ENODEV;
+	ihid->ops->power_down(ihid->ops);
 }
-#endif
 
-static void i2c_hid_fwnode_probe(struct i2c_client *client,
-				 struct i2c_hid_platform_data *pdata)
+static void i2c_hid_core_shutdown_tail(struct i2c_hid *ihid)
 {
-	u32 val;
+	if (!ihid->ops->shutdown_tail)
+		return;
 
-	if (!device_property_read_u32(&client->dev, "post-power-on-delay-ms",
-				      &val))
-		pdata->post_power_delay_ms = val;
+	ihid->ops->shutdown_tail(ihid->ops);
 }
 
-static int i2c_hid_probe(struct i2c_client *client,
-			 const struct i2c_device_id *dev_id)
+int i2c_hid_core_probe(struct i2c_client *client, struct i2chid_ops *ops,
+		       u16 hid_descriptor_address)
 {
 	int ret;
 	struct i2c_hid *ihid;
 	struct hid_device *hid;
-	__u16 hidRegister;
-	struct i2c_hid_platform_data *platform_data = client->dev.platform_data;
 
 	dbg_hid("HID probe called for i2c 0x%02x\n", client->addr);
 
@@ -1042,44 +929,17 @@ static int i2c_hid_probe(struct i2c_client *client,
 	if (!ihid)
 		return -ENOMEM;
 
-	if (client->dev.of_node) {
-		ret = i2c_hid_of_probe(client, &ihid->pdata);
-		if (ret)
-			return ret;
-	} else if (!platform_data) {
-		ret = i2c_hid_acpi_pdata(client, &ihid->pdata);
-		if (ret)
-			return ret;
-	} else {
-		ihid->pdata = *platform_data;
-	}
-
-	/* Parse platform agnostic common properties from ACPI / device tree */
-	i2c_hid_fwnode_probe(client, &ihid->pdata);
-
-	ihid->pdata.supplies[0].supply = "vdd";
-	ihid->pdata.supplies[1].supply = "vddl";
+	ihid->ops = ops;
 
-	ret = devm_regulator_bulk_get(&client->dev,
-				      ARRAY_SIZE(ihid->pdata.supplies),
-				      ihid->pdata.supplies);
+	ret = i2c_hid_core_power_up(ihid);
 	if (ret)
 		return ret;
 
-	ret = regulator_bulk_enable(ARRAY_SIZE(ihid->pdata.supplies),
-				    ihid->pdata.supplies);
-	if (ret < 0)
-		return ret;
-
-	if (ihid->pdata.post_power_delay_ms)
-		msleep(ihid->pdata.post_power_delay_ms);
-
 	i2c_set_clientdata(client, ihid);
 
 	ihid->client = client;
 
-	hidRegister = ihid->pdata.hid_descriptor_address;
-	ihid->wHIDDescRegister = cpu_to_le16(hidRegister);
+	ihid->wHIDDescRegister = cpu_to_le16(hid_descriptor_address);
 
 	init_waitqueue_head(&ihid->wait);
 	mutex_init(&ihid->reset_lock);
@@ -1089,11 +949,7 @@ static int i2c_hid_probe(struct i2c_client *client,
 	 * real computation later. */
 	ret = i2c_hid_alloc_buffers(ihid, HID_MIN_BUFFER_SIZE);
 	if (ret < 0)
-		goto err_regulator;
-
-	i2c_hid_acpi_fix_up_power(&client->dev);
-
-	i2c_hid_acpi_enable_wakeup(&client->dev);
+		goto err_powered;
 
 	device_enable_async_suspend(&client->dev);
 
@@ -1102,19 +958,19 @@ static int i2c_hid_probe(struct i2c_client *client,
 	if (ret < 0) {
 		dev_dbg(&client->dev, "nothing at this address: %d\n", ret);
 		ret = -ENXIO;
-		goto err_regulator;
+		goto err_powered;
 	}
 
 	ret = i2c_hid_fetch_hid_descriptor(ihid);
 	if (ret < 0) {
 		dev_err(&client->dev,
 			"Failed to fetch the HID Descriptor\n");
-		goto err_regulator;
+		goto err_powered;
 	}
 
 	ret = i2c_hid_init_irq(client);
 	if (ret < 0)
-		goto err_regulator;
+		goto err_powered;
 
 	hid = hid_allocate_device();
 	if (IS_ERR(hid)) {
@@ -1153,14 +1009,14 @@ err_mem_free:
 err_irq:
 	free_irq(client->irq, ihid);
 
-err_regulator:
-	regulator_bulk_disable(ARRAY_SIZE(ihid->pdata.supplies),
-			       ihid->pdata.supplies);
+err_powered:
+	i2c_hid_core_power_down(ihid);
 	i2c_hid_free_buffers(ihid);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(i2c_hid_core_probe);
 
-static int i2c_hid_remove(struct i2c_client *client)
+int i2c_hid_core_remove(struct i2c_client *client)
 {
 	struct i2c_hid *ihid = i2c_get_clientdata(client);
 	struct hid_device *hid;
@@ -1173,24 +1029,25 @@ static int i2c_hid_remove(struct i2c_client *client)
 	if (ihid->bufsize)
 		i2c_hid_free_buffers(ihid);
 
-	regulator_bulk_disable(ARRAY_SIZE(ihid->pdata.supplies),
-			       ihid->pdata.supplies);
+	i2c_hid_core_power_down(ihid);
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(i2c_hid_core_remove);
 
-static void i2c_hid_shutdown(struct i2c_client *client)
+void i2c_hid_core_shutdown(struct i2c_client *client)
 {
 	struct i2c_hid *ihid = i2c_get_clientdata(client);
 
 	i2c_hid_set_power(client, I2C_HID_PWR_SLEEP);
 	free_irq(client->irq, ihid);
 
-	i2c_hid_acpi_shutdown(&client->dev);
+	i2c_hid_core_shutdown_tail(ihid);
 }
+EXPORT_SYMBOL_GPL(i2c_hid_core_shutdown);
 
 #ifdef CONFIG_PM_SLEEP
-static int i2c_hid_suspend(struct device *dev)
+static int i2c_hid_core_suspend(struct device *dev)
 {
 	struct i2c_client *client = to_i2c_client(dev);
 	struct i2c_hid *ihid = i2c_get_clientdata(client);
@@ -1217,14 +1074,13 @@ static int i2c_hid_suspend(struct device *dev)
 			hid_warn(hid, "Failed to enable irq wake: %d\n",
 				wake_status);
 	} else {
-		regulator_bulk_disable(ARRAY_SIZE(ihid->pdata.supplies),
-				       ihid->pdata.supplies);
+		i2c_hid_core_power_down(ihid);
 	}
 
 	return 0;
 }
 
-static int i2c_hid_resume(struct device *dev)
+static int i2c_hid_core_resume(struct device *dev)
 {
 	int ret;
 	struct i2c_client *client = to_i2c_client(dev);
@@ -1233,13 +1089,7 @@ static int i2c_hid_resume(struct device *dev)
 	int wake_status;
 
 	if (!device_may_wakeup(&client->dev)) {
-		ret = regulator_bulk_enable(ARRAY_SIZE(ihid->pdata.supplies),
-					    ihid->pdata.supplies);
-		if (ret)
-			hid_warn(hid, "Failed to enable supplies: %d\n", ret);
-
-		if (ihid->pdata.post_power_delay_ms)
-			msleep(ihid->pdata.post_power_delay_ms);
+		i2c_hid_core_power_up(ihid);
 	} else if (ihid->irq_wake_enabled) {
 		wake_status = disable_irq_wake(client->irq);
 		if (!wake_status)
@@ -1276,34 +1126,10 @@ static int i2c_hid_resume(struct device *dev)
 }
 #endif
 
-static const struct dev_pm_ops i2c_hid_pm = {
-	SET_SYSTEM_SLEEP_PM_OPS(i2c_hid_suspend, i2c_hid_resume)
+const struct dev_pm_ops i2c_hid_core_pm = {
+	SET_SYSTEM_SLEEP_PM_OPS(i2c_hid_core_suspend, i2c_hid_core_resume)
 };
-
-static const struct i2c_device_id i2c_hid_id_table[] = {
-	{ "hid", 0 },
-	{ "hid-over-i2c", 0 },
-	{ },
-};
-MODULE_DEVICE_TABLE(i2c, i2c_hid_id_table);
-
-
-static struct i2c_driver i2c_hid_driver = {
-	.driver = {
-		.name	= "i2c_hid",
-		.pm	= &i2c_hid_pm,
-		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
-		.acpi_match_table = ACPI_PTR(i2c_hid_acpi_match),
-		.of_match_table = of_match_ptr(i2c_hid_of_match),
-	},
-
-	.probe		= i2c_hid_probe,
-	.remove		= i2c_hid_remove,
-	.shutdown	= i2c_hid_shutdown,
-	.id_table	= i2c_hid_id_table,
-};
-
-module_i2c_driver(i2c_hid_driver);
+EXPORT_SYMBOL_GPL(i2c_hid_core_pm);
 
 MODULE_DESCRIPTION("HID over I2C core driver");
 MODULE_AUTHOR("Benjamin Tissoires <benjamin.tissoires@gmail.com>");
diff --git a/drivers/hid/i2c-hid/i2c-hid-of.c b/drivers/hid/i2c-hid/i2c-hid-of.c
new file mode 100644
index 000000000000..4bf7cea92637
--- /dev/null
+++ b/drivers/hid/i2c-hid/i2c-hid-of.c
@@ -0,0 +1,143 @@
+/*
+ * HID over I2C Open Firmware Subclass
+ *
+ * Copyright (c) 2012 Benjamin Tissoires <benjamin.tissoires@gmail.com>
+ * Copyright (c) 2012 Ecole Nationale de l'Aviation Civile, France
+ * Copyright (c) 2012 Red Hat, Inc
+ *
+ * This code was forked out of the core code, which was partly based on
+ * "USB HID support for Linux":
+ *
+ *  Copyright (c) 1999 Andreas Gal
+ *  Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz>
+ *  Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc
+ *  Copyright (c) 2007-2008 Oliver Neukum
+ *  Copyright (c) 2006-2010 Jiri Kosina
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive for
+ * more details.
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/i2c.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/pm.h>
+#include <linux/regulator/consumer.h>
+
+#include "i2c-hid.h"
+
+struct i2c_hid_of {
+	struct i2chid_ops ops;
+
+	struct i2c_client *client;
+	struct regulator_bulk_data supplies[2];
+	int post_power_delay_ms;
+};
+
+static int i2c_hid_of_power_up(struct i2chid_ops *ops)
+{
+	struct i2c_hid_of *ihid_of = container_of(ops, struct i2c_hid_of, ops);
+	struct device *dev = &ihid_of->client->dev;
+	int ret;
+
+	ret = regulator_bulk_enable(ARRAY_SIZE(ihid_of->supplies),
+				    ihid_of->supplies);
+	if (ret) {
+		dev_warn(dev, "Failed to enable supplies: %d\n", ret);
+		return ret;
+	}
+
+	if (ihid_of->post_power_delay_ms)
+		msleep(ihid_of->post_power_delay_ms);
+
+	return 0;
+}
+
+static void i2c_hid_of_power_down(struct i2chid_ops *ops)
+{
+	struct i2c_hid_of *ihid_of = container_of(ops, struct i2c_hid_of, ops);
+
+	regulator_bulk_disable(ARRAY_SIZE(ihid_of->supplies),
+			       ihid_of->supplies);
+}
+
+static int i2c_hid_of_probe(struct i2c_client *client,
+			    const struct i2c_device_id *dev_id)
+{
+	struct device *dev = &client->dev;
+	struct i2c_hid_of *ihid_of;
+	u16 hid_descriptor_address;
+	int ret;
+	u32 val;
+
+	ihid_of = devm_kzalloc(&client->dev, sizeof(*ihid_of), GFP_KERNEL);
+	if (!ihid_of)
+		return -ENOMEM;
+
+	ihid_of->ops.power_up = i2c_hid_of_power_up;
+	ihid_of->ops.power_down = i2c_hid_of_power_down;
+
+	ret = of_property_read_u32(dev->of_node, "hid-descr-addr", &val);
+	if (ret) {
+		dev_err(&client->dev, "HID register address not provided\n");
+		return -ENODEV;
+	}
+	if (val >> 16) {
+		dev_err(&client->dev, "Bad HID register address: 0x%08x\n",
+			val);
+		return -EINVAL;
+	}
+	hid_descriptor_address = val;
+
+	if (!device_property_read_u32(&client->dev, "post-power-on-delay-ms",
+				      &val))
+		ihid_of->post_power_delay_ms = val;
+
+	ihid_of->supplies[0].supply = "vdd";
+	ihid_of->supplies[1].supply = "vddl";
+	ret = devm_regulator_bulk_get(&client->dev,
+				      ARRAY_SIZE(ihid_of->supplies),
+				      ihid_of->supplies);
+	if (ret)
+		return ret;
+
+	return i2c_hid_core_probe(client, &ihid_of->ops,
+				  hid_descriptor_address);
+}
+
+static const struct of_device_id i2c_hid_of_match[] = {
+	{ .compatible = "hid-over-i2c" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, i2c_hid_of_match);
+
+static const struct i2c_device_id i2c_hid_of_id_table[] = {
+	{ "hid", 0 },
+	{ "hid-over-i2c", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(i2c, i2c_hid_of_id_table);
+
+static struct i2c_driver i2c_hid_of_driver = {
+	.driver = {
+		.name	= "i2c_hid_of",
+		.pm	= &i2c_hid_core_pm,
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+		.of_match_table = of_match_ptr(i2c_hid_of_match),
+	},
+
+	.probe		= i2c_hid_of_probe,
+	.remove		= i2c_hid_core_remove,
+	.shutdown	= i2c_hid_core_shutdown,
+	.id_table	= i2c_hid_of_id_table,
+};
+
+module_i2c_driver(i2c_hid_of_driver);
+
+MODULE_DESCRIPTION("HID over I2C OF driver");
+MODULE_AUTHOR("Benjamin Tissoires <benjamin.tissoires@gmail.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/hid/i2c-hid/i2c-hid.h b/drivers/hid/i2c-hid/i2c-hid.h
index a8c19aef5824..05a7827d211a 100644
--- a/drivers/hid/i2c-hid/i2c-hid.h
+++ b/drivers/hid/i2c-hid/i2c-hid.h
@@ -3,6 +3,7 @@
 #ifndef I2C_HID_H
 #define I2C_HID_H
 
+#include <linux/i2c.h>
 
 #ifdef CONFIG_DMI
 struct i2c_hid_desc *i2c_hid_get_dmi_i2c_hid_desc_override(uint8_t *i2c_name);
@@ -17,4 +18,25 @@ static inline char *i2c_hid_get_dmi_hid_report_desc_override(uint8_t *i2c_name,
 { return NULL; }
 #endif
 
+/**
+ * struct i2chid_ops - Ops provided to the core.
+ *
+ * @power_up: do sequencing to power up the device.
+ * @power_down: do sequencing to power down the device.
+ * @shutdown_tail: called at the end of shutdown.
+ */
+struct i2chid_ops {
+	int (*power_up)(struct i2chid_ops *ops);
+	void (*power_down)(struct i2chid_ops *ops);
+	void (*shutdown_tail)(struct i2chid_ops *ops);
+};
+
+int i2c_hid_core_probe(struct i2c_client *client, struct i2chid_ops *ops,
+		       u16 hid_descriptor_address);
+int i2c_hid_core_remove(struct i2c_client *client);
+
+void i2c_hid_core_shutdown(struct i2c_client *client);
+
+extern const struct dev_pm_ops i2c_hid_core_pm;
+
 #endif
diff --git a/include/linux/platform_data/i2c-hid.h b/include/linux/platform_data/i2c-hid.h
deleted file mode 100644
index c628bb5e1061..000000000000
--- a/include/linux/platform_data/i2c-hid.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * HID over I2C protocol implementation
- *
- * Copyright (c) 2012 Benjamin Tissoires <benjamin.tissoires@gmail.com>
- * Copyright (c) 2012 Ecole Nationale de l'Aviation Civile, France
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive for
- * more details.
- */
-
-#ifndef __LINUX_I2C_HID_H
-#define __LINUX_I2C_HID_H
-
-#include <linux/regulator/consumer.h>
-#include <linux/types.h>
-
-/**
- * struct i2chid_platform_data - used by hid over i2c implementation.
- * @hid_descriptor_address: i2c register where the HID descriptor is stored.
- * @supplies: regulators for powering on the device.
- * @post_power_delay_ms: delay after powering on before device is usable.
- *
- * Note that it is the responsibility of the platform driver (or the acpi 5.0
- * driver, or the flattened device tree) to setup the irq related to the gpio in
- * the struct i2c_board_info.
- * The platform driver should also setup the gpio according to the device:
- *
- * A typical example is the following:
- *	irq = gpio_to_irq(intr_gpio);
- *	hkdk4412_i2c_devs5[0].irq = irq; // store the irq in i2c_board_info
- *	gpio_request(intr_gpio, "elan-irq");
- *	s3c_gpio_setpull(intr_gpio, S3C_GPIO_PULL_UP);
- */
-struct i2c_hid_platform_data {
-	u16 hid_descriptor_address;
-	struct regulator_bulk_data supplies[2];
-	int post_power_delay_ms;
-};
-
-#endif /* __LINUX_I2C_HID_H */
-- 
cgit v1.2.3


From f2fc9ff28d1c9bef7760516feadd38164044caae Mon Sep 17 00:00:00 2001
From: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Date: Wed, 13 Jan 2021 18:52:46 -0800
Subject: usb: ch9: Add USB 3.2 SSP attributes

In preparation for USB 3.2 dual-lane support, add sublink speed
attribute macros and enum usb_ssp_rate. A USB device that operates in
SuperSpeed Plus may operate at different speed and lane count. These
additional macros and enum values help specifying that.

Signed-off-by: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Link: https://lore.kernel.org/r/ae9293ebd63a29f2a2035054753534d9eb123d74.1610592135.git.Thinh.Nguyen@synopsys.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/ch9.h      |  9 +++++++++
 include/uapi/linux/usb/ch9.h | 13 +++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h
index 604c6c514a50..86c50907634e 100644
--- a/include/linux/usb/ch9.h
+++ b/include/linux/usb/ch9.h
@@ -36,6 +36,15 @@
 #include <linux/device.h>
 #include <uapi/linux/usb/ch9.h>
 
+/* USB 3.2 SuperSpeed Plus phy signaling rate generation and lane count */
+
+enum usb_ssp_rate {
+	USB_SSP_GEN_UNKNOWN = 0,
+	USB_SSP_GEN_2x1,
+	USB_SSP_GEN_1x2,
+	USB_SSP_GEN_2x2,
+};
+
 /**
  * usb_ep_type_string() - Returns human readable-name of the endpoint type.
  * @ep_type: The endpoint type to return human-readable name for.  If it's not
diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index 0f865ae4ba89..17ce56198c9a 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -968,9 +968,22 @@ struct usb_ssp_cap_descriptor {
 	__le32 bmSublinkSpeedAttr[1]; /* list of sublink speed attrib entries */
 #define USB_SSP_SUBLINK_SPEED_SSID	(0xf)		/* sublink speed ID */
 #define USB_SSP_SUBLINK_SPEED_LSE	(0x3 << 4)	/* Lanespeed exponent */
+#define USB_SSP_SUBLINK_SPEED_LSE_BPS		0
+#define USB_SSP_SUBLINK_SPEED_LSE_KBPS		1
+#define USB_SSP_SUBLINK_SPEED_LSE_MBPS		2
+#define USB_SSP_SUBLINK_SPEED_LSE_GBPS		3
+
 #define USB_SSP_SUBLINK_SPEED_ST	(0x3 << 6)	/* Sublink type */
+#define USB_SSP_SUBLINK_SPEED_ST_SYM_RX		0
+#define USB_SSP_SUBLINK_SPEED_ST_ASYM_RX	1
+#define USB_SSP_SUBLINK_SPEED_ST_SYM_TX		2
+#define USB_SSP_SUBLINK_SPEED_ST_ASYM_TX	3
+
 #define USB_SSP_SUBLINK_SPEED_RSVD	(0x3f << 8)	/* Reserved */
 #define USB_SSP_SUBLINK_SPEED_LP	(0x3 << 14)	/* Link protocol */
+#define USB_SSP_SUBLINK_SPEED_LP_SS		0
+#define USB_SSP_SUBLINK_SPEED_LP_SSP		1
+
 #define USB_SSP_SUBLINK_SPEED_LSM	(0xff << 16)	/* Lanespeed mantissa */
 } __attribute__((packed));
 
-- 
cgit v1.2.3


From db615c6264cffcd8f117c5628db1794afbd0f254 Mon Sep 17 00:00:00 2001
From: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Date: Wed, 13 Jan 2021 18:53:00 -0800
Subject: usb: gadget: Introduce SSP rates and lanes

A USB device controller operating in SuperSpeed Plus may support gen2x1,
gen1x2, and/or gen2x2. Introduce SuperSpeed Plus signaling rate
generation and lane count to usb_gadget with the fields ssp_rate and
max_ssp_rate. The gadget driver can use these to setup the device BOS
descriptor and select the desire operating speed and number of lanes.

Signed-off-by: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Link: https://lore.kernel.org/r/b6d2196dcc3c73747f91abf9a082b20bbe276cc4.1610592135.git.Thinh.Nguyen@synopsys.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/gadget.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index e7351d64f11f..02483c862444 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -339,6 +339,10 @@ struct usb_gadget_ops {
  * @speed: Speed of current connection to USB host.
  * @max_speed: Maximal speed the UDC can handle.  UDC must support this
  *      and all slower speeds.
+ * @ssp_rate: Current connected SuperSpeed Plus signaling rate and lane count.
+ * @max_ssp_rate: Maximum SuperSpeed Plus signaling rate and lane count the UDC
+ *	can handle. The UDC must support this and all slower speeds and lower
+ *	number of lanes.
  * @state: the state we are now (attached, suspended, configured, etc)
  * @name: Identifies the controller hardware type.  Used in diagnostics
  *	and sometimes configuration.
@@ -406,6 +410,11 @@ struct usb_gadget {
 	struct list_head		ep_list;	/* of usb_ep */
 	enum usb_device_speed		speed;
 	enum usb_device_speed		max_speed;
+
+	/* USB SuperSpeed Plus only */
+	enum usb_ssp_rate		ssp_rate;
+	enum usb_ssp_rate		max_ssp_rate;
+
 	enum usb_device_state		state;
 	const char			*name;
 	struct device			dev;
-- 
cgit v1.2.3


From ead4c124852e66b6aa033e34cf9c4f08d40aeffc Mon Sep 17 00:00:00 2001
From: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Date: Wed, 13 Jan 2021 18:53:07 -0800
Subject: usb: gadget: Introduce udc_set_ssp_rate() for SSP

A SuperSpeed Plus device may operate at different speed and lane count
(i.e. gen2x2, gen1x2, or gen2x1). Introduce gadget ops
udc_set_ssp_rate() to set the desire corresponding usb_ssp_rate for
SuperSpeed Plus capable devices.

If the USB device supports different speeds at SuperSpeed Plus, set the
device to operate with the maximum number of lanes and speed.

Signed-off-by: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Link: https://lore.kernel.org/r/9b85357cdadc02e3f0d653fd05f89eb46af836e1.1610592135.git.Thinh.Nguyen@synopsys.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/gadget/udc/core.c | 16 +++++++++++-----
 include/linux/usb/gadget.h    |  2 ++
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 98cf9216f3cb..4173acdcd35b 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -1133,12 +1133,18 @@ static inline void usb_gadget_udc_stop(struct usb_udc *udc)
 static inline void usb_gadget_udc_set_speed(struct usb_udc *udc,
 					    enum usb_device_speed speed)
 {
-	if (udc->gadget->ops->udc_set_speed) {
-		enum usb_device_speed s;
+	struct usb_gadget *gadget = udc->gadget;
+	enum usb_device_speed s;
 
-		s = min(speed, udc->gadget->max_speed);
-		udc->gadget->ops->udc_set_speed(udc->gadget, s);
-	}
+	if (speed == USB_SPEED_UNKNOWN)
+		s = gadget->max_speed;
+	else
+		s = min(speed, gadget->max_speed);
+
+	if (s == USB_SPEED_SUPER_PLUS && gadget->ops->udc_set_ssp_rate)
+		gadget->ops->udc_set_ssp_rate(gadget, gadget->max_ssp_rate);
+	else if (gadget->ops->udc_set_speed)
+		gadget->ops->udc_set_speed(gadget, s);
 }
 
 /**
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 02483c862444..ee04ef214ce8 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -323,6 +323,8 @@ struct usb_gadget_ops {
 			struct usb_gadget_driver *);
 	int	(*udc_stop)(struct usb_gadget *);
 	void	(*udc_set_speed)(struct usb_gadget *, enum usb_device_speed);
+	void	(*udc_set_ssp_rate)(struct usb_gadget *gadget,
+			enum usb_ssp_rate rate);
 	struct usb_ep *(*match_ep)(struct usb_gadget *,
 			struct usb_endpoint_descriptor *,
 			struct usb_ss_ep_comp_descriptor *);
-- 
cgit v1.2.3


From 0627cc334d408953c0ff9ef8731325c2d02572ed Mon Sep 17 00:00:00 2001
From: Yue Zou <zouyue3@huawei.com>
Date: Mon, 18 Jan 2021 01:01:37 +0000
Subject: sony-laptop: Remove unneeded semicolon

Remove a superfluous semicolon after function definition.

Signed-off-by: Yue Zou <zouyue3@huawei.com>
Link: https://lore.kernel.org/r/20210118010137.214378-1-zouyue3@huawei.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/sony-laptop.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sony-laptop.h b/include/linux/sony-laptop.h
index 374d0fdb0743..1e3c92feea6e 100644
--- a/include/linux/sony-laptop.h
+++ b/include/linux/sony-laptop.h
@@ -31,7 +31,7 @@
 #if IS_ENABLED(CONFIG_SONY_LAPTOP)
 int sony_pic_camera_command(int command, u8 value);
 #else
-static inline int sony_pic_camera_command(int command, u8 value) { return 0; };
+static inline int sony_pic_camera_command(int command, u8 value) { return 0; }
 #endif
 
 #endif	/* __KERNEL__ */
-- 
cgit v1.2.3


From 719a402cf60311b1cdff3f6320abaecdcc5e46b7 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Sun, 17 Jan 2021 16:59:42 +0200
Subject: net: netdevice: Add operation ndo_sk_get_lower_dev

ndo_sk_get_lower_dev returns the lower netdev that corresponds to
a given socket.
Additionally, we implement a helper netdev_sk_get_lowest_dev() to get
the lowest one in chain.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Boris Pismenny <borisp@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h |  4 ++++
 net/core/dev.c            | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5b949076ed23..02dcef4d66e2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1398,6 +1398,8 @@ struct net_device_ops {
 	struct net_device*	(*ndo_get_xmit_slave)(struct net_device *dev,
 						      struct sk_buff *skb,
 						      bool all_slaves);
+	struct net_device*	(*ndo_sk_get_lower_dev)(struct net_device *dev,
+							struct sock *sk);
 	netdev_features_t	(*ndo_fix_features)(struct net_device *dev,
 						    netdev_features_t features);
 	int			(*ndo_set_features)(struct net_device *dev,
@@ -2858,6 +2860,8 @@ int init_dummy_netdev(struct net_device *dev);
 struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 					 struct sk_buff *skb,
 					 bool all_slaves);
+struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
+					    struct sock *sk);
 struct net_device *dev_get_by_index(struct net *net, int ifindex);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
diff --git a/net/core/dev.c b/net/core/dev.c
index bae35c1ae192..6b90520a01b1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8105,6 +8105,39 @@ struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_get_xmit_slave);
 
+static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
+						  struct sock *sk)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!ops->ndo_sk_get_lower_dev)
+		return NULL;
+	return ops->ndo_sk_get_lower_dev(dev, sk);
+}
+
+/**
+ * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
+ * @dev: device
+ * @sk: the socket
+ *
+ * %NULL is returned if no lower device is found.
+ */
+
+struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
+					    struct sock *sk)
+{
+	struct net_device *lower;
+
+	lower = netdev_sk_get_lower_dev(dev, sk);
+	while (lower) {
+		dev = lower;
+		lower = netdev_sk_get_lower_dev(dev, sk);
+	}
+
+	return dev;
+}
+EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
+
 static void netdev_adjacent_add_links(struct net_device *dev)
 {
 	struct netdev_adjacent *iter;
-- 
cgit v1.2.3


From ab0da5a57188b80d16d3222236c4e184953a5bb4 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Wed, 30 Dec 2020 15:01:20 +0200
Subject: net/mlx5: Expose ifc bits for query modify header

Expose ifc bits for query_modify_header_context_in to be used by DEVX.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 442c0160caab..cf692fc17f41 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5904,6 +5904,18 @@ struct mlx5_ifc_dealloc_modify_header_context_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_query_modify_header_context_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         modify_header_id[0x20];
+
+	u8         reserved_at_60[0xa0];
+};
+
 struct mlx5_ifc_query_dct_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
-- 
cgit v1.2.3


From f4d3bd8be4f2bc43e4b13490cbc9969d15c2f058 Mon Sep 17 00:00:00 2001
From: Yu-Hsuan Hsu <yuhsuan@chromium.org>
Date: Fri, 15 Jan 2021 15:53:00 +0800
Subject: cros_ec_commands: Add EC_CODEC_I2S_RX_RESET

Add the new command EC_CODEC_I2S_RX_RESET in ec_codec_i2s_rx_subcmd,
which is used for resetting the EC codec.

Signed-off-by: Yu-Hsuan Hsu <yuhsuan@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Link: https://lore.kernel.org/r/20210115075301.47995-1-yuhsuan@chromium.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/platform_data/cros_ec_commands.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 86376779ab31..95889ada83a3 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4600,6 +4600,7 @@ enum ec_codec_i2s_rx_subcmd {
 	EC_CODEC_I2S_RX_SET_SAMPLE_DEPTH = 0x2,
 	EC_CODEC_I2S_RX_SET_DAIFMT = 0x3,
 	EC_CODEC_I2S_RX_SET_BCLK = 0x4,
+	EC_CODEC_I2S_RX_RESET = 0x5,
 	EC_CODEC_I2S_RX_SUBCMD_COUNT,
 };
 
-- 
cgit v1.2.3


From 93e86295f5e9238779096fa599c3804a08e25bd1 Mon Sep 17 00:00:00 2001
From: Menglong Dong <dong.menglong@zte.com.cn>
Date: Mon, 18 Jan 2021 00:04:55 -0800
Subject: workqueue: fix annotation for WQ_SYSFS

'wq_sysfs_register()' in annotation for 'WQ_SYSFS' is unavailable,
change it to 'workqueue_sysfs_register()'.

Signed-off-by: Menglong Dong <dong.menglong@zte.com.cn>
Reviewed-by: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 26de0cae2a0a..d15a7730ee18 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -311,7 +311,7 @@ enum {
 	WQ_MEM_RECLAIM		= 1 << 3, /* may be used for memory reclaim */
 	WQ_HIGHPRI		= 1 << 4, /* high priority */
 	WQ_CPU_INTENSIVE	= 1 << 5, /* cpu intensive workqueue */
-	WQ_SYSFS		= 1 << 6, /* visible in sysfs, see wq_sysfs_register() */
+	WQ_SYSFS		= 1 << 6, /* visible in sysfs, see workqueue_sysfs_register() */
 
 	/*
 	 * Per-cpu workqueues are generally preferred because they tend to
-- 
cgit v1.2.3


From 2f196059864fb0fe8f60c14a2cb214055b283e08 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 2 Nov 2020 17:11:49 +0100
Subject: efi/libstub: whitespace cleanup

Trivial whitespace cleanup.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 0c31af36697c..2537a246c2d6 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -29,10 +29,10 @@
 #include <asm/page.h>
 
 #define EFI_SUCCESS		0
-#define EFI_LOAD_ERROR          ( 1 | (1UL << (BITS_PER_LONG-1)))
+#define EFI_LOAD_ERROR		( 1 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_INVALID_PARAMETER	( 2 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_UNSUPPORTED		( 3 | (1UL << (BITS_PER_LONG-1)))
-#define EFI_BAD_BUFFER_SIZE     ( 4 | (1UL << (BITS_PER_LONG-1)))
+#define EFI_BAD_BUFFER_SIZE	( 4 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_BUFFER_TOO_SMALL	( 5 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_NOT_READY		( 6 | (1UL << (BITS_PER_LONG-1)))
 #define EFI_DEVICE_ERROR	( 7 | (1UL << (BITS_PER_LONG-1)))
-- 
cgit v1.2.3


From 3820749ddcee694abfd5ae6cabc18aaab11eab34 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 2 Nov 2020 11:51:10 +0100
Subject: efi/libstub: move TPM related prototypes into efistub.h

Move TPM related definitions that are only used in the EFI stub into
efistub.h, which is a local header.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/libstub/efistub.h | 9 +++++++++
 include/linux/efi.h                    | 9 ---------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index 2b7438ba1fbc..cde0a2ef507d 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -849,4 +849,13 @@ void efi_handle_post_ebs_state(void);
 
 enum efi_secureboot_mode efi_get_secureboot(void);
 
+#ifdef CONFIG_RESET_ATTACK_MITIGATION
+void efi_enable_reset_attack_mitigation(void);
+#else
+static inline void
+efi_enable_reset_attack_mitigation(void) { }
+#endif
+
+void efi_retrieve_tpm2_eventlog(void);
+
 #endif
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 2537a246c2d6..8710f5710c1d 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1104,13 +1104,6 @@ enum efi_secureboot_mode efi_get_secureboot_mode(efi_get_variable_t *get_var)
 	return efi_secureboot_mode_enabled;
 }
 
-#ifdef CONFIG_RESET_ATTACK_MITIGATION
-void efi_enable_reset_attack_mitigation(void);
-#else
-static inline void
-efi_enable_reset_attack_mitigation(void) { }
-#endif
-
 #ifdef CONFIG_EFI_EMBEDDED_FIRMWARE
 void efi_check_for_embedded_firmwares(void);
 #else
@@ -1119,8 +1112,6 @@ static inline void efi_check_for_embedded_firmwares(void) { }
 
 efi_status_t efi_random_get_seed(void);
 
-void efi_retrieve_tpm2_eventlog(void);
-
 /*
  * Arch code can implement the following three template macros, avoiding
  * reptition for the void/non-void return cases of {__,}efi_call_virt():
-- 
cgit v1.2.3


From 7eab14de73a8028f770e703962c5437a2b0dda82 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Sat, 16 Jan 2021 16:13:22 +0000
Subject: mdio, phy: fix -Wshadow warnings triggered by nested container_of()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

container_of() macro hides a local variable '__mptr' inside. This
becomes a problem when several container_of() are nested in each
other within single line or plain macros.
As C preprocessor doesn't support generating random variable names,
the sole solution is to avoid defining macros that consist only of
container_of() calls, or they will self-shadow '__mptr' each time:

In file included from ./include/linux/bitmap.h:10,
                 from drivers/net/phy/phy_device.c:12:
drivers/net/phy/phy_device.c: In function ‘phy_device_release’:
./include/linux/kernel.h:693:8: warning: declaration of ‘__mptr’ shadows a previous local [-Wshadow]
  693 |  void *__mptr = (void *)(ptr);     \
      |        ^~~~~~
./include/linux/phy.h:647:26: note: in expansion of macro ‘container_of’
  647 | #define to_phy_device(d) container_of(to_mdio_device(d), \
      |                          ^~~~~~~~~~~~
./include/linux/mdio.h:52:27: note: in expansion of macro ‘container_of’
   52 | #define to_mdio_device(d) container_of(d, struct mdio_device, dev)
      |                           ^~~~~~~~~~~~
./include/linux/phy.h:647:39: note: in expansion of macro ‘to_mdio_device’
  647 | #define to_phy_device(d) container_of(to_mdio_device(d), \
      |                                       ^~~~~~~~~~~~~~
drivers/net/phy/phy_device.c:217:8: note: in expansion of macro ‘to_phy_device’
  217 |  kfree(to_phy_device(dev));
      |        ^~~~~~~~~~~~~
./include/linux/kernel.h:693:8: note: shadowed declaration is here
  693 |  void *__mptr = (void *)(ptr);     \
      |        ^~~~~~
./include/linux/phy.h:647:26: note: in expansion of macro ‘container_of’
  647 | #define to_phy_device(d) container_of(to_mdio_device(d), \
      |                          ^~~~~~~~~~~~
drivers/net/phy/phy_device.c:217:8: note: in expansion of macro ‘to_phy_device’
  217 |  kfree(to_phy_device(dev));
      |        ^~~~~~~~~~~~~

As they are declared in header files, these warnings are highly
repetitive and very annoying (along with the one from linux/pci.h).

Convert the related macros from linux/{mdio,phy}.h to static inlines
to avoid self-shadowing and potentially improve bug-catching.
No functional changes implied.

Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20210116161246.67075-1-alobakin@pm.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mdio.h | 23 ++++++++++++++++++-----
 include/linux/phy.h  |  7 +++++--
 2 files changed, 23 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index dbd69b3d170b..ffb787d5ebde 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -49,7 +49,11 @@ struct mdio_device {
 	unsigned int reset_assert_delay;
 	unsigned int reset_deassert_delay;
 };
-#define to_mdio_device(d) container_of(d, struct mdio_device, dev)
+
+static inline struct mdio_device *to_mdio_device(const struct device *dev)
+{
+	return container_of(dev, struct mdio_device, dev);
+}
 
 /* struct mdio_driver_common: Common to all MDIO drivers */
 struct mdio_driver_common {
@@ -57,8 +61,12 @@ struct mdio_driver_common {
 	int flags;
 };
 #define MDIO_DEVICE_FLAG_PHY		1
-#define to_mdio_common_driver(d) \
-	container_of(d, struct mdio_driver_common, driver)
+
+static inline struct mdio_driver_common *
+to_mdio_common_driver(const struct device_driver *driver)
+{
+	return container_of(driver, struct mdio_driver_common, driver);
+}
 
 /* struct mdio_driver: Generic MDIO driver */
 struct mdio_driver {
@@ -73,8 +81,13 @@ struct mdio_driver {
 	/* Clears up any memory if needed */
 	void (*remove)(struct mdio_device *mdiodev);
 };
-#define to_mdio_driver(d)						\
-	container_of(to_mdio_common_driver(d), struct mdio_driver, mdiodrv)
+
+static inline struct mdio_driver *
+to_mdio_driver(const struct device_driver *driver)
+{
+	return container_of(to_mdio_common_driver(driver), struct mdio_driver,
+			    mdiodrv);
+}
 
 /* device driver data */
 static inline void mdiodev_set_drvdata(struct mdio_device *mdio, void *data)
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 24fcc6456a9e..bc323fbdd21e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -648,8 +648,11 @@ struct phy_device {
 	const struct macsec_ops *macsec_ops;
 #endif
 };
-#define to_phy_device(d) container_of(to_mdio_device(d), \
-				      struct phy_device, mdio)
+
+static inline struct phy_device *to_phy_device(const struct device *dev)
+{
+	return container_of(to_mdio_device(dev), struct phy_device, mdio);
+}
 
 /**
  * struct phy_tdr_config - Configuration of a TDR raw test
-- 
cgit v1.2.3


From a7d6ba14efb778fc8c65c627a057d5dd29debd04 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 15 Dec 2020 00:38:04 +0100
Subject: thermal/core: Remove the 'forced_passive' option

The code was reorganized in 2012 with the commit 0c01ebbfd3caf1.

The main change is a loop on the trip points array and a unconditional
call to the throttle() ops of the governors for each of them even if
the trip temperature is not reached yet.

With this change, the 'forced_passive' is no longer checked in the
thermal_zone_device_update() function but in the step wise governor's
throttle() callback.

As the force_passive does no belong to the trip point array, the
thermal_zone_device_update() can not compare with the specified
passive temperature, thus does not detect the passive limit has been
crossed. Consequently, throttle() is never called and the
'forced_passive' branch is unreached.

In addition, the default processor cooling device is not automatically
bound to the thermal zone if there is not passive trip point, thus the
'forced_passive' can not operate.

If there is an active trip point, then the throttle function will be
called to mitigate at this temperature and the 'forced_passive' will
override the mitigation of the active trip point in this case but with
the default cooling device bound to the thermal zone, so usually a
fan, and that is not a passive cooling effect.

Given the regression exists since more than 8 years, nobody complained
and at the best of my knowledge there is no bug open in
https://bugzilla.kernel.org, it is reasonable to say it is unused.

Remove the 'forced_passive' related code.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Thara Gopinath <thara.gopinath@linaro.org>
Link: https://lore.kernel.org/r/20201214233811.485669-1-daniel.lezcano@linaro.org
---
 Documentation/driver-api/thermal/sysfs-api.rst | 13 -----
 drivers/thermal/gov_step_wise.c                | 14 +----
 drivers/thermal/thermal_sysfs.c                | 80 --------------------------
 include/linux/thermal.h                        |  4 --
 4 files changed, 3 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/thermal/sysfs-api.rst b/Documentation/driver-api/thermal/sysfs-api.rst
index e7520cb439ac..a4969c474cc3 100644
--- a/Documentation/driver-api/thermal/sysfs-api.rst
+++ b/Documentation/driver-api/thermal/sysfs-api.rst
@@ -520,19 +520,6 @@ available_policies
 
 	RW, Optional
 
-passive
-	Attribute is only present for zones in which the passive cooling
-	policy is not supported by native thermal driver. Default is zero
-	and can be set to a temperature (in millidegrees) to enable a
-	passive trip point for the zone. Activation is done by polling with
-	an interval of 1 second.
-
-	Unit: millidegrees Celsius
-
-	Valid values: 0 (disabled) or greater than 1000
-
-	RW, Optional
-
 emul_temp
 	Interface to set the emulated temperature method in thermal zone
 	(sensor). After setting this temperature, the thermal zone may pass
diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c
index 2ae7198d3067..12acb12aac50 100644
--- a/drivers/thermal/gov_step_wise.c
+++ b/drivers/thermal/gov_step_wise.c
@@ -109,7 +109,7 @@ static void update_passive_instance(struct thermal_zone_device *tz,
 	 * If value is +1, activate a passive instance.
 	 * If value is -1, deactivate a passive instance.
 	 */
-	if (type == THERMAL_TRIP_PASSIVE || type == THERMAL_TRIPS_NONE)
+	if (type == THERMAL_TRIP_PASSIVE)
 		tz->passive += value;
 }
 
@@ -122,13 +122,8 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 	bool throttle = false;
 	int old_target;
 
-	if (trip == THERMAL_TRIPS_NONE) {
-		trip_temp = tz->forced_passive;
-		trip_type = THERMAL_TRIPS_NONE;
-	} else {
-		tz->ops->get_trip_temp(tz, trip, &trip_temp);
-		tz->ops->get_trip_type(tz, trip, &trip_type);
-	}
+	tz->ops->get_trip_temp(tz, trip, &trip_temp);
+	tz->ops->get_trip_type(tz, trip, &trip_type);
 
 	trend = get_tz_trend(tz, trip);
 
@@ -189,9 +184,6 @@ static int step_wise_throttle(struct thermal_zone_device *tz, int trip)
 
 	thermal_zone_trip_update(tz, trip);
 
-	if (tz->forced_passive)
-		thermal_zone_trip_update(tz, THERMAL_TRIPS_NONE);
-
 	mutex_lock(&tz->lock);
 
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node)
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index 0866e949339b..4e7f9e880d76 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -216,49 +216,6 @@ trip_point_hyst_show(struct device *dev, struct device_attribute *attr,
 	return ret ? ret : sprintf(buf, "%d\n", temperature);
 }
 
-static ssize_t
-passive_store(struct device *dev, struct device_attribute *attr,
-	      const char *buf, size_t count)
-{
-	struct thermal_zone_device *tz = to_thermal_zone(dev);
-	int state;
-
-	if (sscanf(buf, "%d\n", &state) != 1)
-		return -EINVAL;
-
-	/* sanity check: values below 1000 millicelcius don't make sense
-	 * and can cause the system to go into a thermal heart attack
-	 */
-	if (state && state < 1000)
-		return -EINVAL;
-
-	if (state && !tz->forced_passive) {
-		if (!tz->passive_delay)
-			tz->passive_delay = 1000;
-		thermal_zone_device_rebind_exception(tz, "Processor",
-						     sizeof("Processor"));
-	} else if (!state && tz->forced_passive) {
-		tz->passive_delay = 0;
-		thermal_zone_device_unbind_exception(tz, "Processor",
-						     sizeof("Processor"));
-	}
-
-	tz->forced_passive = state;
-
-	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
-
-	return count;
-}
-
-static ssize_t
-passive_show(struct device *dev, struct device_attribute *attr,
-	     char *buf)
-{
-	struct thermal_zone_device *tz = to_thermal_zone(dev);
-
-	return sprintf(buf, "%d\n", tz->forced_passive);
-}
-
 static ssize_t
 policy_store(struct device *dev, struct device_attribute *attr,
 	     const char *buf, size_t count)
@@ -403,7 +360,6 @@ static DEVICE_ATTR_RW(sustainable_power);
 
 /* These thermal zone device attributes are created based on conditions */
 static DEVICE_ATTR_RW(mode);
-static DEVICE_ATTR_RW(passive);
 
 /* These attributes are unconditionally added to a thermal zone */
 static struct attribute *thermal_zone_dev_attrs[] = {
@@ -438,45 +394,9 @@ static const struct attribute_group thermal_zone_mode_attribute_group = {
 	.attrs = thermal_zone_mode_attrs,
 };
 
-/* We expose passive only if passive trips are present */
-static struct attribute *thermal_zone_passive_attrs[] = {
-	&dev_attr_passive.attr,
-	NULL,
-};
-
-static umode_t thermal_zone_passive_is_visible(struct kobject *kobj,
-					       struct attribute *attr,
-					       int attrno)
-{
-	struct device *dev = kobj_to_dev(kobj);
-	struct thermal_zone_device *tz;
-	enum thermal_trip_type trip_type;
-	int count, passive = 0;
-
-	tz = container_of(dev, struct thermal_zone_device, device);
-
-	for (count = 0; count < tz->trips && !passive; count++) {
-		tz->ops->get_trip_type(tz, count, &trip_type);
-
-		if (trip_type == THERMAL_TRIP_PASSIVE)
-			passive = 1;
-	}
-
-	if (!passive)
-		return attr->mode;
-
-	return 0;
-}
-
-static const struct attribute_group thermal_zone_passive_attribute_group = {
-	.attrs = thermal_zone_passive_attrs,
-	.is_visible = thermal_zone_passive_is_visible,
-};
-
 static const struct attribute_group *thermal_zone_attribute_groups[] = {
 	&thermal_zone_attribute_group,
 	&thermal_zone_mode_attribute_group,
-	&thermal_zone_passive_attribute_group,
 	/* This is not NULL terminated as we create the group dynamically */
 };
 
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index c80032322158..a57232a9a6f9 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -131,9 +131,6 @@ struct thermal_cooling_device {
 			trip point.
  * @prev_high_trip:	the above current temperature if you've crossed a
 			passive trip point.
- * @forced_passive:	If > 0, temperature at which to switch on all ACPI
- *			processor cooling devices.  Currently only used by the
- *			step-wise governor.
  * @need_update:	if equals 1, thermal_zone_device_update needs to be invoked.
  * @ops:	operations this &thermal_zone_device supports
  * @tzp:	thermal zone parameters
@@ -167,7 +164,6 @@ struct thermal_zone_device {
 	int passive;
 	int prev_low_trip;
 	int prev_high_trip;
-	unsigned int forced_passive;
 	atomic_t need_update;
 	struct thermal_zone_device_ops *ops;
 	struct thermal_zone_params *tzp;
-- 
cgit v1.2.3


From 2121496fdc5f2d93eda9743c4b487469f0042e3c Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 15 Dec 2020 00:38:09 +0100
Subject: thermal/core: Remove unused macro THERMAL_TRIPS_NONE

The macro THERMAL_TRIPS_NONE is no longer used, remove it.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Thara Gopinath <thara.gopinath@linaro.org>
Link: https://lore.kernel.org/r/20201214233811.485669-6-daniel.lezcano@linaro.org
---
 include/linux/thermal.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index a57232a9a6f9..060a2160add4 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -17,7 +17,6 @@
 #include <linux/workqueue.h>
 #include <uapi/linux/thermal.h>
 
-#define THERMAL_TRIPS_NONE	-1
 #define THERMAL_MAX_TRIPS	12
 
 /* invalid cooling state */
-- 
cgit v1.2.3


From 17d399cd9c8936909bc8936a5837b6da9af9c29e Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Wed, 16 Dec 2020 23:03:35 +0100
Subject: thermal/core: Precompute the delays from msecs to jiffies

The delays are stored in ms units and when the polling function is
called this delay is converted into jiffies at each call.

Instead of doing the conversion again and again, compute the jiffies
at init time and use the value directly when setting the polling.

Cc: Thara Gopinath <thara.gopinath@linaro.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Thara Gopinath <thara.gopinath@linaro.org>
Link: https://lore.kernel.org/r/20201216220337.839878-1-daniel.lezcano@linaro.org
---
 drivers/thermal/thermal_core.c    | 3 +++
 drivers/thermal/thermal_core.h    | 1 +
 drivers/thermal/thermal_helpers.c | 7 +++++++
 include/linux/thermal.h           | 7 +++++++
 4 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index bcc2ea4f5482..2c41d4a0923f 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1315,6 +1315,9 @@ thermal_zone_device_register(const char *type, int trips, int mask,
 	tz->passive_delay = passive_delay;
 	tz->polling_delay = polling_delay;
 
+	thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
+	thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay);
+
 	/* sys I/F */
 	/* Add nodes that are always present via .groups */
 	result = thermal_zone_create_device_groups(tz, mask);
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index e50c6b2909fe..90f9a80c8b23 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -123,6 +123,7 @@ int thermal_build_list_of_policies(char *buf);
 
 /* Helpers */
 void thermal_zone_set_trips(struct thermal_zone_device *tz);
+void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms);
 
 /* sysfs I/F */
 int thermal_zone_create_device_groups(struct thermal_zone_device *, int);
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index c94bc824e5d3..7f50f412e02a 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -175,6 +175,13 @@ exit:
 	mutex_unlock(&tz->lock);
 }
 
+void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms)
+{
+	*delay_jiffies = msecs_to_jiffies(delay_ms);
+	if (delay_ms > 1000)
+		*delay_jiffies = round_jiffies(*delay_jiffies);
+}
+
 static void thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev,
 				       int target)
 {
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 060a2160add4..d1b82c70de69 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -117,9 +117,14 @@ struct thermal_cooling_device {
  * @trips_disabled;	bitmap for disabled trips
  * @passive_delay:	number of milliseconds to wait between polls when
  *			performing passive cooling.
+ * @passive_delay_jiffies: number of jiffies to wait between polls when
+ *			performing passive cooling.
  * @polling_delay:	number of milliseconds to wait between polls when
  *			checking whether trip points have been crossed (0 for
  *			interrupt driven systems)
+ * @polling_delay_jiffies: number of jiffies to wait between polls when
+ *			checking whether trip points have been crossed (0 for
+ *			interrupt driven systems)
  * @temperature:	current temperature.  This is only for core code,
  *			drivers should use thermal_zone_get_temp() to get the
  *			current temperature
@@ -155,6 +160,8 @@ struct thermal_zone_device {
 	void *devdata;
 	int trips;
 	unsigned long trips_disabled;	/* bitmap for disabled trips */
+	unsigned long passive_delay_jiffies;
+	unsigned long polling_delay_jiffies;
 	int passive_delay;
 	int polling_delay;
 	int temperature;
-- 
cgit v1.2.3


From b39d2dd5b5ed08d15711aefd5afd72bd87387c64 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Wed, 16 Dec 2020 23:03:37 +0100
Subject: thermal/core: Remove ms based delay fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code does no longer use the ms unit based fields to set the
delays as they are replaced by the jiffies.

Remove them and replace their user to use the jiffies version instead.

Cc: Thara Gopinath <thara.gopinath@linaro.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Peter Kästle <peter@piie.net>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20201216220337.839878-3-daniel.lezcano@linaro.org
---
 drivers/platform/x86/acerhdf.c                     | 3 ++-
 drivers/thermal/da9062-thermal.c                   | 4 ++--
 drivers/thermal/gov_power_allocator.c              | 2 +-
 drivers/thermal/thermal_core.c                     | 4 +---
 drivers/thermal/ti-soc-thermal/ti-thermal-common.c | 6 ++++--
 include/linux/thermal.h                            | 7 -------
 6 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c
index b6aa6e5514f4..6b8b3ab8db48 100644
--- a/drivers/platform/x86/acerhdf.c
+++ b/drivers/platform/x86/acerhdf.c
@@ -336,7 +336,8 @@ static void acerhdf_check_param(struct thermal_zone_device *thermal)
 			pr_notice("interval changed to: %d\n", interval);
 
 		if (thermal)
-			thermal->polling_delay = interval*1000;
+			thermal->polling_delay_jiffies =
+				round_jiffies(msecs_to_jiffies(interval * 1000));
 
 		prev_interval = interval;
 	}
diff --git a/drivers/thermal/da9062-thermal.c b/drivers/thermal/da9062-thermal.c
index 4d74994f160a..180edec34e07 100644
--- a/drivers/thermal/da9062-thermal.c
+++ b/drivers/thermal/da9062-thermal.c
@@ -95,7 +95,7 @@ static void da9062_thermal_poll_on(struct work_struct *work)
 		thermal_zone_device_update(thermal->zone,
 					   THERMAL_EVENT_UNSPECIFIED);
 
-		delay = msecs_to_jiffies(thermal->zone->passive_delay);
+		delay = thermal->zone->passive_delay_jiffies;
 		queue_delayed_work(system_freezable_wq, &thermal->work, delay);
 		return;
 	}
@@ -245,7 +245,7 @@ static int da9062_thermal_probe(struct platform_device *pdev)
 
 	dev_dbg(&pdev->dev,
 		"TJUNC temperature polling period set at %d ms\n",
-		thermal->zone->passive_delay);
+		jiffies_to_msecs(thermal->zone->passive_delay_jiffies));
 
 	ret = platform_get_irq_byname(pdev, "THERMAL");
 	if (ret < 0) {
diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c
index 7a4170a0b51f..f8c3d1e40b86 100644
--- a/drivers/thermal/gov_power_allocator.c
+++ b/drivers/thermal/gov_power_allocator.c
@@ -258,7 +258,7 @@ static u32 pid_controller(struct thermal_zone_device *tz,
 	 * power being applied, slowing down the controller)
 	 */
 	d = mul_frac(tz->tzp->k_d, err - params->prev_err);
-	d = div_frac(d, tz->passive_delay);
+	d = div_frac(d, jiffies_to_msecs(tz->passive_delay_jiffies));
 	params->prev_err = err;
 
 	power_range = p + i + d;
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index d96c515af3cb..b2615449b18f 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -313,7 +313,7 @@ static void monitor_thermal_zone(struct thermal_zone_device *tz)
 
 	if (!stop && tz->passive)
 		thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies);
-	else if (!stop && tz->polling_delay)
+	else if (!stop && tz->polling_delay_jiffies)
 		thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies);
 	else
 		thermal_zone_device_set_polling(tz, 0);
@@ -1307,8 +1307,6 @@ thermal_zone_device_register(const char *type, int trips, int mask,
 	tz->device.class = &thermal_class;
 	tz->devdata = devdata;
 	tz->trips = trips;
-	tz->passive_delay = passive_delay;
-	tz->polling_delay = polling_delay;
 
 	thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
 	thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay);
diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
index 2ce4b19f312a..f84375865c97 100644
--- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
@@ -166,6 +166,7 @@ int ti_thermal_expose_sensor(struct ti_bandgap *bgp, int id,
 			     char *domain)
 {
 	struct ti_thermal_data *data;
+	int interval;
 
 	data = ti_bandgap_get_sensor_data(bgp, id);
 
@@ -183,9 +184,10 @@ int ti_thermal_expose_sensor(struct ti_bandgap *bgp, int id,
 		return PTR_ERR(data->ti_thermal);
 	}
 
+	interval = jiffies_to_msecs(data->ti_thermal->polling_delay_jiffies);
+
 	ti_bandgap_set_sensor_data(bgp, id, data);
-	ti_bandgap_write_update_interval(bgp, data->sensor_id,
-					data->ti_thermal->polling_delay);
+	ti_bandgap_write_update_interval(bgp, data->sensor_id, interval);
 
 	return 0;
 }
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index d1b82c70de69..1e686404951b 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -115,13 +115,8 @@ struct thermal_cooling_device {
  * @devdata:	private pointer for device private data
  * @trips:	number of trip points the thermal zone supports
  * @trips_disabled;	bitmap for disabled trips
- * @passive_delay:	number of milliseconds to wait between polls when
- *			performing passive cooling.
  * @passive_delay_jiffies: number of jiffies to wait between polls when
  *			performing passive cooling.
- * @polling_delay:	number of milliseconds to wait between polls when
- *			checking whether trip points have been crossed (0 for
- *			interrupt driven systems)
  * @polling_delay_jiffies: number of jiffies to wait between polls when
  *			checking whether trip points have been crossed (0 for
  *			interrupt driven systems)
@@ -162,8 +157,6 @@ struct thermal_zone_device {
 	unsigned long trips_disabled;	/* bitmap for disabled trips */
 	unsigned long passive_delay_jiffies;
 	unsigned long polling_delay_jiffies;
-	int passive_delay;
-	int polling_delay;
 	int temperature;
 	int last_temperature;
 	int emul_temperature;
-- 
cgit v1.2.3


From 23ff8529ee207c634ce2e170c353938db7aa98a9 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Mon, 18 Jan 2021 18:38:24 +0100
Subject: thermal/core: Make cooling device state change private

The change of the cooling device state should be used by the governor
or at least by the core code, not by the drivers themselves.

Remove the API usage and move the function declaration to the internal
headers.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Zhang Rui <rui.zhang@intel.com>
Link: https://lore.kernel.org/r/20210118173824.9970-1-daniel.lezcano@linaro.org
---
 drivers/hwmon/pwm-fan.c          | 1 -
 drivers/thermal/khadas_mcu_fan.c | 1 -
 drivers/thermal/thermal_core.h   | 2 ++
 include/linux/thermal.h          | 3 ---
 4 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwmon/pwm-fan.c b/drivers/hwmon/pwm-fan.c
index 777439f48c14..c8a15971d704 100644
--- a/drivers/hwmon/pwm-fan.c
+++ b/drivers/hwmon/pwm-fan.c
@@ -422,7 +422,6 @@ static int pwm_fan_probe(struct platform_device *pdev)
 			return ret;
 		}
 		ctx->cdev = cdev;
-		thermal_cdev_update(cdev);
 	}
 
 	return 0;
diff --git a/drivers/thermal/khadas_mcu_fan.c b/drivers/thermal/khadas_mcu_fan.c
index 9eadd2d6413e..d35e5313bea4 100644
--- a/drivers/thermal/khadas_mcu_fan.c
+++ b/drivers/thermal/khadas_mcu_fan.c
@@ -100,7 +100,6 @@ static int khadas_mcu_fan_probe(struct platform_device *pdev)
 		return ret;
 	}
 	ctx->cdev = cdev;
-	thermal_cdev_update(cdev);
 
 	return 0;
 }
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 90f9a80c8b23..86b8cef7310e 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -65,6 +65,8 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev)
 		cdev->ops->power2state;
 }
 
+void thermal_cdev_update(struct thermal_cooling_device *);
+
 /**
  * struct thermal_trip - representation of a point in temperature domain
  * @np: pointer to struct device_node that this trip point was created from
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 1e686404951b..6ac7bb1d2b1f 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -390,7 +390,6 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
 int thermal_zone_get_slope(struct thermal_zone_device *tz);
 int thermal_zone_get_offset(struct thermal_zone_device *tz);
 
-void thermal_cdev_update(struct thermal_cooling_device *);
 void thermal_notify_framework(struct thermal_zone_device *, int);
 int thermal_zone_device_enable(struct thermal_zone_device *tz);
 int thermal_zone_device_disable(struct thermal_zone_device *tz);
@@ -437,8 +436,6 @@ static inline int thermal_zone_get_offset(
 		struct thermal_zone_device *tz)
 { return -ENODEV; }
 
-static inline void thermal_cdev_update(struct thermal_cooling_device *cdev)
-{ }
 static inline void thermal_notify_framework(struct thermal_zone_device *tz,
 	int trip)
 { }
-- 
cgit v1.2.3


From fa82117010430aff2ce86400f7328f55a31b48a6 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 16 Jan 2021 14:13:37 +0800
Subject: net: add inline function skb_csum_is_sctp

This patch is to define a inline function skb_csum_is_sctp(), and
also replace all places where it checks if it's a SCTP CSUM skb.
This function would be used later in many networking drivers in
the following patches.

Suggested-by: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 2 +-
 include/linux/skbuff.h                           | 5 +++++
 net/core/dev.c                                   | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
index ac4cd5d82e69..162a1ff1e9d2 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
@@ -979,7 +979,7 @@ static int ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb)
 		stats->vlan_inserted++;
 	}
 
-	if (skb->csum_not_inet)
+	if (skb_csum_is_sctp(skb))
 		stats->crc32_csum++;
 	else
 		stats->csum++;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c9568cf60c2a..46f901adf1a8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4621,6 +4621,11 @@ static inline void skb_reset_redirect(struct sk_buff *skb)
 #endif
 }
 
+static inline bool skb_csum_is_sctp(struct sk_buff *skb)
+{
+	return skb->csum_not_inet;
+}
+
 static inline void skb_set_kcov_handle(struct sk_buff *skb,
 				       const u64 kcov_handle)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 6b90520a01b1..0332f2e8f7da 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3617,7 +3617,7 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 int skb_csum_hwoffload_help(struct sk_buff *skb,
 			    const netdev_features_t features)
 {
-	if (unlikely(skb->csum_not_inet))
+	if (unlikely(skb_csum_is_sctp(skb)))
 		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 			skb_crc32c_csum_help(skb);
 
-- 
cgit v1.2.3


From de641d74fb00f5b32f054ee154e31fb037e0db88 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Sun, 17 Jan 2021 11:26:33 +0200
Subject: Revert "RDMA/mlx5: Fix devlink deadlock on net namespace deletion"

This reverts commit fbdd0049d98d44914fc57d4b91f867f4996c787b.

Due to commit in fixes tag, netdevice events were received only in one net
namespace of mlx5_core_dev. Due to this when netdevice events arrive in
net namespace other than net namespace of mlx5_core_dev, they are missed.

This results in empty GID table due to RDMA device being detached from its
net device.

Hence, revert back to receive netdevice events in all net namespaces to
restore back RDMA functionality in non init_net net namespace. The
deadlock will have to be addressed in another patch.

Fixes: fbdd0049d98d ("RDMA/mlx5: Fix devlink deadlock on net namespace deletion")
Link: https://lore.kernel.org/r/20210117092633.10690-1-leon@kernel.org
Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/mlx5/main.c                  |  6 ++----
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h |  5 +++++
 include/linux/mlx5/driver.h                        | 18 ------------------
 3 files changed, 7 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index d26f3f3e0462..aabdc07e4753 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3311,8 +3311,7 @@ static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
 	int err;
 
 	dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event;
-	err = register_netdevice_notifier_net(mlx5_core_net(dev->mdev),
-					      &dev->port[port_num].roce.nb);
+	err = register_netdevice_notifier(&dev->port[port_num].roce.nb);
 	if (err) {
 		dev->port[port_num].roce.nb.notifier_call = NULL;
 		return err;
@@ -3324,8 +3323,7 @@ static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
 static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
 {
 	if (dev->port[port_num].roce.nb.notifier_call) {
-		unregister_netdevice_notifier_net(mlx5_core_net(dev->mdev),
-						  &dev->port[port_num].roce.nb);
+		unregister_netdevice_notifier(&dev->port[port_num].roce.nb);
 		dev->port[port_num].roce.nb.notifier_call = NULL;
 	}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index 3a9fa629503f..d046db7bb047 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -90,4 +90,9 @@ int mlx5_create_encryption_key(struct mlx5_core_dev *mdev,
 			       u32 key_type, u32 *p_key_id);
 void mlx5_destroy_encryption_key(struct mlx5_core_dev *mdev, u32 key_id);
 
+static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev)
+{
+	return devlink_net(priv_to_devlink(dev));
+}
+
 #endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f93bfe7473aa..4672e12f1aa5 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1209,22 +1209,4 @@ static inline bool mlx5_is_roce_enabled(struct mlx5_core_dev *dev)
 	return val.vbool;
 }
 
-/**
- * mlx5_core_net - Provide net namespace of the mlx5_core_dev
- * @dev: mlx5 core device
- *
- * mlx5_core_net() returns the net namespace of mlx5 core device.
- * This can be called only in below described limited context.
- * (a) When a devlink instance for mlx5_core is registered and
- *     when devlink reload operation is disabled.
- *     or
- * (b) during devlink reload reload_down() and reload_up callbacks
- *     where it is ensured that devlink instance's net namespace is
- *     stable.
- */
-static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev)
-{
-	return devlink_net(priv_to_devlink(dev));
-}
-
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3


From 7b8fc0103bb51d1d3e1fb5fd67958612e709f883 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 18 Jan 2021 20:09:27 -0500
Subject: bonding: add a vlan+srcmac tx hashing option

This comes from an end-user request, where they're running multiple VMs on
hosts with bonded interfaces connected to some interest switch topologies,
where 802.3ad isn't an option. They're currently running a proprietary
solution that effectively achieves load-balancing of VMs and bandwidth
utilization improvements with a similar form of transmission algorithm.

Basically, each VM has it's own vlan, so it always sends its traffic out
the same interface, unless that interface fails. Traffic gets split
between the interfaces, maintaining a consistent path, with failover still
available if an interface goes down.

Unlike bond_eth_hash(), this hash function is using the full source MAC
address instead of just the last byte, as there are so few components to
the hash, and in the no-vlan case, we would be returning just the last
byte of the source MAC as the hash value. It's entirely possible to have
two NICs in a bond with the same last byte of their MAC, but not the same
MAC, so this adjustment should guarantee distinct hashes in all cases.

This has been rudimetarily tested to provide similar results to the
proprietary solution it is aiming to replace. A patch for iproute2 is also
posted, to properly support the new mode there as well.

Cc: Jay Vosburgh <j.vosburgh@gmail.com>
Cc: Veaceslav Falico <vfalico@gmail.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: Thomas Davis <tadavis@lbl.gov>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Link: https://lore.kernel.org/r/20210119010927.1191922-1-jarod@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/bonding.rst | 13 +++++++++++++
 drivers/net/bonding/bond_main.c      | 34 ++++++++++++++++++++++++++++++++--
 drivers/net/bonding/bond_options.c   | 13 +++++++------
 include/linux/netdevice.h            |  1 +
 include/uapi/linux/if_bonding.h      |  1 +
 5 files changed, 54 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
index adc314639085..5f690f0ad0e4 100644
--- a/Documentation/networking/bonding.rst
+++ b/Documentation/networking/bonding.rst
@@ -951,6 +951,19 @@ xmit_hash_policy
 		packets will be distributed according to the encapsulated
 		flows.
 
+	vlan+srcmac
+
+		This policy uses a very rudimentary vlan ID and source mac
+		hash to load-balance traffic per-vlan, with failover
+		should one leg fail. The intended use case is for a bond
+		shared by multiple virtual machines, all configured to
+		use their own vlan, to give lacp-like functionality
+		without requiring lacp-capable switching hardware.
+
+		The formula for the hash is simply
+
+		hash = (vlan ID) XOR (source MAC vendor) XOR (source MAC dev)
+
 	The default value is layer2.  This option was added in bonding
 	version 2.6.3.  In earlier versions of bonding, this parameter
 	does not exist, and the layer2 policy is the only policy.  The
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 539c6bc218df..74cbbb22470b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -167,7 +167,7 @@ module_param(xmit_hash_policy, charp, 0);
 MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
 				   "0 for layer 2 (default), 1 for layer 3+4, "
 				   "2 for layer 2+3, 3 for encap layer 2+3, "
-				   "4 for encap layer 3+4");
+				   "4 for encap layer 3+4, 5 for vlan+srcmac");
 module_param(arp_interval, int, 0);
 MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
 module_param_array(arp_ip_target, charp, NULL, 0);
@@ -1457,6 +1457,8 @@ static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
 		return NETDEV_LAG_HASH_E23;
 	case BOND_XMIT_POLICY_ENCAP34:
 		return NETDEV_LAG_HASH_E34;
+	case BOND_XMIT_POLICY_VLAN_SRCMAC:
+		return NETDEV_LAG_HASH_VLAN_SRCMAC;
 	default:
 		return NETDEV_LAG_HASH_UNKNOWN;
 	}
@@ -3519,6 +3521,27 @@ static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk,
 	return true;
 }
 
+static u32 bond_vlan_srcmac_hash(struct sk_buff *skb)
+{
+	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
+	u32 srcmac_vendor = 0, srcmac_dev = 0;
+	u16 vlan;
+	int i;
+
+	for (i = 0; i < 3; i++)
+		srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i];
+
+	for (i = 3; i < ETH_ALEN; i++)
+		srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i];
+
+	if (!skb_vlan_tag_present(skb))
+		return srcmac_vendor ^ srcmac_dev;
+
+	vlan = skb_vlan_tag_get(skb);
+
+	return vlan ^ srcmac_vendor ^ srcmac_dev;
+}
+
 /* Extract the appropriate headers based on bond's xmit policy */
 static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 			      struct flow_keys *fk)
@@ -3526,10 +3549,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
 	int noff, proto = -1;
 
-	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) {
+	switch (bond->params.xmit_policy) {
+	case BOND_XMIT_POLICY_ENCAP23:
+	case BOND_XMIT_POLICY_ENCAP34:
 		memset(fk, 0, sizeof(*fk));
 		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
 					  fk, NULL, 0, 0, 0, 0);
+	default:
+		break;
 	}
 
 	fk->ports.ports = 0;
@@ -3591,6 +3618,9 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 	    skb->l4_hash)
 		return skb->hash;
 
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC)
+		return bond_vlan_srcmac_hash(skb);
+
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
 	    !bond_flow_dissect(bond, skb, &flow))
 		return bond_eth_hash(skb);
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 8fcbf7f9c7b2..77d7c38bd435 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -96,12 +96,13 @@ static const struct bond_opt_value bond_pps_tbl[] = {
 };
 
 static const struct bond_opt_value bond_xmit_hashtype_tbl[] = {
-	{ "layer2",   BOND_XMIT_POLICY_LAYER2, BOND_VALFLAG_DEFAULT},
-	{ "layer3+4", BOND_XMIT_POLICY_LAYER34, 0},
-	{ "layer2+3", BOND_XMIT_POLICY_LAYER23, 0},
-	{ "encap2+3", BOND_XMIT_POLICY_ENCAP23, 0},
-	{ "encap3+4", BOND_XMIT_POLICY_ENCAP34, 0},
-	{ NULL,       -1,                       0},
+	{ "layer2",      BOND_XMIT_POLICY_LAYER2,      BOND_VALFLAG_DEFAULT},
+	{ "layer3+4",    BOND_XMIT_POLICY_LAYER34,     0},
+	{ "layer2+3",    BOND_XMIT_POLICY_LAYER23,     0},
+	{ "encap2+3",    BOND_XMIT_POLICY_ENCAP23,     0},
+	{ "encap3+4",    BOND_XMIT_POLICY_ENCAP34,     0},
+	{ "vlan+srcmac", BOND_XMIT_POLICY_VLAN_SRCMAC, 0},
+	{ NULL,          -1,                           0},
 };
 
 static const struct bond_opt_value bond_arp_validate_tbl[] = {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 02dcef4d66e2..ef517254367d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2617,6 +2617,7 @@ enum netdev_lag_hash {
 	NETDEV_LAG_HASH_L23,
 	NETDEV_LAG_HASH_E23,
 	NETDEV_LAG_HASH_E34,
+	NETDEV_LAG_HASH_VLAN_SRCMAC,
 	NETDEV_LAG_HASH_UNKNOWN,
 };
 
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index 45f3750aa861..e8eb4ad03cf1 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -94,6 +94,7 @@
 #define BOND_XMIT_POLICY_LAYER23	2 /* layer 2+3 (IP ^ MAC) */
 #define BOND_XMIT_POLICY_ENCAP23	3 /* encapsulated layer 2+3 */
 #define BOND_XMIT_POLICY_ENCAP34	4 /* encapsulated layer 3+4 */
+#define BOND_XMIT_POLICY_VLAN_SRCMAC	5 /* vlan + source MAC */
 
 /* 802.3ad port state definitions (43.4.2.2 in the 802.3ad standard) */
 #define LACP_STATE_LACP_ACTIVITY   0x1
-- 
cgit v1.2.3


From 67c6bb56b649590a3f59c2a92331aa4e83d4534c Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 6 Jan 2021 10:34:49 +0000
Subject: firmware: smccc: Add SMCCC TRNG function call IDs

The ARM architected TRNG firmware interface, described in ARM spec
DEN0098, define an ARM SMCCC based interface to a true random number
generator, provided by firmware.

Add the definitions of the SMCCC functions as defined by the spec.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/20210106103453.152275-2-andre.przywara@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/arm-smccc.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index f860645f6512..62c54234576c 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -102,6 +102,37 @@
 			   ARM_SMCCC_OWNER_STANDARD_HYP,	\
 			   0x21)
 
+/* TRNG entropy source calls (defined by ARM DEN0098) */
+#define ARM_SMCCC_TRNG_VERSION					\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
+			   ARM_SMCCC_SMC_32,			\
+			   ARM_SMCCC_OWNER_STANDARD,		\
+			   0x50)
+
+#define ARM_SMCCC_TRNG_FEATURES					\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
+			   ARM_SMCCC_SMC_32,			\
+			   ARM_SMCCC_OWNER_STANDARD,		\
+			   0x51)
+
+#define ARM_SMCCC_TRNG_GET_UUID					\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
+			   ARM_SMCCC_SMC_32,			\
+			   ARM_SMCCC_OWNER_STANDARD,		\
+			   0x52)
+
+#define ARM_SMCCC_TRNG_RND32					\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
+			   ARM_SMCCC_SMC_32,			\
+			   ARM_SMCCC_OWNER_STANDARD,		\
+			   0x53)
+
+#define ARM_SMCCC_TRNG_RND64					\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
+			   ARM_SMCCC_SMC_64,			\
+			   ARM_SMCCC_OWNER_STANDARD,		\
+			   0x53)
+
 /*
  * Return codes defined in ARM DEN 0070A
  * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C
-- 
cgit v1.2.3


From 93cc26fa8f37fbd320f36525bfedd4b3e2b3e2ba Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 18 Jan 2021 14:23:22 +0100
Subject: backlight: lms283gf05: Convert to GPIO descriptors

This converts the lms283gf05 backlight driver to use GPIO
descriptors and switches the single PXA Palm Z2 device
over to defining these.

Since the platform data was only used to convey GPIO
information we can delete the platform data header.

Notice that we define the proper active low semantics in
the board file GPIO descriptor table (active low) and
assert the reset line by bringing it to "1" (asserted).

Cc: Marek Vasut <marex@denx.de>
Cc: Haojian Zhuang <haojian.zhuang@gmail.com>
Cc: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Daniel Mack <daniel@zonque.org>
Acked-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 arch/arm/mach-pxa/z2.c               | 12 ++++++----
 drivers/video/backlight/lms283gf05.c | 43 ++++++++++++++----------------------
 include/linux/spi/lms283gf05.h       | 16 --------------
 3 files changed, 25 insertions(+), 46 deletions(-)
 delete mode 100644 include/linux/spi/lms283gf05.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/z2.c b/arch/arm/mach-pxa/z2.c
index 21fd76bb09cd..89eb5243c85f 100644
--- a/arch/arm/mach-pxa/z2.c
+++ b/arch/arm/mach-pxa/z2.c
@@ -20,7 +20,6 @@
 #include <linux/spi/spi.h>
 #include <linux/spi/pxa2xx_spi.h>
 #include <linux/spi/libertas_spi.h>
-#include <linux/spi/lms283gf05.h>
 #include <linux/power_supply.h>
 #include <linux/mtd/physmap.h>
 #include <linux/gpio.h>
@@ -578,8 +577,13 @@ static struct pxa2xx_spi_chip lms283_chip_info = {
 	.gpio_cs	= GPIO88_ZIPITZ2_LCD_CS,
 };
 
-static const struct lms283gf05_pdata lms283_pdata = {
-	.reset_gpio	= GPIO19_ZIPITZ2_LCD_RESET,
+static struct gpiod_lookup_table lms283_gpio_table = {
+	.dev_id = "spi2.0", /* SPI bus 2 chip select 0 */
+	.table = {
+		GPIO_LOOKUP("gpio-pxa", GPIO19_ZIPITZ2_LCD_RESET,
+			    "reset", GPIO_ACTIVE_LOW),
+		{ },
+	},
 };
 
 static struct spi_board_info spi_board_info[] __initdata = {
@@ -595,7 +599,6 @@ static struct spi_board_info spi_board_info[] __initdata = {
 {
 	.modalias		= "lms283gf05",
 	.controller_data	= &lms283_chip_info,
-	.platform_data		= &lms283_pdata,
 	.max_speed_hz		= 400000,
 	.bus_num		= 2,
 	.chip_select		= 0,
@@ -615,6 +618,7 @@ static void __init z2_spi_init(void)
 {
 	pxa2xx_set_spi_info(1, &pxa_ssp1_master_info);
 	pxa2xx_set_spi_info(2, &pxa_ssp2_master_info);
+	gpiod_add_lookup_table(&lms283_gpio_table);
 	spi_register_board_info(spi_board_info, ARRAY_SIZE(spi_board_info));
 }
 #else
diff --git a/drivers/video/backlight/lms283gf05.c b/drivers/video/backlight/lms283gf05.c
index 0e45685bcc1c..36856962ed83 100644
--- a/drivers/video/backlight/lms283gf05.c
+++ b/drivers/video/backlight/lms283gf05.c
@@ -9,16 +9,16 @@
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/lcd.h>
 
 #include <linux/spi/spi.h>
-#include <linux/spi/lms283gf05.h>
 #include <linux/module.h>
 
 struct lms283gf05_state {
 	struct spi_device	*spi;
 	struct lcd_device	*ld;
+	struct gpio_desc	*reset;
 };
 
 struct lms283gf05_seq {
@@ -90,13 +90,13 @@ static const struct lms283gf05_seq disp_pdwnseq[] = {
 };
 
 
-static void lms283gf05_reset(unsigned long gpio, bool inverted)
+static void lms283gf05_reset(struct gpio_desc *gpiod)
 {
-	gpio_set_value(gpio, !inverted);
+	gpiod_set_value(gpiod, 0); /* De-asserted */
 	mdelay(100);
-	gpio_set_value(gpio, inverted);
+	gpiod_set_value(gpiod, 1); /* Asserted */
 	mdelay(20);
-	gpio_set_value(gpio, !inverted);
+	gpiod_set_value(gpiod, 0); /* De-asserted */
 	mdelay(20);
 }
 
@@ -125,18 +125,15 @@ static int lms283gf05_power_set(struct lcd_device *ld, int power)
 {
 	struct lms283gf05_state *st = lcd_get_data(ld);
 	struct spi_device *spi = st->spi;
-	struct lms283gf05_pdata *pdata = dev_get_platdata(&spi->dev);
 
 	if (power <= FB_BLANK_NORMAL) {
-		if (pdata)
-			lms283gf05_reset(pdata->reset_gpio,
-					pdata->reset_inverted);
+		if (st->reset)
+			lms283gf05_reset(st->reset);
 		lms283gf05_toggle(spi, disp_initseq, ARRAY_SIZE(disp_initseq));
 	} else {
 		lms283gf05_toggle(spi, disp_pdwnseq, ARRAY_SIZE(disp_pdwnseq));
-		if (pdata)
-			gpio_set_value(pdata->reset_gpio,
-					pdata->reset_inverted);
+		if (st->reset)
+			gpiod_set_value(st->reset, 1); /* Asserted */
 	}
 
 	return 0;
@@ -150,24 +147,18 @@ static struct lcd_ops lms_ops = {
 static int lms283gf05_probe(struct spi_device *spi)
 {
 	struct lms283gf05_state *st;
-	struct lms283gf05_pdata *pdata = dev_get_platdata(&spi->dev);
 	struct lcd_device *ld;
-	int ret = 0;
-
-	if (pdata != NULL) {
-		ret = devm_gpio_request_one(&spi->dev, pdata->reset_gpio,
-				GPIOF_DIR_OUT | (!pdata->reset_inverted ?
-				GPIOF_INIT_HIGH : GPIOF_INIT_LOW),
-				"LMS283GF05 RESET");
-		if (ret)
-			return ret;
-	}
 
 	st = devm_kzalloc(&spi->dev, sizeof(struct lms283gf05_state),
 				GFP_KERNEL);
 	if (st == NULL)
 		return -ENOMEM;
 
+	st->reset = gpiod_get_optional(&spi->dev, "reset", GPIOD_OUT_LOW);
+	if (IS_ERR(st->reset))
+		return PTR_ERR(st->reset);
+	gpiod_set_consumer_name(st->reset, "LMS283GF05 RESET");
+
 	ld = devm_lcd_device_register(&spi->dev, "lms283gf05", &spi->dev, st,
 					&lms_ops);
 	if (IS_ERR(ld))
@@ -179,8 +170,8 @@ static int lms283gf05_probe(struct spi_device *spi)
 	spi_set_drvdata(spi, st);
 
 	/* kick in the LCD */
-	if (pdata)
-		lms283gf05_reset(pdata->reset_gpio, pdata->reset_inverted);
+	if (st->reset)
+		lms283gf05_reset(st->reset);
 	lms283gf05_toggle(spi, disp_initseq, ARRAY_SIZE(disp_initseq));
 
 	return 0;
diff --git a/include/linux/spi/lms283gf05.h b/include/linux/spi/lms283gf05.h
deleted file mode 100644
index f237b2d062e9..000000000000
--- a/include/linux/spi/lms283gf05.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * lms283gf05.h - Platform glue for Samsung LMS283GF05 LCD
- *
- * Copyright (C) 2009 Marek Vasut <marek.vasut@gmail.com>
-*/
-
-#ifndef _INCLUDE_LINUX_SPI_LMS283GF05_H_
-#define _INCLUDE_LINUX_SPI_LMS283GF05_H_
-
-struct lms283gf05_pdata {
-	unsigned long	reset_gpio;
-	bool		reset_inverted;
-};
-
-#endif /* _INCLUDE_LINUX_SPI_LMS283GF05_H_ */
-- 
cgit v1.2.3


From f9ce0be71d1fbb038ada15ced83474b0e63f264d Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Sat, 19 Dec 2020 15:19:23 +0300
Subject: mm: Cleanup faultaround and finish_fault() codepaths

alloc_set_pte() has two users with different requirements: in the
faultaround code, it called from an atomic context and PTE page table
has to be preallocated. finish_fault() can sleep and allocate page table
as needed.

PTL locking rules are also strange, hard to follow and overkill for
finish_fault().

Let's untangle the mess. alloc_set_pte() has gone now. All locking is
explicit.

The price is some code duplication to handle huge pages in faultaround
path, but it should be fine, having overall improvement in readability.

Link: https://lore.kernel.org/r/20201229132819.najtavneutnf7ajp@box
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
[will: s/from from/from/ in comment; spotted by willy]
Signed-off-by: Will Deacon <will@kernel.org>
---
 fs/xfs/xfs_file.c       |   6 +-
 include/linux/mm.h      |  12 +--
 include/linux/pgtable.h |  11 +++
 mm/filemap.c            | 177 +++++++++++++++++++++++++++++++-----------
 mm/memory.c             | 199 ++++++++++++++----------------------------------
 5 files changed, 213 insertions(+), 192 deletions(-)

(limited to 'include/linux')

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5b0f93f73837..111fe73bb8a7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1319,17 +1319,19 @@ xfs_filemap_pfn_mkwrite(
 	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
 }
 
-static void
+static vm_fault_t
 xfs_filemap_map_pages(
 	struct vm_fault		*vmf,
 	pgoff_t			start_pgoff,
 	pgoff_t			end_pgoff)
 {
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
+	vm_fault_t ret;
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	filemap_map_pages(vmf, start_pgoff, end_pgoff);
+	ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	return ret;
 }
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecdf8a8cd6ae..4572a9bc5862 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -542,8 +542,8 @@ struct vm_fault {
 					 * is not NULL, otherwise pmd.
 					 */
 	pgtable_t prealloc_pte;		/* Pre-allocated pte page table.
-					 * vm_ops->map_pages() calls
-					 * alloc_set_pte() from atomic context.
+					 * vm_ops->map_pages() sets up a page
+					 * table from atomic context.
 					 * do_fault_around() pre-allocates
 					 * page table to avoid allocation from
 					 * atomic context.
@@ -578,7 +578,7 @@ struct vm_operations_struct {
 	vm_fault_t (*fault)(struct vm_fault *vmf);
 	vm_fault_t (*huge_fault)(struct vm_fault *vmf,
 			enum page_entry_size pe_size);
-	void (*map_pages)(struct vm_fault *vmf,
+	vm_fault_t (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 	unsigned long (*pagesize)(struct vm_area_struct * area);
 
@@ -988,7 +988,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page);
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
+void do_set_pte(struct vm_fault *vmf, struct page *page);
+
 vm_fault_t finish_fault(struct vm_fault *vmf);
 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
 #endif
@@ -2622,7 +2624,7 @@ extern void truncate_inode_pages_final(struct address_space *);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern vm_fault_t filemap_fault(struct vm_fault *vmf);
-extern void filemap_map_pages(struct vm_fault *vmf,
+extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 8fcdfa52eb4b..36eb748f3c97 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1314,6 +1314,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
 #endif
 }
 
+/*
+ * the ordering of these checks is important for pmds with _page_devmap set.
+ * if we check pmd_trans_unstable() first we will trip the bad_pmd() check
+ * inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly
+ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
+ */
+static inline int pmd_devmap_trans_unstable(pmd_t *pmd)
+{
+	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
+}
+
 #ifndef CONFIG_NUMA_BALANCING
 /*
  * Technically a PTE can be PROTNONE even when not doing NUMA balancing but
diff --git a/mm/filemap.c b/mm/filemap.c
index 5c9d564317a5..c1f2dc89b8a7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,7 @@
 #include <linux/psi.h>
 #include <linux/ramfs.h>
 #include <linux/page_idle.h>
+#include <asm/pgalloc.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -2911,74 +2912,164 @@ out_retry:
 }
 EXPORT_SYMBOL(filemap_fault);
 
-void filemap_map_pages(struct vm_fault *vmf,
-		pgoff_t start_pgoff, pgoff_t end_pgoff)
+static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
 {
-	struct file *file = vmf->vma->vm_file;
+	struct mm_struct *mm = vmf->vma->vm_mm;
+
+	/* Huge page is mapped? No need to proceed. */
+	if (pmd_trans_huge(*vmf->pmd)) {
+		unlock_page(page);
+		put_page(page);
+		return true;
+	}
+
+	if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
+	    vm_fault_t ret = do_set_pmd(vmf, page);
+	    if (!ret) {
+		    /* The page is mapped successfully, reference consumed. */
+		    unlock_page(page);
+		    return true;
+	    }
+	}
+
+	if (pmd_none(*vmf->pmd)) {
+		vmf->ptl = pmd_lock(mm, vmf->pmd);
+		if (likely(pmd_none(*vmf->pmd))) {
+			mm_inc_nr_ptes(mm);
+			pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
+			vmf->prealloc_pte = NULL;
+		}
+		spin_unlock(vmf->ptl);
+	}
+
+	/* See comment in handle_pte_fault() */
+	if (pmd_devmap_trans_unstable(vmf->pmd)) {
+		unlock_page(page);
+		put_page(page);
+		return true;
+	}
+
+	return false;
+}
+
+static struct page *next_uptodate_page(struct page *page,
+				       struct address_space *mapping,
+				       struct xa_state *xas, pgoff_t end_pgoff)
+{
+	unsigned long max_idx;
+
+	do {
+		if (!page)
+			return NULL;
+		if (xas_retry(xas, page))
+			continue;
+		if (xa_is_value(page))
+			continue;
+		if (PageLocked(page))
+			continue;
+		if (!page_cache_get_speculative(page))
+			continue;
+		/* Has the page moved or been split? */
+		if (unlikely(page != xas_reload(xas)))
+			goto skip;
+		if (!PageUptodate(page) || PageReadahead(page))
+			goto skip;
+		if (PageHWPoison(page))
+			goto skip;
+		if (!trylock_page(page))
+			goto skip;
+		if (page->mapping != mapping)
+			goto unlock;
+		if (!PageUptodate(page))
+			goto unlock;
+		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+		if (xas->xa_index >= max_idx)
+			goto unlock;
+		return page;
+unlock:
+		unlock_page(page);
+skip:
+		put_page(page);
+	} while ((page = xas_next_entry(xas, end_pgoff)) != NULL);
+
+	return NULL;
+}
+
+static inline struct page *first_map_page(struct address_space *mapping,
+					  struct xa_state *xas,
+					  pgoff_t end_pgoff)
+{
+	return next_uptodate_page(xas_find(xas, end_pgoff),
+				  mapping, xas, end_pgoff);
+}
+
+static inline struct page *next_map_page(struct address_space *mapping,
+					 struct xa_state *xas,
+					 pgoff_t end_pgoff)
+{
+	return next_uptodate_page(xas_next_entry(xas, end_pgoff),
+				  mapping, xas, end_pgoff);
+}
+
+vm_fault_t filemap_map_pages(struct vm_fault *vmf,
+			     pgoff_t start_pgoff, pgoff_t end_pgoff)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	pgoff_t last_pgoff = start_pgoff;
-	unsigned long max_idx;
+	unsigned long address = vmf->address;
 	XA_STATE(xas, &mapping->i_pages, start_pgoff);
 	struct page *head, *page;
 	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+	vm_fault_t ret = 0;
 
 	rcu_read_lock();
-	xas_for_each(&xas, head, end_pgoff) {
-		if (xas_retry(&xas, head))
-			continue;
-		if (xa_is_value(head))
-			goto next;
+	head = first_map_page(mapping, &xas, end_pgoff);
+	if (!head)
+		goto out;
 
-		/*
-		 * Check for a locked page first, as a speculative
-		 * reference may adversely influence page migration.
-		 */
-		if (PageLocked(head))
-			goto next;
-		if (!page_cache_get_speculative(head))
-			goto next;
+	if (filemap_map_pmd(vmf, head)) {
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
 
-		/* Has the page moved or been split? */
-		if (unlikely(head != xas_reload(&xas)))
-			goto skip;
+	vmf->address = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
+	do {
 		page = find_subpage(head, xas.xa_index);
-
-		if (!PageUptodate(head) ||
-				PageReadahead(page) ||
-				PageHWPoison(page))
-			goto skip;
-		if (!trylock_page(head))
-			goto skip;
-
-		if (head->mapping != mapping || !PageUptodate(head))
-			goto unlock;
-
-		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
-		if (xas.xa_index >= max_idx)
+		if (PageHWPoison(page))
 			goto unlock;
 
 		if (mmap_miss > 0)
 			mmap_miss--;
 
 		vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
-		if (vmf->pte)
-			vmf->pte += xas.xa_index - last_pgoff;
+		vmf->pte += xas.xa_index - last_pgoff;
 		last_pgoff = xas.xa_index;
-		if (alloc_set_pte(vmf, page))
+
+		if (!pte_none(*vmf->pte))
 			goto unlock;
+
+		do_set_pte(vmf, page);
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache(vma, vmf->address, vmf->pte);
 		unlock_page(head);
-		goto next;
+
+		/* The fault is handled */
+		if (vmf->address == address)
+			ret = VM_FAULT_NOPAGE;
+		continue;
 unlock:
 		unlock_page(head);
-skip:
 		put_page(head);
-next:
-		/* Huge page is mapped? No need to proceed. */
-		if (pmd_trans_huge(*vmf->pmd))
-			break;
-	}
+	} while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL);
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+out:
 	rcu_read_unlock();
+	vmf->address = address;
 	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
+	return ret;
 }
 EXPORT_SYMBOL(filemap_map_pages);
 
diff --git a/mm/memory.c b/mm/memory.c
index feff48e1465a..3e2fc2950ad7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3503,7 +3503,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	if (pte_alloc(vma->vm_mm, vmf->pmd))
 		return VM_FAULT_OOM;
 
-	/* See the comment in pte_alloc_one_map() */
+	/* See comment in handle_pte_fault() */
 	if (unlikely(pmd_trans_unstable(vmf->pmd)))
 		return 0;
 
@@ -3643,66 +3643,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
 	return ret;
 }
 
-/*
- * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
- * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
- * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
- * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
- */
-static int pmd_devmap_trans_unstable(pmd_t *pmd)
-{
-	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
-}
-
-static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-
-	if (!pmd_none(*vmf->pmd))
-		goto map_pte;
-	if (vmf->prealloc_pte) {
-		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-		if (unlikely(!pmd_none(*vmf->pmd))) {
-			spin_unlock(vmf->ptl);
-			goto map_pte;
-		}
-
-		mm_inc_nr_ptes(vma->vm_mm);
-		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
-		spin_unlock(vmf->ptl);
-		vmf->prealloc_pte = NULL;
-	} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
-		return VM_FAULT_OOM;
-	}
-map_pte:
-	/*
-	 * If a huge pmd materialized under us just retry later.  Use
-	 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
-	 * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
-	 * under us and then back to pmd_none, as a result of MADV_DONTNEED
-	 * running immediately after a huge pmd fault in a different thread of
-	 * this mm, in turn leading to a misleading pmd_trans_huge() retval.
-	 * All we have to ensure is that it is a regular pmd that we can walk
-	 * with pte_offset_map() and we can do that through an atomic read in
-	 * C, which is what pmd_trans_unstable() provides.
-	 */
-	if (pmd_devmap_trans_unstable(vmf->pmd))
-		return VM_FAULT_NOPAGE;
-
-	/*
-	 * At this point we know that our vmf->pmd points to a page of ptes
-	 * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
-	 * for the duration of the fault.  If a racing MADV_DONTNEED runs and
-	 * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
-	 * be valid and we will re-check to make sure the vmf->pte isn't
-	 * pte_none() under vmf->ptl protection when we return to
-	 * alloc_set_pte().
-	 */
-	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
-			&vmf->ptl);
-	return 0;
-}
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void deposit_prealloc_pte(struct vm_fault *vmf)
 {
@@ -3717,7 +3657,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
 	vmf->prealloc_pte = NULL;
 }
 
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -3775,52 +3715,17 @@ out:
 	return ret;
 }
 #else
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 {
-	BUILD_BUG();
-	return 0;
+	return VM_FAULT_FALLBACK;
 }
 #endif
 
-/**
- * alloc_set_pte - setup new PTE entry for given page and add reverse page
- * mapping. If needed, the function allocates page table or use pre-allocated.
- *
- * @vmf: fault environment
- * @page: page to map
- *
- * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
- * return.
- *
- * Target users are page handler itself and implementations of
- * vm_ops->map_pages.
- *
- * Return: %0 on success, %VM_FAULT_ code in case of error.
- */
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
+void do_set_pte(struct vm_fault *vmf, struct page *page)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	pte_t entry;
-	vm_fault_t ret;
-
-	if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
-		ret = do_set_pmd(vmf, page);
-		if (ret != VM_FAULT_FALLBACK)
-			return ret;
-	}
-
-	if (!vmf->pte) {
-		ret = pte_alloc_one_map(vmf);
-		if (ret)
-			return ret;
-	}
-
-	/* Re-check under ptl */
-	if (unlikely(!pte_none(*vmf->pte))) {
-		update_mmu_tlb(vma, vmf->address, vmf->pte);
-		return VM_FAULT_NOPAGE;
-	}
 
 	flush_icache_page(vma, page);
 	entry = mk_pte(page, vma->vm_page_prot);
@@ -3837,14 +3742,8 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
 		page_add_file_rmap(page, false);
 	}
 	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
-
-	/* no need to invalidate: a not-present page won't be cached */
-	update_mmu_cache(vma, vmf->address, vmf->pte);
-
-	return 0;
 }
 
-
 /**
  * finish_fault - finish page fault once we have prepared the page to fault
  *
@@ -3862,12 +3761,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
  */
 vm_fault_t finish_fault(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
-	vm_fault_t ret = 0;
+	vm_fault_t ret;
 
 	/* Did we COW the page? */
-	if ((vmf->flags & FAULT_FLAG_WRITE) &&
-	    !(vmf->vma->vm_flags & VM_SHARED))
+	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
 		page = vmf->cow_page;
 	else
 		page = vmf->page;
@@ -3876,12 +3775,38 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	 * check even for read faults because we might have lost our CoWed
 	 * page
 	 */
-	if (!(vmf->vma->vm_flags & VM_SHARED))
-		ret = check_stable_address_space(vmf->vma->vm_mm);
-	if (!ret)
-		ret = alloc_set_pte(vmf, page);
-	if (vmf->pte)
-		pte_unmap_unlock(vmf->pte, vmf->ptl);
+	if (!(vma->vm_flags & VM_SHARED)) {
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			return ret;
+	}
+
+	if (pmd_none(*vmf->pmd)) {
+		if (PageTransCompound(page)) {
+			ret = do_set_pmd(vmf, page);
+			if (ret != VM_FAULT_FALLBACK)
+				return ret;
+		}
+
+		if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
+			return VM_FAULT_OOM;
+	}
+
+	/* See comment in handle_pte_fault() */
+	if (pmd_devmap_trans_unstable(vmf->pmd))
+		return 0;
+
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+				      vmf->address, &vmf->ptl);
+	ret = 0;
+	/* Re-check under ptl */
+	if (likely(pte_none(*vmf->pte)))
+		do_set_pte(vmf, page);
+	else
+		ret = VM_FAULT_NOPAGE;
+
+	update_mmu_tlb(vma, vmf->address, vmf->pte);
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return ret;
 }
 
@@ -3951,13 +3876,12 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
 	pgoff_t start_pgoff = vmf->pgoff;
 	pgoff_t end_pgoff;
 	int off;
-	vm_fault_t ret = 0;
 
 	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
 	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
 
-	vmf->address = max(address & mask, vmf->vma->vm_start);
-	off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+	address = max(address & mask, vmf->vma->vm_start);
+	off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
 	start_pgoff -= off;
 
 	/*
@@ -3965,7 +3889,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
 	 *  the vma or nr_pages from start_pgoff, depending what is nearest.
 	 */
 	end_pgoff = start_pgoff -
-		((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+		((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
 		PTRS_PER_PTE - 1;
 	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
 			start_pgoff + nr_pages - 1);
@@ -3973,31 +3897,11 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
 	if (pmd_none(*vmf->pmd)) {
 		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
 		if (!vmf->prealloc_pte)
-			goto out;
+			return VM_FAULT_OOM;
 		smp_wmb(); /* See comment in __pte_alloc() */
 	}
 
-	vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
-
-	/* Huge page is mapped? Page fault is solved */
-	if (pmd_trans_huge(*vmf->pmd)) {
-		ret = VM_FAULT_NOPAGE;
-		goto out;
-	}
-
-	/* ->map_pages() haven't done anything useful. Cold page cache? */
-	if (!vmf->pte)
-		goto out;
-
-	/* check if the page fault is solved */
-	vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
-	if (!pte_none(*vmf->pte))
-		ret = VM_FAULT_NOPAGE;
-	pte_unmap_unlock(vmf->pte, vmf->ptl);
-out:
-	vmf->address = address;
-	vmf->pte = NULL;
-	return ret;
+	return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
 }
 
 static vm_fault_t do_read_fault(struct vm_fault *vmf)
@@ -4353,7 +4257,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		 */
 		vmf->pte = NULL;
 	} else {
-		/* See comment in pte_alloc_one_map() */
+		/*
+		 * If a huge pmd materialized under us just retry later.  Use
+		 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
+		 * of pmd_trans_huge() to ensure the pmd didn't become
+		 * pmd_trans_huge under us and then back to pmd_none, as a
+		 * result of MADV_DONTNEED running immediately after a huge pmd
+		 * fault in a different thread of this mm, in turn leading to a
+		 * misleading pmd_trans_huge() retval. All we have to ensure is
+		 * that it is a regular pmd that we can walk with
+		 * pte_offset_map() and we can do that through an atomic read
+		 * in C, which is what pmd_trans_unstable() provides.
+		 */
 		if (pmd_devmap_trans_unstable(vmf->pmd))
 			return 0;
 		/*
-- 
cgit v1.2.3


From 46bdb4277f98e70d0c91f4289897ade533fe9e80 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 24 Nov 2020 18:48:26 +0000
Subject: mm: Allow architectures to request 'old' entries when prefaulting

Commit 5c0a85fad949 ("mm: make faultaround produce old ptes") changed
the "faultaround" behaviour to initialise prefaulted PTEs as 'old',
since this avoids vmscan wrongly assuming that they are hot, despite
having never been explicitly accessed by userspace. The change has been
shown to benefit numerous arm64 micro-architectures (with hardware
access flag) running Android, where both application launch latency and
direct reclaim time are significantly reduced (by 10%+ and ~80%
respectively).

Unfortunately, commit 315d09bf30c2 ("Revert "mm: make faultaround
produce old ptes"") reverted the change due to it being identified as
the cause of a ~6% regression in unixbench on x86. Experiments on a
variety of recent arm64 micro-architectures indicate that unixbench is
not affected by the original commit, which appears to yield a 0-1%
performance improvement.

Since one size does not fit all for the initial state of prefaulted
PTEs, introduce arch_wants_old_prefaulted_pte(), which allows an
architecture to opt-in to 'old' prefaulted PTEs at runtime based on
whatever criteria it may have.

Cc: Jan Kara <jack@suse.cz>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Vinayak Menon <vinmenon@codeaurora.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/mm.h |  5 ++++-
 mm/filemap.c       | 14 ++++++++++----
 mm/memory.c        | 20 +++++++++++++++++++-
 3 files changed, 33 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4572a9bc5862..251a2339befb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -434,6 +434,7 @@ extern pgprot_t protection_map[16];
  * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
  * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
  * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ * @FAULT_FLAG_PREFAULT: Fault was a prefault.
  *
  * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
  * whether we would allow page faults to retry by specifying these two
@@ -464,6 +465,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_REMOTE			0x80
 #define FAULT_FLAG_INSTRUCTION  		0x100
 #define FAULT_FLAG_INTERRUPTIBLE		0x200
+#define FAULT_FLAG_PREFAULT			0x400
 
 /*
  * The default fault flags that should be used by most of the
@@ -501,7 +503,8 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags)
 	{ FAULT_FLAG_USER,		"USER" }, \
 	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
 	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
-	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }
+	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }, \
+	{ FAULT_FLAG_PREFAULT,		"PREFAULT" }
 
 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
diff --git a/mm/filemap.c b/mm/filemap.c
index c1f2dc89b8a7..a6dc97906c8e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3019,6 +3019,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	struct address_space *mapping = file->f_mapping;
 	pgoff_t last_pgoff = start_pgoff;
 	unsigned long address = vmf->address;
+	unsigned long flags = vmf->flags;
 	XA_STATE(xas, &mapping->i_pages, start_pgoff);
 	struct page *head, *page;
 	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
@@ -3051,14 +3052,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		if (!pte_none(*vmf->pte))
 			goto unlock;
 
+		/* We're about to handle the fault */
+		if (vmf->address == address) {
+			vmf->flags &= ~FAULT_FLAG_PREFAULT;
+			ret = VM_FAULT_NOPAGE;
+		} else {
+			vmf->flags |= FAULT_FLAG_PREFAULT;
+		}
+
 		do_set_pte(vmf, page);
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, vmf->address, vmf->pte);
 		unlock_page(head);
-
-		/* The fault is handled */
-		if (vmf->address == address)
-			ret = VM_FAULT_NOPAGE;
 		continue;
 unlock:
 		unlock_page(head);
@@ -3067,6 +3072,7 @@ unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
 	rcu_read_unlock();
+	vmf->flags = flags;
 	vmf->address = address;
 	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
 	return ret;
diff --git a/mm/memory.c b/mm/memory.c
index 3e2fc2950ad7..f0e7c589ca9d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -134,6 +134,18 @@ static inline bool arch_faults_on_old_pte(void)
 }
 #endif
 
+#ifndef arch_wants_old_prefaulted_pte
+static inline bool arch_wants_old_prefaulted_pte(void)
+{
+	/*
+	 * Transitioning a PTE from 'old' to 'young' can be expensive on
+	 * some architectures, even if it's performed in hardware. By
+	 * default, "false" means prefaulted entries will be 'young'.
+	 */
+	return false;
+}
+#endif
+
 static int __init disable_randmaps(char *s)
 {
 	randomize_va_space = 0;
@@ -3725,11 +3737,17 @@ void do_set_pte(struct vm_fault *vmf, struct page *page)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool prefault = vmf->flags & FAULT_FLAG_PREFAULT;
 	pte_t entry;
 
 	flush_icache_page(vma, page);
 	entry = mk_pte(page, vma->vm_page_prot);
-	entry = pte_sw_mkyoung(entry);
+
+	if (prefault && arch_wants_old_prefaulted_pte())
+		entry = pte_mkold(entry);
+	else
+		entry = pte_sw_mkyoung(entry);
+
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 	/* copy-on-write page */
-- 
cgit v1.2.3


From 9f77c58d65ff216d71f5ab829b6f39afefbde10c Mon Sep 17 00:00:00 2001
From: Pi-Hsun Shih <pihsun@chromium.org>
Date: Mon, 21 Dec 2020 12:12:23 +0800
Subject: platform/chrome: cros_ec_commands: Add host command to keep AP off
 after EC reset.

Add command to EC_CMD_REBOOT_EC to reset EC but don't boot AP.

Signed-off-by: Pi-Hsun Shih <pihsun@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Link: https://lore.kernel.org/r/20201221041231.14516-1-pihsun@chromium.org
---
 include/linux/platform_data/cros_ec_commands.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 9787715540c7..d3c40220b281 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -4741,6 +4741,7 @@ enum ec_reboot_cmd {
 	EC_REBOOT_DISABLE_JUMP = 5,  /* Disable jump until next reboot */
 	EC_REBOOT_HIBERNATE = 6,     /* Hibernate EC */
 	EC_REBOOT_HIBERNATE_CLEAR_AP_OFF = 7, /* and clears AP_OFF flag */
+	EC_REBOOT_COLD_AP_OFF = 8,   /* Cold-reboot and don't boot AP */
 };
 
 /* Flags for ec_params_reboot_ec.reboot_flags */
-- 
cgit v1.2.3


From f6f1f8e6e3eea25f539105d48166e91f0ab46dd1 Mon Sep 17 00:00:00 2001
From: Aswath Govindraju <a-govindraju@ti.com>
Date: Tue, 5 Jan 2021 16:28:12 +0530
Subject: misc: eeprom_93xx46: Add quirk to support Microchip 93LC46B eeprom

A dummy zero bit is sent preceding the data during a read transfer by the
Microchip 93LC46B eeprom (section 2.7 of[1]). This results in right shift
of data during a read. In order to ignore this bit a quirk can be added to
send an extra zero bit after the read address.

Add a quirk to ignore the zero bit sent before data by adding a zero bit
after the read address.

[1] - https://www.mouser.com/datasheet/2/268/20001749K-277859.pdf

Signed-off-by: Aswath Govindraju <a-govindraju@ti.com>
Link: https://lore.kernel.org/r/20210105105817.17644-3-a-govindraju@ti.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/eeprom/eeprom_93xx46.c | 15 +++++++++++++++
 include/linux/eeprom_93xx46.h       |  2 ++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/misc/eeprom/eeprom_93xx46.c b/drivers/misc/eeprom/eeprom_93xx46.c
index e130a0d858e9..80114f4c80ad 100644
--- a/drivers/misc/eeprom/eeprom_93xx46.c
+++ b/drivers/misc/eeprom/eeprom_93xx46.c
@@ -35,6 +35,10 @@ static const struct eeprom_93xx46_devtype_data atmel_at93c46d_data = {
 		  EEPROM_93XX46_QUIRK_INSTRUCTION_LENGTH,
 };
 
+static const struct eeprom_93xx46_devtype_data microchip_93lc46b_data = {
+	.quirks = EEPROM_93XX46_QUIRK_EXTRA_READ_CYCLE,
+};
+
 struct eeprom_93xx46_dev {
 	struct spi_device *spi;
 	struct eeprom_93xx46_platform_data *pdata;
@@ -55,6 +59,11 @@ static inline bool has_quirk_instruction_length(struct eeprom_93xx46_dev *edev)
 	return edev->pdata->quirks & EEPROM_93XX46_QUIRK_INSTRUCTION_LENGTH;
 }
 
+static inline bool has_quirk_extra_read_cycle(struct eeprom_93xx46_dev *edev)
+{
+	return edev->pdata->quirks & EEPROM_93XX46_QUIRK_EXTRA_READ_CYCLE;
+}
+
 static int eeprom_93xx46_read(void *priv, unsigned int off,
 			      void *val, size_t count)
 {
@@ -96,6 +105,11 @@ static int eeprom_93xx46_read(void *priv, unsigned int off,
 		dev_dbg(&edev->spi->dev, "read cmd 0x%x, %d Hz\n",
 			cmd_addr, edev->spi->max_speed_hz);
 
+		if (has_quirk_extra_read_cycle(edev)) {
+			cmd_addr <<= 1;
+			bits += 1;
+		}
+
 		spi_message_init(&m);
 
 		t[0].tx_buf = (char *)&cmd_addr;
@@ -363,6 +377,7 @@ static void select_deassert(void *context)
 static const struct of_device_id eeprom_93xx46_of_table[] = {
 	{ .compatible = "eeprom-93xx46", },
 	{ .compatible = "atmel,at93c46d", .data = &atmel_at93c46d_data, },
+	{ .compatible = "microchip,93lc46b", .data = &microchip_93lc46b_data, },
 	{}
 };
 MODULE_DEVICE_TABLE(of, eeprom_93xx46_of_table);
diff --git a/include/linux/eeprom_93xx46.h b/include/linux/eeprom_93xx46.h
index eec7928ff8fe..99580c22f91a 100644
--- a/include/linux/eeprom_93xx46.h
+++ b/include/linux/eeprom_93xx46.h
@@ -16,6 +16,8 @@ struct eeprom_93xx46_platform_data {
 #define EEPROM_93XX46_QUIRK_SINGLE_WORD_READ		(1 << 0)
 /* Instructions such as EWEN are (addrlen + 2) in length. */
 #define EEPROM_93XX46_QUIRK_INSTRUCTION_LENGTH		(1 << 1)
+/* Add extra cycle after address during a read */
+#define EEPROM_93XX46_QUIRK_EXTRA_READ_CYCLE		BIT(2)
 
 	/*
 	 * optional hooks to control additional logic
-- 
cgit v1.2.3


From 149ae80b1d50e7db5ac7df1cdf0820017b70e716 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Thu, 14 Jan 2021 11:53:18 +0100
Subject: soc: bcm: brcmstb: add stubs for getting platform IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some brcmstb drivers may be shared with other SoC families. E.g. the
same USB PHY block is shared by brcmstb and BCM4908.

To avoid building brcmstb common code on non-brcmstb platforms we need
stubs for:
1. brcmstb_get_family_id()
2. brcmstb_get_product_id()
(to avoid "undefined reference to" errors).

With this change PHY_BRCM_USB will not have to unconditionally select
SOC_BRCMSTB anymore.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 include/linux/soc/brcmstb/brcmstb.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/brcmstb/brcmstb.h b/include/linux/soc/brcmstb/brcmstb.h
index 8e884e0dda0a..f2b768852777 100644
--- a/include/linux/soc/brcmstb/brcmstb.h
+++ b/include/linux/soc/brcmstb/brcmstb.h
@@ -2,6 +2,8 @@
 #ifndef __BRCMSTB_SOC_H
 #define __BRCMSTB_SOC_H
 
+#include <linux/kconfig.h>
+
 static inline u32 BRCM_ID(u32 reg)
 {
 	return reg >> 28 ? reg >> 16 : reg >> 8;
@@ -12,6 +14,8 @@ static inline u32 BRCM_REV(u32 reg)
 	return reg & 0xff;
 }
 
+#if IS_ENABLED(CONFIG_SOC_BRCMSTB)
+
 /*
  * Helper functions for getting family or product id from the
  * SoC driver.
@@ -19,4 +23,16 @@ static inline u32 BRCM_REV(u32 reg)
 u32 brcmstb_get_family_id(void);
 u32 brcmstb_get_product_id(void);
 
+#else
+static inline u32 brcmstb_get_family_id(void)
+{
+	return 0;
+}
+
+static inline u32 brcmstb_get_product_id(void)
+{
+	return 0;
+}
+#endif
+
 #endif /* __BRCMSTB_SOC_H */
-- 
cgit v1.2.3


From 9cacf81f8161111db25f98e78a7a0e32ae142b3f Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Fri, 15 Jan 2021 08:34:59 -0800
Subject: bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE

Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE.
We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom
call in do_tcp_getsockopt using the on-stack data. This removes
3% overhead for locking/unlocking the socket.

Without this patch:
     3.38%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
            |
             --3.30%--__cgroup_bpf_run_filter_getsockopt
                       |
                        --0.81%--__kmalloc

With the patch applied:
     0.52%     0.12%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt_kern

Note, exporting uapi/tcp.h requires removing netinet/tcp.h
from test_progs.h because those headers have confliciting
definitions.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-2-sdf@google.com
---
 include/linux/bpf-cgroup.h                         |  27 +-
 include/linux/indirect_call_wrapper.h              |   6 +
 include/net/sock.h                                 |   2 +
 include/net/tcp.h                                  |   1 +
 kernel/bpf/cgroup.c                                |  46 +++
 net/ipv4/tcp.c                                     |  14 +
 net/ipv4/tcp_ipv4.c                                |   1 +
 net/ipv6/tcp_ipv6.c                                |   1 +
 net/socket.c                                       |   3 +
 tools/include/uapi/linux/tcp.h                     | 357 +++++++++++++++++++++
 .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c  |   1 +
 .../selftests/bpf/prog_tests/cls_redirect.c        |   1 +
 .../selftests/bpf/prog_tests/sockmap_basic.c       |   1 +
 .../testing/selftests/bpf/prog_tests/sockopt_sk.c  |  28 ++
 tools/testing/selftests/bpf/progs/sockopt_sk.c     |  23 +-
 tools/testing/selftests/bpf/test_progs.h           |   1 -
 16 files changed, 506 insertions(+), 7 deletions(-)
 create mode 100644 tools/include/uapi/linux/tcp.h

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 72e69a0e1e8c..bcb2915e6124 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 				       int __user *optlen, int max_optlen,
 				       int retval);
 
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+					    int optname, void *optval,
+					    int *optlen, int retval);
+
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
@@ -364,10 +368,23 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 ({									       \
 	int __ret = retval;						       \
 	if (cgroup_bpf_enabled)						       \
-		__ret = __cgroup_bpf_run_filter_getsockopt(sock, level,	       \
-							   optname, optval,    \
-							   optlen, max_optlen, \
-							   retval);	       \
+		if (!(sock)->sk_prot->bpf_bypass_getsockopt ||		       \
+		    !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \
+					tcp_bpf_bypass_getsockopt,	       \
+					level, optname))		       \
+			__ret = __cgroup_bpf_run_filter_getsockopt(	       \
+				sock, level, optname, optval, optlen,	       \
+				max_optlen, retval);			       \
+	__ret;								       \
+})
+
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval,      \
+					    optlen, retval)		       \
+({									       \
+	int __ret = retval;						       \
+	if (cgroup_bpf_enabled)						       \
+		__ret = __cgroup_bpf_run_filter_getsockopt_kern(	       \
+			sock, level, optname, optval, optlen, retval);	       \
 	__ret;								       \
 })
 
@@ -452,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
 				       optlen, max_optlen, retval) ({ retval; })
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
+					    optlen, retval) ({ retval; })
 #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
 				       kernel_optval) ({ 0; })
 
diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h
index 54c02c84906a..cfcfef37b2f1 100644
--- a/include/linux/indirect_call_wrapper.h
+++ b/include/linux/indirect_call_wrapper.h
@@ -60,4 +60,10 @@
 #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__)
 #endif
 
+#if IS_ENABLED(CONFIG_INET)
+#define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
+#else
+#define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__)
+#endif
+
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 129d200bccb4..7644ea64a376 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1174,6 +1174,8 @@ struct proto {
 
 	int			(*backlog_rcv) (struct sock *sk,
 						struct sk_buff *skb);
+	bool			(*bpf_bypass_getsockopt)(int level,
+							 int optname);
 
 	void		(*release_cb)(struct sock *sk);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 78d13c88720f..4bb42fb19711 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,
 		      struct poll_table_struct *wait);
 int tcp_getsockopt(struct sock *sk, int level, int optname,
 		   char __user *optval, int __user *optlen);
+bool tcp_bpf_bypass_getsockopt(int level, int optname);
 int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 		   unsigned int optlen);
 void tcp_set_keepalive(struct sock *sk, int val);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 96555a8a2c54..416e7738981b 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1486,6 +1486,52 @@ out:
 	sockopt_free_buf(&ctx);
 	return ret;
 }
+
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+					    int optname, void *optval,
+					    int *optlen, int retval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = level,
+		.optname = optname,
+		.retval = retval,
+		.optlen = *optlen,
+		.optval = optval,
+		.optval_end = optval + *optlen,
+	};
+	int ret;
+
+	/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
+	 * user data back into BPF buffer when reval != 0. This is
+	 * done as an optimization to avoid extra copy, assuming
+	 * kernel won't populate the data in case of an error.
+	 * Here we always pass the data and memset() should
+	 * be called if that data shouldn't be "exported".
+	 */
+
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	if (!ret)
+		return -EPERM;
+
+	if (ctx.optlen > *optlen)
+		return -EFAULT;
+
+	/* BPF programs only allowed to set retval to 0, not some
+	 * arbitrary value.
+	 */
+	if (ctx.retval != 0 && ctx.retval != retval)
+		return -EFAULT;
+
+	/* BPF programs can shrink the buffer, export the modifications.
+	 */
+	if (ctx.optlen != 0)
+		*optlen = ctx.optlen;
+
+	return ctx.retval;
+}
 #endif
 
 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 856ae516ac18..26aa923cf522 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4099,6 +4099,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			return -EFAULT;
 		lock_sock(sk);
 		err = tcp_zerocopy_receive(sk, &zc);
+		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
+							  &zc, &len, err);
 		release_sock(sk);
 		if (len >= offsetofend(struct tcp_zerocopy_receive, err))
 			goto zerocopy_rcv_sk_err;
@@ -4133,6 +4135,18 @@ zerocopy_rcv_out:
 	return 0;
 }
 
+bool tcp_bpf_bypass_getsockopt(int level, int optname)
+{
+	/* TCP do_tcp_getsockopt has optimized getsockopt implementation
+	 * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
+	 */
+	if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
+
 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		   int __user *optlen)
 {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 777306b5bc22..62b6fd385a47 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2793,6 +2793,7 @@ struct proto tcp_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0e1509b02cb3..8539715ff035 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
 	.keepalive		= tcp_set_keepalive,
 	.recvmsg		= tcp_recvmsg,
 	.sendmsg		= tcp_sendmsg,
diff --git a/net/socket.c b/net/socket.c
index 33e8b6c4e1d3..7f0617ab5437 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
 	return __sys_setsockopt(fd, level, optname, optval, optlen);
 }
 
+INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
+							 int optname));
+
 /*
  *	Get a socket option. Because we don't know the option lengths we have
  *	to pass a user mode parameter for the protocols to sort out.
diff --git a/tools/include/uapi/linux/tcp.h b/tools/include/uapi/linux/tcp.h
new file mode 100644
index 000000000000..13ceeb395eb8
--- /dev/null
+++ b/tools/include/uapi/linux/tcp.h
@@ -0,0 +1,357 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Definitions for the TCP protocol.
+ *
+ * Version:	@(#)tcp.h	1.0.2	04/28/93
+ *
+ * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _UAPI_LINUX_TCP_H
+#define _UAPI_LINUX_TCP_H
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <linux/socket.h>
+
+struct tcphdr {
+	__be16	source;
+	__be16	dest;
+	__be32	seq;
+	__be32	ack_seq;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u16	res1:4,
+		doff:4,
+		fin:1,
+		syn:1,
+		rst:1,
+		psh:1,
+		ack:1,
+		urg:1,
+		ece:1,
+		cwr:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u16	doff:4,
+		res1:4,
+		cwr:1,
+		ece:1,
+		urg:1,
+		ack:1,
+		psh:1,
+		rst:1,
+		syn:1,
+		fin:1;
+#else
+#error	"Adjust your <asm/byteorder.h> defines"
+#endif	
+	__be16	window;
+	__sum16	check;
+	__be16	urg_ptr;
+};
+
+/*
+ *	The union cast uses a gcc extension to avoid aliasing problems
+ *  (union is compatible to any of its members)
+ *  This means this part of the code is -fstrict-aliasing safe now.
+ */
+union tcp_word_hdr { 
+	struct tcphdr hdr;
+	__be32 		  words[5];
+}; 
+
+#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) 
+
+enum { 
+	TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000),
+	TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000),
+	TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000),
+	TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000),
+	TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000),
+	TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000),
+	TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000),
+	TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000),
+	TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000),
+	TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000)
+}; 
+
+/*
+ * TCP general constants
+ */
+#define TCP_MSS_DEFAULT		 536U	/* IPv4 (RFC1122, RFC2581) */
+#define TCP_MSS_DESIRED		1220U	/* IPv6 (tunneled), EDNS0 (RFC3226) */
+
+/* TCP socket options */
+#define TCP_NODELAY		1	/* Turn off Nagle's algorithm. */
+#define TCP_MAXSEG		2	/* Limit MSS */
+#define TCP_CORK		3	/* Never send partially complete segments */
+#define TCP_KEEPIDLE		4	/* Start keeplives after this period */
+#define TCP_KEEPINTVL		5	/* Interval between keepalives */
+#define TCP_KEEPCNT		6	/* Number of keepalives before death */
+#define TCP_SYNCNT		7	/* Number of SYN retransmits */
+#define TCP_LINGER2		8	/* Life time of orphaned FIN-WAIT-2 state */
+#define TCP_DEFER_ACCEPT	9	/* Wake up listener only when data arrive */
+#define TCP_WINDOW_CLAMP	10	/* Bound advertised window */
+#define TCP_INFO		11	/* Information about this connection. */
+#define TCP_QUICKACK		12	/* Block/reenable quick acks */
+#define TCP_CONGESTION		13	/* Congestion control algorithm */
+#define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */
+#define TCP_THIN_LINEAR_TIMEOUTS 16      /* Use linear timeouts for thin streams*/
+#define TCP_THIN_DUPACK         17      /* Fast retrans. after 1 dupack */
+#define TCP_USER_TIMEOUT	18	/* How long for loss retry before timeout */
+#define TCP_REPAIR		19	/* TCP sock is under repair right now */
+#define TCP_REPAIR_QUEUE	20
+#define TCP_QUEUE_SEQ		21
+#define TCP_REPAIR_OPTIONS	22
+#define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
+#define TCP_TIMESTAMP		24
+#define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
+#define TCP_CC_INFO		26	/* Get Congestion Control (optional) info */
+#define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
+#define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
+#define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */
+#define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */
+#define TCP_ULP			31	/* Attach a ULP to a TCP connection */
+#define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
+#define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
+#define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
+#define TCP_ZEROCOPY_RECEIVE	35
+#define TCP_INQ			36	/* Notify bytes available to read as a cmsg on read */
+
+#define TCP_CM_INQ		TCP_INQ
+
+#define TCP_TX_DELAY		37	/* delay outgoing packets by XX usec */
+
+
+#define TCP_REPAIR_ON		1
+#define TCP_REPAIR_OFF		0
+#define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
+
+struct tcp_repair_opt {
+	__u32	opt_code;
+	__u32	opt_val;
+};
+
+struct tcp_repair_window {
+	__u32	snd_wl1;
+	__u32	snd_wnd;
+	__u32	max_window;
+
+	__u32	rcv_wnd;
+	__u32	rcv_wup;
+};
+
+enum {
+	TCP_NO_QUEUE,
+	TCP_RECV_QUEUE,
+	TCP_SEND_QUEUE,
+	TCP_QUEUES_NR,
+};
+
+/* why fastopen failed from client perspective */
+enum tcp_fastopen_client_fail {
+	TFO_STATUS_UNSPEC, /* catch-all */
+	TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */
+	TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */
+	TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */
+};
+
+/* for TCP_INFO socket option */
+#define TCPI_OPT_TIMESTAMPS	1
+#define TCPI_OPT_SACK		2
+#define TCPI_OPT_WSCALE		4
+#define TCPI_OPT_ECN		8 /* ECN was negociated at TCP session init */
+#define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
+#define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
+
+/*
+ * Sender's congestion state indicating normal or abnormal situations
+ * in the last round of packets sent. The state is driven by the ACK
+ * information and timer events.
+ */
+enum tcp_ca_state {
+	/*
+	 * Nothing bad has been observed recently.
+	 * No apparent reordering, packet loss, or ECN marks.
+	 */
+	TCP_CA_Open = 0,
+#define TCPF_CA_Open	(1<<TCP_CA_Open)
+	/*
+	 * The sender enters disordered state when it has received DUPACKs or
+	 * SACKs in the last round of packets sent. This could be due to packet
+	 * loss or reordering but needs further information to confirm packets
+	 * have been lost.
+	 */
+	TCP_CA_Disorder = 1,
+#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
+	/*
+	 * The sender enters Congestion Window Reduction (CWR) state when it
+	 * has received ACKs with ECN-ECE marks, or has experienced congestion
+	 * or packet discard on the sender host (e.g. qdisc).
+	 */
+	TCP_CA_CWR = 2,
+#define TCPF_CA_CWR	(1<<TCP_CA_CWR)
+	/*
+	 * The sender is in fast recovery and retransmitting lost packets,
+	 * typically triggered by ACK events.
+	 */
+	TCP_CA_Recovery = 3,
+#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
+	/*
+	 * The sender is in loss recovery triggered by retransmission timeout.
+	 */
+	TCP_CA_Loss = 4
+#define TCPF_CA_Loss	(1<<TCP_CA_Loss)
+};
+
+struct tcp_info {
+	__u8	tcpi_state;
+	__u8	tcpi_ca_state;
+	__u8	tcpi_retransmits;
+	__u8	tcpi_probes;
+	__u8	tcpi_backoff;
+	__u8	tcpi_options;
+	__u8	tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
+	__u8	tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
+
+	__u32	tcpi_rto;
+	__u32	tcpi_ato;
+	__u32	tcpi_snd_mss;
+	__u32	tcpi_rcv_mss;
+
+	__u32	tcpi_unacked;
+	__u32	tcpi_sacked;
+	__u32	tcpi_lost;
+	__u32	tcpi_retrans;
+	__u32	tcpi_fackets;
+
+	/* Times. */
+	__u32	tcpi_last_data_sent;
+	__u32	tcpi_last_ack_sent;     /* Not remembered, sorry. */
+	__u32	tcpi_last_data_recv;
+	__u32	tcpi_last_ack_recv;
+
+	/* Metrics. */
+	__u32	tcpi_pmtu;
+	__u32	tcpi_rcv_ssthresh;
+	__u32	tcpi_rtt;
+	__u32	tcpi_rttvar;
+	__u32	tcpi_snd_ssthresh;
+	__u32	tcpi_snd_cwnd;
+	__u32	tcpi_advmss;
+	__u32	tcpi_reordering;
+
+	__u32	tcpi_rcv_rtt;
+	__u32	tcpi_rcv_space;
+
+	__u32	tcpi_total_retrans;
+
+	__u64	tcpi_pacing_rate;
+	__u64	tcpi_max_pacing_rate;
+	__u64	tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
+	__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
+	__u32	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */
+	__u32	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */
+
+	__u32	tcpi_notsent_bytes;
+	__u32	tcpi_min_rtt;
+	__u32	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */
+	__u32	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */
+
+	__u64   tcpi_delivery_rate;
+
+	__u64	tcpi_busy_time;      /* Time (usec) busy sending data */
+	__u64	tcpi_rwnd_limited;   /* Time (usec) limited by receive window */
+	__u64	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
+
+	__u32	tcpi_delivered;
+	__u32	tcpi_delivered_ce;
+
+	__u64	tcpi_bytes_sent;     /* RFC4898 tcpEStatsPerfHCDataOctetsOut */
+	__u64	tcpi_bytes_retrans;  /* RFC4898 tcpEStatsPerfOctetsRetrans */
+	__u32	tcpi_dsack_dups;     /* RFC4898 tcpEStatsStackDSACKDups */
+	__u32	tcpi_reord_seen;     /* reordering events seen */
+
+	__u32	tcpi_rcv_ooopack;    /* Out-of-order packets received */
+
+	__u32	tcpi_snd_wnd;	     /* peer's advertised receive window after
+				      * scaling (bytes)
+				      */
+};
+
+/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
+enum {
+	TCP_NLA_PAD,
+	TCP_NLA_BUSY,		/* Time (usec) busy sending data */
+	TCP_NLA_RWND_LIMITED,	/* Time (usec) limited by receive window */
+	TCP_NLA_SNDBUF_LIMITED,	/* Time (usec) limited by send buffer */
+	TCP_NLA_DATA_SEGS_OUT,	/* Data pkts sent including retransmission */
+	TCP_NLA_TOTAL_RETRANS,	/* Data pkts retransmitted */
+	TCP_NLA_PACING_RATE,    /* Pacing rate in bytes per second */
+	TCP_NLA_DELIVERY_RATE,  /* Delivery rate in bytes per second */
+	TCP_NLA_SND_CWND,       /* Sending congestion window */
+	TCP_NLA_REORDERING,     /* Reordering metric */
+	TCP_NLA_MIN_RTT,        /* minimum RTT */
+	TCP_NLA_RECUR_RETRANS,  /* Recurring retransmits for the current pkt */
+	TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */
+	TCP_NLA_SNDQ_SIZE,	/* Data (bytes) pending in send queue */
+	TCP_NLA_CA_STATE,	/* ca_state of socket */
+	TCP_NLA_SND_SSTHRESH,	/* Slow start size threshold */
+	TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */
+	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
+	TCP_NLA_BYTES_SENT,	/* Data bytes sent including retransmission */
+	TCP_NLA_BYTES_RETRANS,	/* Data bytes retransmitted */
+	TCP_NLA_DSACK_DUPS,	/* DSACK blocks received */
+	TCP_NLA_REORD_SEEN,	/* reordering events seen */
+	TCP_NLA_SRTT,		/* smoothed RTT in usecs */
+	TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */
+	TCP_NLA_BYTES_NOTSENT,	/* Bytes in write queue not yet sent */
+	TCP_NLA_EDT,		/* Earliest departure time (CLOCK_MONOTONIC) */
+};
+
+/* for TCP_MD5SIG socket option */
+#define TCP_MD5SIG_MAXKEYLEN	80
+
+/* tcp_md5sig extension flags for TCP_MD5SIG_EXT */
+#define TCP_MD5SIG_FLAG_PREFIX		0x1	/* address prefix length */
+#define TCP_MD5SIG_FLAG_IFINDEX		0x2	/* ifindex set */
+
+struct tcp_md5sig {
+	struct __kernel_sockaddr_storage tcpm_addr;	/* address associated */
+	__u8	tcpm_flags;				/* extension flags */
+	__u8	tcpm_prefixlen;				/* address prefix */
+	__u16	tcpm_keylen;				/* key length */
+	int	tcpm_ifindex;				/* device index for scope */
+	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];		/* key (binary) */
+};
+
+/* INET_DIAG_MD5SIG */
+struct tcp_diag_md5sig {
+	__u8	tcpm_family;
+	__u8	tcpm_prefixlen;
+	__u16	tcpm_keylen;
+	__be32	tcpm_addr[4];
+	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
+};
+
+/* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
+
+#define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
+struct tcp_zerocopy_receive {
+	__u64 address;		/* in: address of mapping */
+	__u32 length;		/* in/out: number of bytes to map/mapped */
+	__u32 recv_skip_hint;	/* out: amount of bytes to skip */
+	__u32 inq; /* out: amount of bytes in read queue */
+	__s32 err; /* out: socket error */
+	__u64 copybuf_address;	/* in: copybuf address (small reads) */
+	__s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */
+	__u32 flags; /* in: flags */
+};
+#endif /* _UAPI_LINUX_TCP_H */
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index 9a8f47fc0b91..37c5494a0381 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2019 Facebook */
 
 #include <linux/err.h>
+#include <netinet/tcp.h>
 #include <test_progs.h>
 #include "bpf_dctcp.skel.h"
 #include "bpf_cubic.skel.h"
diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
index 9781d85cb223..e075d03ab630 100644
--- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
@@ -7,6 +7,7 @@
 #include <string.h>
 
 #include <linux/pkt_cls.h>
+#include <netinet/tcp.h>
 
 #include <test_progs.h>
 
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index 85f73261fab0..b8b48cac2ac3 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2020 Cloudflare
 #include <error.h>
+#include <netinet/tcp.h>
 
 #include "test_progs.h"
 #include "test_skmsg_load_helpers.skel.h"
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
index b25c9c45c148..d5b44b135c00 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
@@ -2,6 +2,12 @@
 #include <test_progs.h>
 #include "cgroup_helpers.h"
 
+#include <linux/tcp.h>
+
+#ifndef SOL_TCP
+#define SOL_TCP IPPROTO_TCP
+#endif
+
 #define SOL_CUSTOM			0xdeadbeef
 
 static int getsetsockopt(void)
@@ -11,6 +17,7 @@ static int getsetsockopt(void)
 		char u8[4];
 		__u32 u32;
 		char cc[16]; /* TCP_CA_NAME_MAX */
+		struct tcp_zerocopy_receive zc;
 	} buf = {};
 	socklen_t optlen;
 	char *big_buf = NULL;
@@ -154,6 +161,27 @@ static int getsetsockopt(void)
 		goto err;
 	}
 
+	/* TCP_ZEROCOPY_RECEIVE triggers */
+	memset(&buf, 0, sizeof(buf));
+	optlen = sizeof(buf.zc);
+	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
+	if (err) {
+		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
+			err, errno);
+		goto err;
+	}
+
+	memset(&buf, 0, sizeof(buf));
+	buf.zc.address = 12345; /* rejected by BPF */
+	optlen = sizeof(buf.zc);
+	errno = 0;
+	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
+	if (errno != EPERM) {
+		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
+			err, errno);
+		goto err;
+	}
+
 	free(big_buf);
 	close(fd);
 	return 0;
diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c
index 712df7b49cb1..d3597f81e6e9 100644
--- a/tools/testing/selftests/bpf/progs/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <string.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
+#include <linux/tcp.h>
 #include <linux/bpf.h>
+#include <netinet/in.h>
 #include <bpf/bpf_helpers.h>
 
 char _license[] SEC("license") = "GPL";
@@ -12,6 +12,10 @@ __u32 _version SEC("version") = 1;
 #define PAGE_SIZE 4096
 #endif
 
+#ifndef SOL_TCP
+#define SOL_TCP IPPROTO_TCP
+#endif
+
 #define SOL_CUSTOM			0xdeadbeef
 
 struct sockopt_sk {
@@ -57,6 +61,21 @@ int _getsockopt(struct bpf_sockopt *ctx)
 		return 1;
 	}
 
+	if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) {
+		/* Verify that TCP_ZEROCOPY_RECEIVE triggers.
+		 * It has a custom implementation for performance
+		 * reasons.
+		 */
+
+		if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end)
+			return 0; /* EPERM, bounds check */
+
+		if (((struct tcp_zerocopy_receive *)optval)->address != 0)
+			return 0; /* EPERM, unexpected data */
+
+		return 1;
+	}
+
 	if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
 		if (optval + 1 > optval_end)
 			return 0; /* EPERM, bounds check */
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index e49e2fdde942..f7c2fd89d01a 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -16,7 +16,6 @@ typedef __u16 __sum16;
 #include <linux/if_packet.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
-#include <netinet/tcp.h>
 #include <linux/filter.h>
 #include <linux/perf_event.h>
 #include <linux/socket.h>
-- 
cgit v1.2.3


From 20f2505fb436cfa674cf1f46aaa624f44d3d1d03 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Fri, 15 Jan 2021 08:35:00 -0800
Subject: bpf: Try to avoid kzalloc in cgroup/{s,g}etsockopt

When we attach a bpf program to cgroup/getsockopt any other getsockopt()
syscall starts incurring kzalloc/kfree cost.

Let add a small buffer on the stack and use it for small (majority)
{s,g}etsockopt values. The buffer is small enough to fit into
the cache line and cover the majority of simple options (most
of them are 4 byte ints).

It seems natural to do the same for setsockopt, but it's a bit more
involved when the BPF program modifies the data (where we have to
kmalloc). The assumption is that for the majority of setsockopt
calls (which are doing pure BPF options or apply policy) this
will bring some benefit as well.

Without this patch (we remove about 1% __kmalloc):
     3.38%     0.07%  tcp_mmap  [kernel.kallsyms]  [k] __cgroup_bpf_run_filter_getsockopt
            |
             --3.30%--__cgroup_bpf_run_filter_getsockopt
                       |
                        --0.81%--__kmalloc

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-3-sdf@google.com
---
 include/linux/filter.h |  5 +++++
 kernel/bpf/cgroup.c    | 52 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 50 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7fdce5407214..5b3137d7b690 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1298,6 +1298,11 @@ struct bpf_sysctl_kern {
 	u64 tmp_reg;
 };
 
+#define BPF_SOCKOPT_KERN_BUF_SIZE	32
+struct bpf_sockopt_buf {
+	u8		data[BPF_SOCKOPT_KERN_BUF_SIZE];
+};
+
 struct bpf_sockopt_kern {
 	struct sock	*sk;
 	u8		*optval;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 416e7738981b..ba8a1199d0ba 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1298,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
 	return empty;
 }
 
-static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
+			     struct bpf_sockopt_buf *buf)
 {
 	if (unlikely(max_optlen < 0))
 		return -EINVAL;
@@ -1310,6 +1311,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 		max_optlen = PAGE_SIZE;
 	}
 
+	if (max_optlen <= sizeof(buf->data)) {
+		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
+		 * bytes avoid the cost of kzalloc.
+		 */
+		ctx->optval = buf->data;
+		ctx->optval_end = ctx->optval + max_optlen;
+		return max_optlen;
+	}
+
 	ctx->optval = kzalloc(max_optlen, GFP_USER);
 	if (!ctx->optval)
 		return -ENOMEM;
@@ -1319,16 +1329,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
 	return max_optlen;
 }
 
-static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
+			     struct bpf_sockopt_buf *buf)
 {
+	if (ctx->optval == buf->data)
+		return;
 	kfree(ctx->optval);
 }
 
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
+				  struct bpf_sockopt_buf *buf)
+{
+	return ctx->optval != buf->data;
+}
+
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 				       int *optname, char __user *optval,
 				       int *optlen, char **kernel_optval)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_buf buf = {};
 	struct bpf_sockopt_kern ctx = {
 		.sk = sk,
 		.level = *level,
@@ -1350,7 +1370,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	 */
 	max_optlen = max_t(int, 16, *optlen);
 
-	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
 	if (max_optlen < 0)
 		return max_optlen;
 
@@ -1390,14 +1410,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 		 */
 		if (ctx.optlen != 0) {
 			*optlen = ctx.optlen;
-			*kernel_optval = ctx.optval;
+			/* We've used bpf_sockopt_kern->buf as an intermediary
+			 * storage, but the BPF program indicates that we need
+			 * to pass this data to the kernel setsockopt handler.
+			 * No way to export on-stack buf, have to allocate a
+			 * new buffer.
+			 */
+			if (!sockopt_buf_allocated(&ctx, &buf)) {
+				void *p = kmalloc(ctx.optlen, GFP_USER);
+
+				if (!p) {
+					ret = -ENOMEM;
+					goto out;
+				}
+				memcpy(p, ctx.optval, ctx.optlen);
+				*kernel_optval = p;
+			} else {
+				*kernel_optval = ctx.optval;
+			}
 			/* export and don't free sockopt buf */
 			return 0;
 		}
 	}
 
 out:
-	sockopt_free_buf(&ctx);
+	sockopt_free_buf(&ctx, &buf);
 	return ret;
 }
 
@@ -1407,6 +1444,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 				       int retval)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_buf buf = {};
 	struct bpf_sockopt_kern ctx = {
 		.sk = sk,
 		.level = level,
@@ -1425,7 +1463,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 
 	ctx.optlen = max_optlen;
 
-	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
 	if (max_optlen < 0)
 		return max_optlen;
 
@@ -1483,7 +1521,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	ret = ctx.retval;
 
 out:
-	sockopt_free_buf(&ctx);
+	sockopt_free_buf(&ctx, &buf);
 	return ret;
 }
 
-- 
cgit v1.2.3


From a9ed15dae0755a0368735e0556a462d8519bdb05 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Fri, 15 Jan 2021 08:35:01 -0800
Subject: bpf: Split cgroup_bpf_enabled per attach type

When we attach any cgroup hook, the rest (even if unused/unattached) start
to contribute small overhead. In particular, the one we want to avoid is
__cgroup_bpf_run_filter_skb which does two redirections to get to
the cgroup and pushes/pulls skb.

Let's split cgroup_bpf_enabled to be per-attach to make sure
only used attach types trigger.

I've dropped some existing high-level cgroup_bpf_enabled in some
places because BPF_PROG_CGROUP_XXX_RUN macros usually have another
cgroup_bpf_enabled check.

I also had to copy-paste BPF_CGROUP_RUN_SA_PROG_LOCK for
GETPEERNAME/GETSOCKNAME because type for cgroup_bpf_enabled[type]
has to be constant and known at compile time.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210115163501.805133-4-sdf@google.com
---
 include/linux/bpf-cgroup.h | 38 ++++++++++++++++++++------------------
 kernel/bpf/cgroup.c        | 14 ++++++--------
 net/ipv4/af_inet.c         |  9 +++++----
 net/ipv4/udp.c             |  7 +++----
 net/ipv6/af_inet6.c        |  9 +++++----
 net/ipv6/udp.c             |  7 +++----
 6 files changed, 42 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index bcb2915e6124..0748fd87969e 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -23,8 +23,8 @@ struct ctl_table_header;
 
 #ifdef CONFIG_CGROUP_BPF
 
-extern struct static_key_false cgroup_bpf_enabled_key;
-#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE];
+#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type])
 
 DECLARE_PER_CPU(struct bpf_cgroup_storage*,
 		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
@@ -189,7 +189,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
 	int __ret = 0;							      \
-	if (cgroup_bpf_enabled)						      \
+	if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS))		      \
 		__ret = __cgroup_bpf_run_filter_skb(sk, skb,		      \
 						    BPF_CGROUP_INET_INGRESS); \
 									      \
@@ -199,7 +199,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb)			       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		       \
+	if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \
 		typeof(sk) __sk = sk_to_full_sk(sk);			       \
 		if (sk_fullsock(__sk))					       \
 			__ret = __cgroup_bpf_run_filter_skb(__sk, skb,	       \
@@ -211,7 +211,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_SK_PROG(sk, type)				       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled) {					       \
+	if (cgroup_bpf_enabled(type)) {					       \
 		__ret = __cgroup_bpf_run_filter_sk(sk, type);		       \
 	}								       \
 	__ret;								       \
@@ -232,7 +232,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)				       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled)						       \
+	if (cgroup_bpf_enabled(type))					       \
 		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
 							  NULL);	       \
 	__ret;								       \
@@ -241,7 +241,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)		       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled)	{					       \
+	if (cgroup_bpf_enabled(type))	{				       \
 		lock_sock(sk);						       \
 		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
 							  t_ctx);	       \
@@ -256,8 +256,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr)			       \
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL)
 
-#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \
-					    sk->sk_prot->pre_connect)
+#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk)				       \
+	((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) ||		       \
+	  cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) &&		       \
+	 (sk)->sk_prot->pre_connect)
 
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)			       \
 	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
@@ -301,7 +303,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk)			\
 ({									\
 	int __ret = 0;							\
-	if (cgroup_bpf_enabled)						\
+	if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS))			\
 		__ret = __cgroup_bpf_run_filter_sock_ops(sk,		\
 							 sock_ops,	\
 							 BPF_CGROUP_SOCK_OPS); \
@@ -311,7 +313,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled && (sock_ops)->sk) {	       \
+	if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) {       \
 		typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk);	       \
 		if (__sk && sk_fullsock(__sk))				       \
 			__ret = __cgroup_bpf_run_filter_sock_ops(__sk,	       \
@@ -324,7 +326,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access)	      \
 ({									      \
 	int __ret = 0;							      \
-	if (cgroup_bpf_enabled)						      \
+	if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE))			      \
 		__ret = __cgroup_bpf_check_dev_permission(type, major, minor, \
 							  access,	      \
 							  BPF_CGROUP_DEVICE); \
@@ -336,7 +338,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos)  \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled)						       \
+	if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL))			       \
 		__ret = __cgroup_bpf_run_filter_sysctl(head, table, write,     \
 						       buf, count, pos,        \
 						       BPF_CGROUP_SYSCTL);     \
@@ -347,7 +349,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 				       kernel_optval)			       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled)						       \
+	if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT))			       \
 		__ret = __cgroup_bpf_run_filter_setsockopt(sock, level,	       \
 							   optname, optval,    \
 							   optlen,	       \
@@ -358,7 +360,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen)			       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled)						       \
+	if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))			       \
 		get_user(__ret, optlen);				       \
 	__ret;								       \
 })
@@ -367,7 +369,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 				       max_optlen, retval)		       \
 ({									       \
 	int __ret = retval;						       \
-	if (cgroup_bpf_enabled)						       \
+	if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))			       \
 		if (!(sock)->sk_prot->bpf_bypass_getsockopt ||		       \
 		    !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \
 					tcp_bpf_bypass_getsockopt,	       \
@@ -382,7 +384,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 					    optlen, retval)		       \
 ({									       \
 	int __ret = retval;						       \
-	if (cgroup_bpf_enabled)						       \
+	if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT))			       \
 		__ret = __cgroup_bpf_run_filter_getsockopt_kern(	       \
 			sock, level, optname, optval, optlen, retval);	       \
 	__ret;								       \
@@ -444,7 +446,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 	return 0;
 }
 
-#define cgroup_bpf_enabled (0)
+#define cgroup_bpf_enabled(type) (0)
 #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ba8a1199d0ba..da649f20d6b2 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -19,7 +19,7 @@
 
 #include "../cgroup/cgroup-internal.h"
 
-DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
 void cgroup_bpf_offline(struct cgroup *cgrp)
@@ -128,7 +128,7 @@ static void cgroup_bpf_release(struct work_struct *work)
 			if (pl->link)
 				bpf_cgroup_link_auto_detach(pl->link);
 			kfree(pl);
-			static_branch_dec(&cgroup_bpf_enabled_key);
+			static_branch_dec(&cgroup_bpf_enabled_key[type]);
 		}
 		old_array = rcu_dereference_protected(
 				cgrp->bpf.effective[type],
@@ -499,7 +499,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
 	if (old_prog)
 		bpf_prog_put(old_prog);
 	else
-		static_branch_inc(&cgroup_bpf_enabled_key);
+		static_branch_inc(&cgroup_bpf_enabled_key[type]);
 	bpf_cgroup_storages_link(new_storage, cgrp, type);
 	return 0;
 
@@ -698,7 +698,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 		cgrp->bpf.flags[type] = 0;
 	if (old_prog)
 		bpf_prog_put(old_prog);
-	static_branch_dec(&cgroup_bpf_enabled_key);
+	static_branch_dec(&cgroup_bpf_enabled_key[type]);
 	return 0;
 
 cleanup:
@@ -1360,8 +1360,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	 * attached to the hook so we don't waste time allocating
 	 * memory and locking the socket.
 	 */
-	if (!cgroup_bpf_enabled ||
-	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+	if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
 		return 0;
 
 	/* Allocate a bit more than the initial user buffer for
@@ -1457,8 +1456,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	 * attached to the hook so we don't waste time allocating
 	 * memory and locking the socket.
 	 */
-	if (!cgroup_bpf_enabled ||
-	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+	if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
 		return retval;
 
 	ctx.optlen = max_optlen;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b94fa8eb831b..6ba2930ff49b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -777,18 +777,19 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 			return -ENOTCONN;
 		sin->sin_port = inet->inet_dport;
 		sin->sin_addr.s_addr = inet->inet_daddr;
+		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+					    BPF_CGROUP_INET4_GETPEERNAME,
+					    NULL);
 	} else {
 		__be32 addr = inet->inet_rcv_saddr;
 		if (!addr)
 			addr = inet->inet_saddr;
 		sin->sin_port = inet->inet_sport;
 		sin->sin_addr.s_addr = addr;
-	}
-	if (cgroup_bpf_enabled)
 		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-					    peer ? BPF_CGROUP_INET4_GETPEERNAME :
-						   BPF_CGROUP_INET4_GETSOCKNAME,
+					    BPF_CGROUP_INET4_GETSOCKNAME,
 					    NULL);
+	}
 	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
 	return sizeof(*sin);
 }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 69ea76578abb..c67e483fce41 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1124,7 +1124,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		rcu_read_unlock();
 	}
 
-	if (cgroup_bpf_enabled && !connected) {
+	if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
 					    (struct sockaddr *)usin, &ipc.addr);
 		if (err)
@@ -1858,9 +1858,8 @@ try_again:
 		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
 		*addr_len = sizeof(*sin);
 
-		if (cgroup_bpf_enabled)
-			BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
-							(struct sockaddr *)sin);
+		BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+						      (struct sockaddr *)sin);
 	}
 
 	if (udp_sk(sk)->gro_enabled)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 8e9c3e9ea36e..b9c654836b72 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -527,18 +527,19 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		sin->sin6_addr = sk->sk_v6_daddr;
 		if (np->sndflow)
 			sin->sin6_flowinfo = np->flow_label;
+		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
+					    BPF_CGROUP_INET6_GETPEERNAME,
+					    NULL);
 	} else {
 		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
 			sin->sin6_addr = np->saddr;
 		else
 			sin->sin6_addr = sk->sk_v6_rcv_saddr;
 		sin->sin6_port = inet->inet_sport;
-	}
-	if (cgroup_bpf_enabled)
 		BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
-					    peer ? BPF_CGROUP_INET6_GETPEERNAME :
-						   BPF_CGROUP_INET6_GETSOCKNAME,
+					    BPF_CGROUP_INET6_GETSOCKNAME,
 					    NULL);
+	}
 	sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
 						 sk->sk_bound_dev_if);
 	return sizeof(*sin);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b9f3dfdd2383..a02ac875a923 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -409,9 +409,8 @@ try_again:
 		}
 		*addr_len = sizeof(*sin6);
 
-		if (cgroup_bpf_enabled)
-			BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
-						(struct sockaddr *)sin6);
+		BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+						      (struct sockaddr *)sin6);
 	}
 
 	if (udp_sk(sk)->gro_enabled)
@@ -1462,7 +1461,7 @@ do_udp_sendmsg:
 		fl6.saddr = np->saddr;
 	fl6.fl6_sport = inet->inet_sport;
 
-	if (cgroup_bpf_enabled && !connected) {
+	if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
 					   (struct sockaddr *)sin6, &fl6.saddr);
 		if (err)
-- 
cgit v1.2.3


From 3b830a9c34d5897be07176ce4e6f2d75e2c8cfd7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 18 Jan 2021 13:31:30 -0800
Subject: tty: convert tty_ldisc_ops 'read()' function to take a kernel pointer

The tty line discipline .read() function was passed the final user
pointer destination as an argument, which doesn't match the 'write()'
function, and makes it very inconvenient to do a splice method for
ttys.

This is a conversion to use a kernel buffer instead.

NOTE! It does this by passing the tty line discipline ->read() function
an additional "cookie" to fill in, and an offset into the cookie data.

The line discipline can fill in the cookie data with its own private
information, and then the reader will repeat the read until either the
cookie is cleared or it runs out of data.

The only real user of this is N_HDLC, which can use this to handle big
packets, even if the kernel buffer is smaller than the whole packet.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/bluetooth/hci_ldisc.c | 34 +++++++++---------
 drivers/input/serio/serport.c |  4 ++-
 drivers/net/ppp/ppp_async.c   |  3 +-
 drivers/net/ppp/ppp_synctty.c |  3 +-
 drivers/tty/n_gsm.c           |  3 +-
 drivers/tty/n_hdlc.c          | 60 +++++++++++++++++++++----------
 drivers/tty/n_null.c          |  3 +-
 drivers/tty/n_r3964.c         | 10 +++---
 drivers/tty/n_tracerouter.c   |  4 ++-
 drivers/tty/n_tracesink.c     |  4 ++-
 drivers/tty/n_tty.c           | 82 ++++++++++++++++++-------------------------
 drivers/tty/tty_io.c          | 64 +++++++++++++++++++++++++++++++--
 include/linux/tty_ldisc.h     |  3 +-
 net/nfc/nci/uart.c            |  3 +-
 14 files changed, 178 insertions(+), 102 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index f83d67eafc9f..dd92aea15b8b 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -802,7 +802,8 @@ static int hci_uart_tty_ioctl(struct tty_struct *tty, struct file *file,
  * We don't provide read/write/poll interface for user space.
  */
 static ssize_t hci_uart_tty_read(struct tty_struct *tty, struct file *file,
-				 unsigned char __user *buf, size_t nr)
+				 unsigned char *buf, size_t nr,
+				 void **cookie, unsigned long offset)
 {
 	return 0;
 }
@@ -819,29 +820,28 @@ static __poll_t hci_uart_tty_poll(struct tty_struct *tty,
 	return 0;
 }
 
+static struct tty_ldisc_ops hci_uart_ldisc = {
+	.owner		= THIS_MODULE,
+	.magic		= TTY_LDISC_MAGIC,
+	.name		= "n_hci",
+	.open		= hci_uart_tty_open,
+	.close		= hci_uart_tty_close,
+	.read		= hci_uart_tty_read,
+	.write		= hci_uart_tty_write,
+	.ioctl		= hci_uart_tty_ioctl,
+	.compat_ioctl	= hci_uart_tty_ioctl,
+	.poll		= hci_uart_tty_poll,
+	.receive_buf	= hci_uart_tty_receive,
+	.write_wakeup	= hci_uart_tty_wakeup,
+};
+
 static int __init hci_uart_init(void)
 {
-	static struct tty_ldisc_ops hci_uart_ldisc;
 	int err;
 
 	BT_INFO("HCI UART driver ver %s", VERSION);
 
 	/* Register the tty discipline */
-
-	memset(&hci_uart_ldisc, 0, sizeof(hci_uart_ldisc));
-	hci_uart_ldisc.magic		= TTY_LDISC_MAGIC;
-	hci_uart_ldisc.name		= "n_hci";
-	hci_uart_ldisc.open		= hci_uart_tty_open;
-	hci_uart_ldisc.close		= hci_uart_tty_close;
-	hci_uart_ldisc.read		= hci_uart_tty_read;
-	hci_uart_ldisc.write		= hci_uart_tty_write;
-	hci_uart_ldisc.ioctl		= hci_uart_tty_ioctl;
-	hci_uart_ldisc.compat_ioctl	= hci_uart_tty_ioctl;
-	hci_uart_ldisc.poll		= hci_uart_tty_poll;
-	hci_uart_ldisc.receive_buf	= hci_uart_tty_receive;
-	hci_uart_ldisc.write_wakeup	= hci_uart_tty_wakeup;
-	hci_uart_ldisc.owner		= THIS_MODULE;
-
 	err = tty_register_ldisc(N_HCI, &hci_uart_ldisc);
 	if (err) {
 		BT_ERR("HCI line discipline registration failed. (%d)", err);
diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c
index 8ac970a423de..33e9d9bfd036 100644
--- a/drivers/input/serio/serport.c
+++ b/drivers/input/serio/serport.c
@@ -156,7 +156,9 @@ out:
  * returning 0 characters.
  */
 
-static ssize_t serport_ldisc_read(struct tty_struct * tty, struct file * file, unsigned char __user * buf, size_t nr)
+static ssize_t serport_ldisc_read(struct tty_struct * tty, struct file * file,
+				  unsigned char *kbuf, size_t nr,
+				  void **cookie, unsigned long offset)
 {
 	struct serport *serport = (struct serport*) tty->disc_data;
 	struct serio *serio;
diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c
index 29a0917a81e6..f14a9d190de9 100644
--- a/drivers/net/ppp/ppp_async.c
+++ b/drivers/net/ppp/ppp_async.c
@@ -259,7 +259,8 @@ static int ppp_asynctty_hangup(struct tty_struct *tty)
  */
 static ssize_t
 ppp_asynctty_read(struct tty_struct *tty, struct file *file,
-		  unsigned char __user *buf, size_t count)
+		  unsigned char *buf, size_t count,
+		  void **cookie, unsigned long offset)
 {
 	return -EAGAIN;
 }
diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c
index 0f338752c38b..f774b7e52da4 100644
--- a/drivers/net/ppp/ppp_synctty.c
+++ b/drivers/net/ppp/ppp_synctty.c
@@ -257,7 +257,8 @@ static int ppp_sync_hangup(struct tty_struct *tty)
  */
 static ssize_t
 ppp_sync_read(struct tty_struct *tty, struct file *file,
-	       unsigned char __user *buf, size_t count)
+	      unsigned char *buf, size_t count,
+	      void **cookie, unsigned long offset)
 {
 	return -EAGAIN;
 }
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 25f3152089c2..fea1eeac5b90 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -2557,7 +2557,8 @@ static void gsmld_write_wakeup(struct tty_struct *tty)
  */
 
 static ssize_t gsmld_read(struct tty_struct *tty, struct file *file,
-			 unsigned char __user *buf, size_t nr)
+			  unsigned char *buf, size_t nr,
+			  void **cookie, unsigned long offset)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c
index 12557ee1edb6..1363e659dc1d 100644
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -416,13 +416,19 @@ static void n_hdlc_tty_receive(struct tty_struct *tty, const __u8 *data,
  * Returns the number of bytes returned or error code.
  */
 static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file,
-			   __u8 __user *buf, size_t nr)
+			   __u8 *kbuf, size_t nr,
+			   void **cookie, unsigned long offset)
 {
 	struct n_hdlc *n_hdlc = tty->disc_data;
 	int ret = 0;
 	struct n_hdlc_buf *rbuf;
 	DECLARE_WAITQUEUE(wait, current);
 
+	/* Is this a repeated call for an rbuf we already found earlier? */
+	rbuf = *cookie;
+	if (rbuf)
+		goto have_rbuf;
+
 	add_wait_queue(&tty->read_wait, &wait);
 
 	for (;;) {
@@ -436,25 +442,8 @@ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file,
 		set_current_state(TASK_INTERRUPTIBLE);
 
 		rbuf = n_hdlc_buf_get(&n_hdlc->rx_buf_list);
-		if (rbuf) {
-			if (rbuf->count > nr) {
-				/* too large for caller's buffer */
-				ret = -EOVERFLOW;
-			} else {
-				__set_current_state(TASK_RUNNING);
-				if (copy_to_user(buf, rbuf->buf, rbuf->count))
-					ret = -EFAULT;
-				else
-					ret = rbuf->count;
-			}
-
-			if (n_hdlc->rx_free_buf_list.count >
-			    DEFAULT_RX_BUF_COUNT)
-				kfree(rbuf);
-			else
-				n_hdlc_buf_put(&n_hdlc->rx_free_buf_list, rbuf);
+		if (rbuf)
 			break;
-		}
 
 		/* no data */
 		if (tty_io_nonblock(tty, file)) {
@@ -473,6 +462,39 @@ static ssize_t n_hdlc_tty_read(struct tty_struct *tty, struct file *file,
 	remove_wait_queue(&tty->read_wait, &wait);
 	__set_current_state(TASK_RUNNING);
 
+	if (!rbuf)
+		return ret;
+	*cookie = rbuf;
+
+have_rbuf:
+	/* Have we used it up entirely? */
+	if (offset >= rbuf->count)
+		goto done_with_rbuf;
+
+	/* More data to go, but can't copy any more? EOVERFLOW */
+	ret = -EOVERFLOW;
+	if (!nr)
+		goto done_with_rbuf;
+
+	/* Copy as much data as possible */
+	ret = rbuf->count - offset;
+	if (ret > nr)
+		ret = nr;
+	memcpy(kbuf, rbuf->buf+offset, ret);
+	offset += ret;
+
+	/* If we still have data left, we leave the rbuf in the cookie */
+	if (offset < rbuf->count)
+		return ret;
+
+done_with_rbuf:
+	*cookie = NULL;
+
+	if (n_hdlc->rx_free_buf_list.count > DEFAULT_RX_BUF_COUNT)
+		kfree(rbuf);
+	else
+		n_hdlc_buf_put(&n_hdlc->rx_free_buf_list, rbuf);
+
 	return ret;
 
 }	/* end of n_hdlc_tty_read() */
diff --git a/drivers/tty/n_null.c b/drivers/tty/n_null.c
index 96feabae4740..ce03ae78f5c6 100644
--- a/drivers/tty/n_null.c
+++ b/drivers/tty/n_null.c
@@ -20,7 +20,8 @@ static void n_null_close(struct tty_struct *tty)
 }
 
 static ssize_t n_null_read(struct tty_struct *tty, struct file *file,
-			   unsigned char __user * buf, size_t nr)
+			   unsigned char *buf, size_t nr,
+			   void **cookie, unsigned long offset)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/drivers/tty/n_r3964.c b/drivers/tty/n_r3964.c
index 934dd2fb2ec8..3161f0a535e3 100644
--- a/drivers/tty/n_r3964.c
+++ b/drivers/tty/n_r3964.c
@@ -129,7 +129,7 @@ static void remove_client_block(struct r3964_info *pInfo,
 static int r3964_open(struct tty_struct *tty);
 static void r3964_close(struct tty_struct *tty);
 static ssize_t r3964_read(struct tty_struct *tty, struct file *file,
-		unsigned char __user * buf, size_t nr);
+		void *cookie, unsigned char *buf, size_t nr);
 static ssize_t r3964_write(struct tty_struct *tty, struct file *file,
 		const unsigned char *buf, size_t nr);
 static int r3964_ioctl(struct tty_struct *tty, struct file *file,
@@ -1058,7 +1058,8 @@ static void r3964_close(struct tty_struct *tty)
 }
 
 static ssize_t r3964_read(struct tty_struct *tty, struct file *file,
-			  unsigned char __user * buf, size_t nr)
+			  unsigned char *kbuf, size_t nr,
+			  void **cookie, unsigned long offset)
 {
 	struct r3964_info *pInfo = tty->disc_data;
 	struct r3964_client_info *pClient;
@@ -1109,10 +1110,7 @@ static ssize_t r3964_read(struct tty_struct *tty, struct file *file,
 		kfree(pMsg);
 		TRACE_M("r3964_read - msg kfree %p", pMsg);
 
-		if (copy_to_user(buf, &theMsg, ret)) {
-			ret = -EFAULT;
-			goto unlock;
-		}
+		memcpy(kbuf, &theMsg, ret);
 
 		TRACE_PS("read - return %d", ret);
 		goto unlock;
diff --git a/drivers/tty/n_tracerouter.c b/drivers/tty/n_tracerouter.c
index 4479af4d2fa5..3490ed51b1a3 100644
--- a/drivers/tty/n_tracerouter.c
+++ b/drivers/tty/n_tracerouter.c
@@ -118,7 +118,9 @@ static void n_tracerouter_close(struct tty_struct *tty)
  *	 -EINVAL
  */
 static ssize_t n_tracerouter_read(struct tty_struct *tty, struct file *file,
-				  unsigned char __user *buf, size_t nr) {
+				  unsigned char *buf, size_t nr,
+				  void **cookie, unsigned long offset)
+{
 	return -EINVAL;
 }
 
diff --git a/drivers/tty/n_tracesink.c b/drivers/tty/n_tracesink.c
index d96ba82cc356..1d9931041fd8 100644
--- a/drivers/tty/n_tracesink.c
+++ b/drivers/tty/n_tracesink.c
@@ -115,7 +115,9 @@ static void n_tracesink_close(struct tty_struct *tty)
  *	 -EINVAL
  */
 static ssize_t n_tracesink_read(struct tty_struct *tty, struct file *file,
-				unsigned char __user *buf, size_t nr) {
+				unsigned char *buf, size_t nr,
+				void **cookie, unsigned long offset)
+{
 	return -EINVAL;
 }
 
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 7e5e36315260..4a34a9f43b29 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -164,29 +164,24 @@ static void zero_buffer(struct tty_struct *tty, u8 *buffer, int size)
 		memset(buffer, 0x00, size);
 }
 
-static int tty_copy_to_user(struct tty_struct *tty, void __user *to,
-			    size_t tail, size_t n)
+static void tty_copy(struct tty_struct *tty, void *to, size_t tail, size_t n)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	size_t size = N_TTY_BUF_SIZE - tail;
 	void *from = read_buf_addr(ldata, tail);
-	int uncopied;
 
 	if (n > size) {
 		tty_audit_add_data(tty, from, size);
-		uncopied = copy_to_user(to, from, size);
-		zero_buffer(tty, from, size - uncopied);
-		if (uncopied)
-			return uncopied;
+		memcpy(to, from, size);
+		zero_buffer(tty, from, size);
 		to += size;
 		n -= size;
 		from = ldata->read_buf;
 	}
 
 	tty_audit_add_data(tty, from, n);
-	uncopied = copy_to_user(to, from, n);
-	zero_buffer(tty, from, n - uncopied);
-	return uncopied;
+	memcpy(to, from, n);
+	zero_buffer(tty, from, n);
 }
 
 /**
@@ -1942,15 +1937,16 @@ static inline int input_available_p(struct tty_struct *tty, int poll)
 /**
  *	copy_from_read_buf	-	copy read data directly
  *	@tty: terminal device
- *	@b: user data
+ *	@kbp: data
  *	@nr: size of data
  *
  *	Helper function to speed up n_tty_read.  It is only called when
- *	ICANON is off; it copies characters straight from the tty queue to
- *	user space directly.  It can be profitably called twice; once to
- *	drain the space from the tail pointer to the (physical) end of the
- *	buffer, and once to drain the space from the (physical) beginning of
- *	the buffer to head pointer.
+ *	ICANON is off; it copies characters straight from the tty queue.
+ *
+ *	It can be profitably called twice; once to drain the space from
+ *	the tail pointer to the (physical) end of the buffer, and once
+ *	to drain the space from the (physical) beginning of the buffer
+ *	to head pointer.
  *
  *	Called under the ldata->atomic_read_lock sem
  *
@@ -1960,7 +1956,7 @@ static inline int input_available_p(struct tty_struct *tty, int poll)
  */
 
 static int copy_from_read_buf(struct tty_struct *tty,
-				      unsigned char __user **b,
+				      unsigned char **kbp,
 				      size_t *nr)
 
 {
@@ -1976,8 +1972,7 @@ static int copy_from_read_buf(struct tty_struct *tty,
 	n = min(*nr, n);
 	if (n) {
 		unsigned char *from = read_buf_addr(ldata, tail);
-		retval = copy_to_user(*b, from, n);
-		n -= retval;
+		memcpy(*kbp, from, n);
 		is_eof = n == 1 && *from == EOF_CHAR(tty);
 		tty_audit_add_data(tty, from, n);
 		zero_buffer(tty, from, n);
@@ -1986,7 +1981,7 @@ static int copy_from_read_buf(struct tty_struct *tty,
 		if (L_EXTPROC(tty) && ldata->icanon && is_eof &&
 		    (head == ldata->read_tail))
 			n = 0;
-		*b += n;
+		*kbp += n;
 		*nr -= n;
 	}
 	return retval;
@@ -1995,12 +1990,12 @@ static int copy_from_read_buf(struct tty_struct *tty,
 /**
  *	canon_copy_from_read_buf	-	copy read data in canonical mode
  *	@tty: terminal device
- *	@b: user data
+ *	@kbp: data
  *	@nr: size of data
  *
  *	Helper function for n_tty_read.  It is only called when ICANON is on;
  *	it copies one line of input up to and including the line-delimiting
- *	character into the user-space buffer.
+ *	character into the result buffer.
  *
  *	NB: When termios is changed from non-canonical to canonical mode and
  *	the read buffer contains data, n_tty_set_termios() simulates an EOF
@@ -2016,14 +2011,14 @@ static int copy_from_read_buf(struct tty_struct *tty,
  */
 
 static int canon_copy_from_read_buf(struct tty_struct *tty,
-				    unsigned char __user **b,
+				    unsigned char **kbp,
 				    size_t *nr)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	size_t n, size, more, c;
 	size_t eol;
 	size_t tail;
-	int ret, found = 0;
+	int found = 0;
 
 	/* N.B. avoid overrun if nr == 0 */
 	if (!*nr)
@@ -2059,10 +2054,8 @@ static int canon_copy_from_read_buf(struct tty_struct *tty,
 	n_tty_trace("%s: eol:%zu found:%d n:%zu c:%zu tail:%zu more:%zu\n",
 		    __func__, eol, found, n, c, tail, more);
 
-	ret = tty_copy_to_user(tty, *b, tail, n);
-	if (ret)
-		return -EFAULT;
-	*b += n;
+	tty_copy(tty, *kbp, tail, n);
+	*kbp += n;
 	*nr -= n;
 
 	if (found)
@@ -2130,10 +2123,11 @@ static int job_control(struct tty_struct *tty, struct file *file)
  */
 
 static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
-			 unsigned char __user *buf, size_t nr)
+			  unsigned char *kbuf, size_t nr,
+			  void **cookie, unsigned long offset)
 {
 	struct n_tty_data *ldata = tty->disc_data;
-	unsigned char __user *b = buf;
+	unsigned char *kb = kbuf;
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	int c;
 	int minimum, time;
@@ -2179,17 +2173,13 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 		/* First test for status change. */
 		if (packet && tty->link->ctrl_status) {
 			unsigned char cs;
-			if (b != buf)
+			if (kb != kbuf)
 				break;
 			spin_lock_irq(&tty->link->ctrl_lock);
 			cs = tty->link->ctrl_status;
 			tty->link->ctrl_status = 0;
 			spin_unlock_irq(&tty->link->ctrl_lock);
-			if (put_user(cs, b)) {
-				retval = -EFAULT;
-				break;
-			}
-			b++;
+			*kb++ = cs;
 			nr--;
 			break;
 		}
@@ -2232,24 +2222,20 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 		}
 
 		if (ldata->icanon && !L_EXTPROC(tty)) {
-			retval = canon_copy_from_read_buf(tty, &b, &nr);
+			retval = canon_copy_from_read_buf(tty, &kb, &nr);
 			if (retval)
 				break;
 		} else {
 			int uncopied;
 
 			/* Deal with packet mode. */
-			if (packet && b == buf) {
-				if (put_user(TIOCPKT_DATA, b)) {
-					retval = -EFAULT;
-					break;
-				}
-				b++;
+			if (packet && kb == kbuf) {
+				*kb++ = TIOCPKT_DATA;
 				nr--;
 			}
 
-			uncopied = copy_from_read_buf(tty, &b, &nr);
-			uncopied += copy_from_read_buf(tty, &b, &nr);
+			uncopied = copy_from_read_buf(tty, &kb, &nr);
+			uncopied += copy_from_read_buf(tty, &kb, &nr);
 			if (uncopied) {
 				retval = -EFAULT;
 				break;
@@ -2258,7 +2244,7 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 
 		n_tty_check_unthrottle(tty);
 
-		if (b - buf >= minimum)
+		if (kb - kbuf >= minimum)
 			break;
 		if (time)
 			timeout = time;
@@ -2270,8 +2256,8 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
 	remove_wait_queue(&tty->read_wait, &wait);
 	mutex_unlock(&ldata->atomic_read_lock);
 
-	if (b - buf)
-		retval = b - buf;
+	if (kb - kbuf)
+		retval = kb - kbuf;
 
 	return retval;
 }
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 338bc4ef5549..a34f8bcf875e 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -833,6 +833,65 @@ static void tty_update_time(struct timespec64 *time)
 		time->tv_sec = sec;
 }
 
+/*
+ * Iterate on the ldisc ->read() function until we've gotten all
+ * the data the ldisc has for us.
+ *
+ * The "cookie" is something that the ldisc read function can fill
+ * in to let us know that there is more data to be had.
+ *
+ * We promise to continue to call the ldisc until it stops returning
+ * data or clears the cookie. The cookie may be something that the
+ * ldisc maintains state for and needs to free.
+ */
+static int iterate_tty_read(struct tty_ldisc *ld, struct tty_struct *tty, struct file *file,
+		char __user *buf, size_t count)
+{
+	int retval = 0;
+	void *cookie = NULL;
+	unsigned long offset = 0;
+	char kernel_buf[64];
+
+	do {
+		int size, uncopied;
+
+		size = count > sizeof(kernel_buf) ? sizeof(kernel_buf) : count;
+		size = ld->ops->read(tty, file, kernel_buf, size, &cookie, offset);
+		if (!size)
+			break;
+
+		/*
+		 * A ldisc read error return will override any previously copied
+		 * data (eg -EOVERFLOW from HDLC)
+		 */
+		if (size < 0) {
+			memzero_explicit(kernel_buf, sizeof(kernel_buf));
+			return size;
+		}
+
+		uncopied = copy_to_user(buf+offset, kernel_buf, size);
+		size -= uncopied;
+		offset += size;
+		count -= size;
+
+		/*
+		 * If the user copy failed, we still need to do another ->read()
+		 * call if we had a cookie to let the ldisc clear up.
+		 *
+		 * But make sure size is zeroed.
+		 */
+		if (unlikely(uncopied)) {
+			count = 0;
+			retval = -EFAULT;
+		}
+	} while (cookie);
+
+	/* We always clear tty buffer in case they contained passwords */
+	memzero_explicit(kernel_buf, sizeof(kernel_buf));
+	return offset ? offset : retval;
+}
+
+
 /**
  *	tty_read	-	read method for tty device files
  *	@file: pointer to tty file
@@ -866,10 +925,9 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count,
 	ld = tty_ldisc_ref_wait(tty);
 	if (!ld)
 		return hung_up_tty_read(file, buf, count, ppos);
+	i = -EIO;
 	if (ld->ops->read)
-		i = ld->ops->read(tty, file, buf, count);
-	else
-		i = -EIO;
+		i = iterate_tty_read(ld, tty, file, buf, count);
 	tty_ldisc_deref(ld);
 
 	if (i > 0)
diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index b1e6043e9917..572a07976116 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -185,7 +185,8 @@ struct tty_ldisc_ops {
 	void	(*close)(struct tty_struct *);
 	void	(*flush_buffer)(struct tty_struct *tty);
 	ssize_t	(*read)(struct tty_struct *tty, struct file *file,
-			unsigned char __user *buf, size_t nr);
+			unsigned char *buf, size_t nr,
+			void **cookie, unsigned long offset);
 	ssize_t	(*write)(struct tty_struct *tty, struct file *file,
 			 const unsigned char *buf, size_t nr);
 	int	(*ioctl)(struct tty_struct *tty, struct file *file,
diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index 11b554ce07ff..1204c438e87d 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -292,7 +292,8 @@ static int nci_uart_tty_ioctl(struct tty_struct *tty, struct file *file,
 
 /* We don't provide read/write/poll interface for user space. */
 static ssize_t nci_uart_tty_read(struct tty_struct *tty, struct file *file,
-				 unsigned char __user *buf, size_t nr)
+				 unsigned char *buf, size_t nr,
+				 void **cookie, unsigned long offset)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 7baf2429a1a965369b0ce44efb6315cdd515aa9c Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Tue, 19 Jan 2021 16:31:50 +0800
Subject: net/sched: cls_flower add CT_FLAGS_INVALID flag support

This patch add the TCA_FLOWER_KEY_CT_FLAGS_INVALID flag to
match the ct_state with invalid for conntrack.

Signed-off-by: wenxu <wenxu@ucloud.cn>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Link: https://lore.kernel.org/r/1611045110-682-1-git-send-email-wenxu@ucloud.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h       |  4 ++--
 include/net/sch_generic.h    |  1 +
 include/uapi/linux/pkt_cls.h |  1 +
 net/core/dev.c               |  2 ++
 net/core/flow_dissector.c    | 13 +++++++++----
 net/sched/act_ct.c           |  1 +
 net/sched/cls_flower.c       |  4 +++-
 7 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 46f901adf1a8..186dad231e30 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1353,8 +1353,8 @@ void
 skb_flow_dissect_ct(const struct sk_buff *skb,
 		    struct flow_dissector *flow_dissector,
 		    void *target_container,
-		    u16 *ctinfo_map,
-		    size_t mapsize);
+		    u16 *ctinfo_map, size_t mapsize,
+		    bool post_ct);
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 639e465a108f..e7bee99aebce 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -388,6 +388,7 @@ struct qdisc_skb_cb {
 #define QDISC_CB_PRIV_LEN 20
 	unsigned char		data[QDISC_CB_PRIV_LEN];
 	u16			mru;
+	bool			post_ct;
 };
 
 typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index ee95f42fb0ec..709668e264b0 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -591,6 +591,7 @@ enum {
 	TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */
 	TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */
 	TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */
+	TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */
 };
 
 enum {
diff --git a/net/core/dev.c b/net/core/dev.c
index 00f970ba0248..d9ce02e95992 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3878,6 +3878,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 
 	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 	qdisc_skb_cb(skb)->mru = 0;
+	qdisc_skb_cb(skb)->post_ct = false;
 	mini_qdisc_bstats_cpu_update(miniq, skb);
 
 	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
@@ -4960,6 +4961,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 
 	qdisc_skb_cb(skb)->pkt_len = skb->len;
 	qdisc_skb_cb(skb)->mru = 0;
+	qdisc_skb_cb(skb)->post_ct = false;
 	skb->tc_at_ingress = 1;
 	mini_qdisc_bstats_cpu_update(miniq, skb);
 
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 2d70ded389ae..c565c7a17091 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -237,9 +237,8 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
 void
 skb_flow_dissect_ct(const struct sk_buff *skb,
 		    struct flow_dissector *flow_dissector,
-		    void *target_container,
-		    u16 *ctinfo_map,
-		    size_t mapsize)
+		    void *target_container, u16 *ctinfo_map,
+		    size_t mapsize, bool post_ct)
 {
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	struct flow_dissector_key_ct *key;
@@ -251,13 +250,19 @@ skb_flow_dissect_ct(const struct sk_buff *skb,
 		return;
 
 	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct)
+	if (!ct && !post_ct)
 		return;
 
 	key = skb_flow_dissector_target(flow_dissector,
 					FLOW_DISSECTOR_KEY_CT,
 					target_container);
 
+	if (!ct) {
+		key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+				TCA_FLOWER_KEY_CT_FLAGS_INVALID;
+		return;
+	}
+
 	if (ctinfo < mapsize)
 		key->ct_state = ctinfo_map[ctinfo];
 #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 83a5c6722a06..b3442078aabc 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1030,6 +1030,7 @@ out_push:
 
 out:
 	tcf_action_update_bstats(&c->common, skb);
+	qdisc_skb_cb(skb)->post_ct = true;
 	if (defrag)
 		qdisc_skb_cb(skb)->pkt_len = skb->len;
 	return retval;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 84f932532db7..4a9297a89c77 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -302,6 +302,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
 	struct cls_fl_head *head = rcu_dereference_bh(tp->root);
+	bool post_ct = qdisc_skb_cb(skb)->post_ct;
 	struct fl_flow_key skb_key;
 	struct fl_flow_mask *mask;
 	struct cls_fl_filter *f;
@@ -318,7 +319,8 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
 		skb_flow_dissect_ct(skb, &mask->dissector, &skb_key,
 				    fl_ct_info_to_flower_map,
-				    ARRAY_SIZE(fl_ct_info_to_flower_map));
+				    ARRAY_SIZE(fl_ct_info_to_flower_map),
+				    post_ct);
 		skb_flow_dissect_hash(skb, &mask->dissector, &skb_key);
 		skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
 
-- 
cgit v1.2.3


From fcba4b2047a31a55e1cef73849363a3cf7c2736d Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Tue, 5 Jan 2021 17:44:35 +0100
Subject: mhi: unconstify mhi_event_config

Some parameters may have to be determined at runtime.
It is the case for the event ring MSI vector.

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 54afcae1709a..48f5c9168ae0 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -279,7 +279,7 @@ struct mhi_controller_config {
 	u32 num_channels;
 	const struct mhi_channel_config *ch_cfg;
 	u32 num_events;
-	const struct mhi_event_config *event_cfg;
+	struct mhi_event_config *event_cfg;
 	bool use_bounce_buf;
 	bool m2_no_db;
 };
-- 
cgit v1.2.3


From 6ffcc18d9c0b4522c95aab71ff3ff5a56e699945 Mon Sep 17 00:00:00 2001
From: Carl Huang <cjhuang@codeaurora.org>
Date: Mon, 4 Jan 2021 18:11:28 +0800
Subject: mhi: use irq_flags if controller driver configures it

If controller driver has specified the irq_flags, mhi uses this specified
irq_flags. Otherwise, mhi uses default irq_flags.

The purpose of this change is to support one MSI vector for QCA6390.
MHI will use one same MSI vector too in this scenario.

In case of one MSI vector, IRQ_NO_BALANCING is needed when irq handler
is requested. The reason is if irq migration happens, the msi_data may
change too. However, the msi_data is already programmed to QCA6390
hardware during initialization phase. This msi_data inconsistence will
result in crash in kernel.

Another issue is in case of one MSI vector, IRQF_NO_SUSPEND will trigger
WARNINGS because QCA6390 wants to disable the IRQ during the suspend.

To avoid above two issues, QCA6390 driver specifies the irq_flags in case
of one MSI vector when mhi_register_controller is called.

Signed-off-by: Carl Huang <cjhuang@codeaurora.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/core/init.c | 9 +++++++--
 include/linux/mhi.h         | 2 ++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c
index f0697f433c2f..aa575d3fb3ae 100644
--- a/drivers/bus/mhi/core/init.c
+++ b/drivers/bus/mhi/core/init.c
@@ -151,12 +151,17 @@ int mhi_init_irq_setup(struct mhi_controller *mhi_cntrl)
 {
 	struct mhi_event *mhi_event = mhi_cntrl->mhi_event;
 	struct device *dev = &mhi_cntrl->mhi_dev->dev;
+	unsigned long irq_flags = IRQF_SHARED | IRQF_NO_SUSPEND;
 	int i, ret;
 
+	/* if controller driver has set irq_flags, use it */
+	if (mhi_cntrl->irq_flags)
+		irq_flags = mhi_cntrl->irq_flags;
+
 	/* Setup BHI_INTVEC IRQ */
 	ret = request_threaded_irq(mhi_cntrl->irq[0], mhi_intvec_handler,
 				   mhi_intvec_threaded_handler,
-				   IRQF_SHARED | IRQF_NO_SUSPEND,
+				   irq_flags,
 				   "bhi", mhi_cntrl);
 	if (ret)
 		return ret;
@@ -174,7 +179,7 @@ int mhi_init_irq_setup(struct mhi_controller *mhi_cntrl)
 
 		ret = request_irq(mhi_cntrl->irq[mhi_event->irq],
 				  mhi_irq_handler,
-				  IRQF_SHARED | IRQF_NO_SUSPEND,
+				  irq_flags,
 				  "mhi", mhi_event);
 		if (ret) {
 			dev_err(dev, "Error requesting irq:%d for ev:%d\n",
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 562862ff819c..b7138127664f 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -353,6 +353,7 @@ struct mhi_controller_config {
  * @fbc_download: MHI host needs to do complete image transfer (optional)
  * @pre_init: MHI host needs to do pre-initialization before power up
  * @wake_set: Device wakeup set flag
+ * @irq_flags: irq flags passed to request_irq (optional)
  *
  * Fields marked as (required) need to be populated by the controller driver
  * before calling mhi_register_controller(). For the fields marked as (optional)
@@ -444,6 +445,7 @@ struct mhi_controller {
 	bool fbc_download;
 	bool pre_init;
 	bool wake_set;
+	unsigned long irq_flags;
 };
 
 /**
-- 
cgit v1.2.3


From 0908c5aca31eb5e0c72d7a5dba422629b88e877d Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso@google.com>
Date: Thu, 14 Jan 2021 22:50:51 +0800
Subject: usb: typec: tcpm: AMS and Collision Avoidance

This patch provides the implementation of Collision Avoidance introduced
in PD3.0. The start of each Atomic Message Sequence (AMS) initiated by
the port will be denied if the current AMS is not interruptible. The
Source port will set the CC to SinkTxNG if it is going to initiate an
AMS, and SinkTxOk otherwise. Meanwhile, any AMS initiated by a Sink port
will be denied in TCPM if the port partner (Source) sets SinkTxNG except
for HARD_RESET and SOFT_RESET.

Tested-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Kyle Tso <kyletso@google.com>
Link: https://lore.kernel.org/r/20210114145053.1952756-2-kyletso@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpm.c | 620 +++++++++++++++++++++++++++++++++++-------
 include/linux/usb/pd.h        |   1 +
 include/linux/usb/tcpm.h      |   4 +
 3 files changed, 527 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 22a85b396f69..2b16d2764092 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -76,6 +76,8 @@
 	S(SNK_HARD_RESET_SINK_ON),		\
 						\
 	S(SOFT_RESET),				\
+	S(SRC_SOFT_RESET_WAIT_SNK_TX),		\
+	S(SNK_SOFT_RESET),			\
 	S(SOFT_RESET_SEND),			\
 						\
 	S(DR_SWAP_ACCEPT),			\
@@ -139,7 +141,45 @@
 						\
 	S(ERROR_RECOVERY),			\
 	S(PORT_RESET),				\
-	S(PORT_RESET_WAIT_OFF)
+	S(PORT_RESET_WAIT_OFF),			\
+						\
+	S(AMS_START)
+
+#define FOREACH_AMS(S)				\
+	S(NONE_AMS),				\
+	S(POWER_NEGOTIATION),			\
+	S(GOTOMIN),				\
+	S(SOFT_RESET_AMS),			\
+	S(HARD_RESET),				\
+	S(CABLE_RESET),				\
+	S(GET_SOURCE_CAPABILITIES),		\
+	S(GET_SINK_CAPABILITIES),		\
+	S(POWER_ROLE_SWAP),			\
+	S(FAST_ROLE_SWAP),			\
+	S(DATA_ROLE_SWAP),			\
+	S(VCONN_SWAP),				\
+	S(SOURCE_ALERT),			\
+	S(GETTING_SOURCE_EXTENDED_CAPABILITIES),\
+	S(GETTING_SOURCE_SINK_STATUS),		\
+	S(GETTING_BATTERY_CAPABILITIES),	\
+	S(GETTING_BATTERY_STATUS),		\
+	S(GETTING_MANUFACTURER_INFORMATION),	\
+	S(SECURITY),				\
+	S(FIRMWARE_UPDATE),			\
+	S(DISCOVER_IDENTITY),			\
+	S(SOURCE_STARTUP_CABLE_PLUG_DISCOVER_IDENTITY),	\
+	S(DISCOVER_SVIDS),			\
+	S(DISCOVER_MODES),			\
+	S(DFP_TO_UFP_ENTER_MODE),		\
+	S(DFP_TO_UFP_EXIT_MODE),		\
+	S(DFP_TO_CABLE_PLUG_ENTER_MODE),	\
+	S(DFP_TO_CABLE_PLUG_EXIT_MODE),		\
+	S(ATTENTION),				\
+	S(BIST),				\
+	S(UNSTRUCTURED_VDMS),			\
+	S(STRUCTURED_VDMS),			\
+	S(COUNTRY_INFO),			\
+	S(COUNTRY_CODES)
 
 #define GENERATE_ENUM(e)	e
 #define GENERATE_STRING(s)	#s
@@ -152,6 +192,14 @@ static const char * const tcpm_states[] = {
 	FOREACH_STATE(GENERATE_STRING)
 };
 
+enum tcpm_ams {
+	FOREACH_AMS(GENERATE_ENUM)
+};
+
+static const char * const tcpm_ams_str[] = {
+	FOREACH_AMS(GENERATE_STRING)
+};
+
 enum vdm_states {
 	VDM_STATE_ERR_BUSY = -3,
 	VDM_STATE_ERR_SEND = -2,
@@ -161,6 +209,7 @@ enum vdm_states {
 	VDM_STATE_READY = 1,
 	VDM_STATE_BUSY = 2,
 	VDM_STATE_WAIT_RSP_BUSY = 3,
+	VDM_STATE_SEND_MESSAGE = 4,
 };
 
 enum pd_msg_request {
@@ -381,6 +430,11 @@ struct tcpm_port {
 	/* Sink caps have been queried */
 	bool sink_cap_done;
 
+	/* Collision Avoidance and Atomic Message Sequence */
+	enum tcpm_state upcoming_state;
+	enum tcpm_ams ams;
+	bool in_ams;
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dentry;
 	struct mutex logbuffer_lock;	/* log buffer access lock */
@@ -396,6 +450,12 @@ struct pd_rx_event {
 	struct pd_message msg;
 };
 
+static const char * const pd_rev[] = {
+	[PD_REV10]		= "rev1",
+	[PD_REV20]		= "rev2",
+	[PD_REV30]		= "rev3",
+};
+
 #define tcpm_cc_is_sink(cc) \
 	((cc) == TYPEC_CC_RP_DEF || (cc) == TYPEC_CC_RP_1_5 || \
 	 (cc) == TYPEC_CC_RP_3_0)
@@ -440,6 +500,10 @@ struct pd_rx_event {
 	((port)->typec_caps.data == TYPEC_PORT_DFP ? \
 	TYPEC_HOST : TYPEC_DEVICE)
 
+#define tcpm_sink_tx_ok(port) \
+	(tcpm_port_is_sink(port) && \
+	((port)->cc1 == TYPEC_CC_RP_3_0 || (port)->cc2 == TYPEC_CC_RP_3_0))
+
 static enum tcpm_state tcpm_default_state(struct tcpm_port *port)
 {
 	if (port->port_type == TYPEC_PORT_DRP) {
@@ -666,6 +730,67 @@ static void tcpm_debugfs_exit(const struct tcpm_port *port) { }
 
 #endif
 
+static void tcpm_set_cc(struct tcpm_port *port, enum typec_cc_status cc)
+{
+	tcpm_log(port, "cc:=%d", cc);
+	port->cc_req = cc;
+	port->tcpc->set_cc(port->tcpc, cc);
+}
+
+/*
+ * Determine RP value to set based on maximum current supported
+ * by a port if configured as source.
+ * Returns CC value to report to link partner.
+ */
+static enum typec_cc_status tcpm_rp_cc(struct tcpm_port *port)
+{
+	const u32 *src_pdo = port->src_pdo;
+	int nr_pdo = port->nr_src_pdo;
+	int i;
+
+	/*
+	 * Search for first entry with matching voltage.
+	 * It should report the maximum supported current.
+	 */
+	for (i = 0; i < nr_pdo; i++) {
+		const u32 pdo = src_pdo[i];
+
+		if (pdo_type(pdo) == PDO_TYPE_FIXED &&
+		    pdo_fixed_voltage(pdo) == 5000) {
+			unsigned int curr = pdo_max_current(pdo);
+
+			if (curr >= 3000)
+				return TYPEC_CC_RP_3_0;
+			else if (curr >= 1500)
+				return TYPEC_CC_RP_1_5;
+			return TYPEC_CC_RP_DEF;
+		}
+	}
+
+	return TYPEC_CC_RP_DEF;
+}
+
+static int tcpm_ams_finish(struct tcpm_port *port)
+{
+	int ret = 0;
+
+	tcpm_log(port, "AMS %s finished", tcpm_ams_str[port->ams]);
+
+	if (port->pd_capable && port->pwr_role == TYPEC_SOURCE) {
+		if (port->negotiated_rev >= PD_REV30)
+			tcpm_set_cc(port, SINK_TX_OK);
+		else
+			tcpm_set_cc(port, SINK_TX_NG);
+	} else if (port->pwr_role == TYPEC_SOURCE) {
+		tcpm_set_cc(port, tcpm_rp_cc(port));
+	}
+
+	port->in_ams = false;
+	port->ams = NONE_AMS;
+
+	return ret;
+}
+
 static int tcpm_pd_transmit(struct tcpm_port *port,
 			    enum tcpm_transmit_type type,
 			    const struct pd_message *msg)
@@ -693,13 +818,30 @@ static int tcpm_pd_transmit(struct tcpm_port *port,
 	switch (port->tx_status) {
 	case TCPC_TX_SUCCESS:
 		port->message_id = (port->message_id + 1) & PD_HEADER_ID_MASK;
-		return 0;
+		/*
+		 * USB PD rev 2.0, 8.3.2.2.1:
+		 * USB PD rev 3.0, 8.3.2.1.3:
+		 * "... Note that every AMS is Interruptible until the first
+		 * Message in the sequence has been successfully sent (GoodCRC
+		 * Message received)."
+		 */
+		if (port->ams != NONE_AMS)
+			port->in_ams = true;
+		break;
 	case TCPC_TX_DISCARDED:
-		return -EAGAIN;
+		ret = -EAGAIN;
+		break;
 	case TCPC_TX_FAILED:
 	default:
-		return -EIO;
+		ret = -EIO;
+		break;
 	}
+
+	/* Some AMS don't expect responses. Finish them here. */
+	if (port->ams == ATTENTION || port->ams == SOURCE_ALERT)
+		tcpm_ams_finish(port);
+
+	return ret;
 }
 
 void tcpm_pd_transmit_complete(struct tcpm_port *port,
@@ -804,39 +946,6 @@ static int tcpm_set_current_limit(struct tcpm_port *port, u32 max_ma, u32 mv)
 	return ret;
 }
 
-/*
- * Determine RP value to set based on maximum current supported
- * by a port if configured as source.
- * Returns CC value to report to link partner.
- */
-static enum typec_cc_status tcpm_rp_cc(struct tcpm_port *port)
-{
-	const u32 *src_pdo = port->src_pdo;
-	int nr_pdo = port->nr_src_pdo;
-	int i;
-
-	/*
-	 * Search for first entry with matching voltage.
-	 * It should report the maximum supported current.
-	 */
-	for (i = 0; i < nr_pdo; i++) {
-		const u32 pdo = src_pdo[i];
-
-		if (pdo_type(pdo) == PDO_TYPE_FIXED &&
-		    pdo_fixed_voltage(pdo) == 5000) {
-			unsigned int curr = pdo_max_current(pdo);
-
-			if (curr >= 3000)
-				return TYPEC_CC_RP_3_0;
-			else if (curr >= 1500)
-				return TYPEC_CC_RP_1_5;
-			return TYPEC_CC_RP_DEF;
-		}
-	}
-
-	return TYPEC_CC_RP_DEF;
-}
-
 static int tcpm_set_attached_state(struct tcpm_port *port, bool attached)
 {
 	return port->tcpc->set_roles(port->tcpc, attached, port->pwr_role,
@@ -1000,16 +1109,17 @@ static void tcpm_set_state(struct tcpm_port *port, enum tcpm_state state,
 			   unsigned int delay_ms)
 {
 	if (delay_ms) {
-		tcpm_log(port, "pending state change %s -> %s @ %u ms",
-			 tcpm_states[port->state], tcpm_states[state],
-			 delay_ms);
+		tcpm_log(port, "pending state change %s -> %s @ %u ms [%s %s]",
+			 tcpm_states[port->state], tcpm_states[state], delay_ms,
+			 pd_rev[port->negotiated_rev], tcpm_ams_str[port->ams]);
 		port->delayed_state = state;
 		mod_tcpm_delayed_work(port, delay_ms);
 		port->delayed_runtime = ktime_add(ktime_get(), ms_to_ktime(delay_ms));
 		port->delay_ms = delay_ms;
 	} else {
-		tcpm_log(port, "state change %s -> %s",
-			 tcpm_states[port->state], tcpm_states[state]);
+		tcpm_log(port, "state change %s -> %s [%s %s]",
+			 tcpm_states[port->state], tcpm_states[state],
+			 pd_rev[port->negotiated_rev], tcpm_ams_str[port->ams]);
 		port->delayed_state = INVALID_STATE;
 		port->prev_state = port->state;
 		port->state = state;
@@ -1031,10 +1141,11 @@ static void tcpm_set_state_cond(struct tcpm_port *port, enum tcpm_state state,
 		tcpm_set_state(port, state, delay_ms);
 	else
 		tcpm_log(port,
-			 "skipped %sstate change %s -> %s [%u ms], context state %s",
+			 "skipped %sstate change %s -> %s [%u ms], context state %s [%s %s]",
 			 delay_ms ? "delayed " : "",
 			 tcpm_states[port->state], tcpm_states[state],
-			 delay_ms, tcpm_states[port->enter_state]);
+			 delay_ms, tcpm_states[port->enter_state],
+			 pd_rev[port->negotiated_rev], tcpm_ams_str[port->ams]);
 }
 
 static void tcpm_queue_message(struct tcpm_port *port,
@@ -1044,6 +1155,149 @@ static void tcpm_queue_message(struct tcpm_port *port,
 	mod_tcpm_delayed_work(port, 0);
 }
 
+static bool tcpm_vdm_ams(struct tcpm_port *port)
+{
+	switch (port->ams) {
+	case DISCOVER_IDENTITY:
+	case SOURCE_STARTUP_CABLE_PLUG_DISCOVER_IDENTITY:
+	case DISCOVER_SVIDS:
+	case DISCOVER_MODES:
+	case DFP_TO_UFP_ENTER_MODE:
+	case DFP_TO_UFP_EXIT_MODE:
+	case DFP_TO_CABLE_PLUG_ENTER_MODE:
+	case DFP_TO_CABLE_PLUG_EXIT_MODE:
+	case ATTENTION:
+	case UNSTRUCTURED_VDMS:
+	case STRUCTURED_VDMS:
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool tcpm_ams_interruptible(struct tcpm_port *port)
+{
+	switch (port->ams) {
+	/* Interruptible AMS */
+	case NONE_AMS:
+	case SECURITY:
+	case FIRMWARE_UPDATE:
+	case DISCOVER_IDENTITY:
+	case SOURCE_STARTUP_CABLE_PLUG_DISCOVER_IDENTITY:
+	case DISCOVER_SVIDS:
+	case DISCOVER_MODES:
+	case DFP_TO_UFP_ENTER_MODE:
+	case DFP_TO_UFP_EXIT_MODE:
+	case DFP_TO_CABLE_PLUG_ENTER_MODE:
+	case DFP_TO_CABLE_PLUG_EXIT_MODE:
+	case UNSTRUCTURED_VDMS:
+	case STRUCTURED_VDMS:
+	case COUNTRY_INFO:
+	case COUNTRY_CODES:
+		break;
+	/* Non-Interruptible AMS */
+	default:
+		if (port->in_ams)
+			return false;
+		break;
+	}
+
+	return true;
+}
+
+static int tcpm_ams_start(struct tcpm_port *port, enum tcpm_ams ams)
+{
+	int ret = 0;
+
+	tcpm_log(port, "AMS %s start", tcpm_ams_str[ams]);
+
+	if (!tcpm_ams_interruptible(port) && ams != HARD_RESET) {
+		port->upcoming_state = INVALID_STATE;
+		tcpm_log(port, "AMS %s not interruptible, aborting",
+			 tcpm_ams_str[port->ams]);
+		return -EAGAIN;
+	}
+
+	if (port->pwr_role == TYPEC_SOURCE) {
+		enum typec_cc_status cc_req = port->cc_req;
+
+		port->ams = ams;
+
+		if (ams == HARD_RESET) {
+			tcpm_set_cc(port, tcpm_rp_cc(port));
+			tcpm_pd_transmit(port, TCPC_TX_HARD_RESET, NULL);
+			tcpm_set_state(port, HARD_RESET_START, 0);
+			return ret;
+		} else if (ams == SOFT_RESET_AMS) {
+			if (!port->explicit_contract) {
+				port->upcoming_state = INVALID_STATE;
+				tcpm_set_cc(port, tcpm_rp_cc(port));
+				return ret;
+			}
+		} else if (tcpm_vdm_ams(port)) {
+			/* tSinkTx is enforced in vdm_run_state_machine */
+			if (port->negotiated_rev >= PD_REV30)
+				tcpm_set_cc(port, SINK_TX_NG);
+			return ret;
+		}
+
+		if (port->negotiated_rev >= PD_REV30)
+			tcpm_set_cc(port, SINK_TX_NG);
+
+		switch (port->state) {
+		case SRC_READY:
+		case SRC_STARTUP:
+		case SRC_SOFT_RESET_WAIT_SNK_TX:
+		case SOFT_RESET:
+		case SOFT_RESET_SEND:
+			if (port->negotiated_rev >= PD_REV30)
+				tcpm_set_state(port, AMS_START,
+					       cc_req == SINK_TX_OK ?
+					       PD_T_SINK_TX : 0);
+			else
+				tcpm_set_state(port, AMS_START, 0);
+			break;
+		default:
+			if (port->negotiated_rev >= PD_REV30)
+				tcpm_set_state(port, SRC_READY,
+					       cc_req == SINK_TX_OK ?
+					       PD_T_SINK_TX : 0);
+			else
+				tcpm_set_state(port, SRC_READY, 0);
+			break;
+		}
+	} else {
+		if (port->negotiated_rev >= PD_REV30 &&
+		    !tcpm_sink_tx_ok(port) &&
+		    ams != SOFT_RESET_AMS &&
+		    ams != HARD_RESET) {
+			port->upcoming_state = INVALID_STATE;
+			tcpm_log(port, "Sink TX No Go");
+			return -EAGAIN;
+		}
+
+		port->ams = ams;
+
+		if (ams == HARD_RESET) {
+			tcpm_pd_transmit(port, TCPC_TX_HARD_RESET, NULL);
+			tcpm_set_state(port, HARD_RESET_START, 0);
+			return ret;
+		} else if (tcpm_vdm_ams(port)) {
+			return ret;
+		}
+
+		if (port->state == SNK_READY ||
+		    port->state == SNK_SOFT_RESET)
+			tcpm_set_state(port, AMS_START, 0);
+		else
+			tcpm_set_state(port, SNK_READY, 0);
+	}
+
+	return ret;
+}
+
 /*
  * VDM/VDO handling functions
  */
@@ -1236,6 +1490,8 @@ static int tcpm_pd_svdm(struct tcpm_port *port, struct typec_altmode *adev,
 		if (IS_ERR_OR_NULL(port->partner))
 			break;
 
+		tcpm_ams_finish(port);
+
 		switch (cmd) {
 		case CMD_DISCOVER_IDENT:
 			/* 6.4.4.3.1 */
@@ -1286,6 +1542,7 @@ static int tcpm_pd_svdm(struct tcpm_port *port, struct typec_altmode *adev,
 		}
 		break;
 	case CMDT_RSP_NAK:
+		tcpm_ams_finish(port);
 		switch (cmd) {
 		case CMD_ENTER_MODE:
 			/* Back to USB Operation */
@@ -1435,7 +1692,8 @@ static unsigned int vdm_ready_timeout(u32 vdm_hdr)
 static void vdm_run_state_machine(struct tcpm_port *port)
 {
 	struct pd_message msg;
-	int i, res;
+	int i, res = 0;
+	u32 vdo_hdr = port->vdo_data[0];
 
 	switch (port->vdm_state) {
 	case VDM_STATE_READY:
@@ -1452,26 +1710,45 @@ static void vdm_run_state_machine(struct tcpm_port *port)
 		if (port->state != SRC_READY && port->state != SNK_READY)
 			break;
 
-		/* Prepare and send VDM */
-		memset(&msg, 0, sizeof(msg));
-		msg.header = PD_HEADER_LE(PD_DATA_VENDOR_DEF,
-					  port->pwr_role,
-					  port->data_role,
-					  port->negotiated_rev,
-					  port->message_id, port->vdo_count);
-		for (i = 0; i < port->vdo_count; i++)
-			msg.payload[i] = cpu_to_le32(port->vdo_data[i]);
-		res = tcpm_pd_transmit(port, TCPC_TX_SOP, &msg);
-		if (res < 0) {
-			port->vdm_state = VDM_STATE_ERR_SEND;
-		} else {
-			unsigned long timeout;
+		/* TODO: AMS operation for Unstructured VDM */
+		if (PD_VDO_SVDM(vdo_hdr) && PD_VDO_CMDT(vdo_hdr) == CMDT_INIT) {
+			switch (PD_VDO_CMD(vdo_hdr)) {
+			case CMD_DISCOVER_IDENT:
+				res = tcpm_ams_start(port, DISCOVER_IDENTITY);
+				break;
+			case CMD_DISCOVER_SVID:
+				res = tcpm_ams_start(port, DISCOVER_SVIDS);
+				break;
+			case CMD_DISCOVER_MODES:
+				res = tcpm_ams_start(port, DISCOVER_MODES);
+				break;
+			case CMD_ENTER_MODE:
+				res = tcpm_ams_start(port, DFP_TO_UFP_ENTER_MODE);
+				break;
+			case CMD_EXIT_MODE:
+				res = tcpm_ams_start(port, DFP_TO_UFP_EXIT_MODE);
+				break;
+			case CMD_ATTENTION:
+				res = tcpm_ams_start(port, ATTENTION);
+				break;
+			case VDO_CMD_VENDOR(0) ... VDO_CMD_VENDOR(15):
+				res = tcpm_ams_start(port, STRUCTURED_VDMS);
+				break;
+			default:
+				res = -EOPNOTSUPP;
+				break;
+			}
 
-			port->vdm_retries = 0;
-			port->vdm_state = VDM_STATE_BUSY;
-			timeout = vdm_ready_timeout(port->vdo_data[0]);
-			mod_vdm_delayed_work(port, timeout);
+			if (res < 0)
+				return;
 		}
+
+		port->vdm_state = VDM_STATE_SEND_MESSAGE;
+		mod_vdm_delayed_work(port, (port->negotiated_rev >= PD_REV30 &&
+					    port->pwr_role == TYPEC_SOURCE &&
+					    PD_VDO_SVDM(vdo_hdr) &&
+					    PD_VDO_CMDT(vdo_hdr) == CMDT_INIT) ?
+					   PD_T_SINK_TX : 0);
 		break;
 	case VDM_STATE_WAIT_RSP_BUSY:
 		port->vdo_data[0] = port->vdo_retry;
@@ -1480,6 +1757,8 @@ static void vdm_run_state_machine(struct tcpm_port *port)
 		break;
 	case VDM_STATE_BUSY:
 		port->vdm_state = VDM_STATE_ERR_TMOUT;
+		if (port->ams != NONE_AMS)
+			tcpm_ams_finish(port);
 		break;
 	case VDM_STATE_ERR_SEND:
 		/*
@@ -1492,6 +1771,29 @@ static void vdm_run_state_machine(struct tcpm_port *port)
 			tcpm_log(port, "VDM Tx error, retry");
 			port->vdm_retries++;
 			port->vdm_state = VDM_STATE_READY;
+			tcpm_ams_finish(port);
+		}
+		break;
+	case VDM_STATE_SEND_MESSAGE:
+		/* Prepare and send VDM */
+		memset(&msg, 0, sizeof(msg));
+		msg.header = PD_HEADER_LE(PD_DATA_VENDOR_DEF,
+					  port->pwr_role,
+					  port->data_role,
+					  port->negotiated_rev,
+					  port->message_id, port->vdo_count);
+		for (i = 0; i < port->vdo_count; i++)
+			msg.payload[i] = cpu_to_le32(port->vdo_data[i]);
+		res = tcpm_pd_transmit(port, TCPC_TX_SOP, &msg);
+		if (res < 0) {
+			port->vdm_state = VDM_STATE_ERR_SEND;
+		} else {
+			unsigned long timeout;
+
+			port->vdm_retries = 0;
+			port->vdm_state = VDM_STATE_BUSY;
+			timeout = vdm_ready_timeout(vdo_hdr);
+			mod_vdm_delayed_work(port, timeout);
 		}
 		break;
 	default:
@@ -1514,7 +1816,8 @@ static void vdm_state_machine_work(struct kthread_work *work)
 		prev_state = port->vdm_state;
 		vdm_run_state_machine(port);
 	} while (port->vdm_state != prev_state &&
-		 port->vdm_state != VDM_STATE_BUSY);
+		 port->vdm_state != VDM_STATE_BUSY &&
+		 port->vdm_state != VDM_STATE_SEND_MESSAGE);
 
 	mutex_unlock(&port->lock);
 }
@@ -1997,11 +2300,14 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 		case SOFT_RESET_SEND:
 			port->message_id = 0;
 			port->rx_msgid = -1;
-			if (port->pwr_role == TYPEC_SOURCE)
-				next_state = SRC_SEND_CAPABILITIES;
-			else
-				next_state = SNK_WAIT_CAPABILITIES;
-			tcpm_set_state(port, next_state, 0);
+			if (port->ams == SOFT_RESET_AMS)
+				tcpm_ams_finish(port);
+			if (port->pwr_role == TYPEC_SOURCE) {
+				port->upcoming_state = SRC_SEND_CAPABILITIES;
+				tcpm_ams_start(port, POWER_NEGOTIATION);
+			} else {
+				tcpm_set_state(port, SNK_WAIT_CAPABILITIES, 0);
+			}
 			break;
 		case DR_SWAP_SEND:
 			tcpm_set_state(port, DR_SWAP_CHANGE_DR, 0);
@@ -2776,13 +3082,6 @@ static bool tcpm_start_toggling(struct tcpm_port *port, enum typec_cc_status cc)
 	return ret == 0;
 }
 
-static void tcpm_set_cc(struct tcpm_port *port, enum typec_cc_status cc)
-{
-	tcpm_log(port, "cc:=%d", cc);
-	port->cc_req = cc;
-	port->tcpc->set_cc(port->tcpc, cc);
-}
-
 static int tcpm_init_vbus(struct tcpm_port *port)
 {
 	int ret;
@@ -2912,6 +3211,8 @@ static void tcpm_reset_port(struct tcpm_port *port)
 		ret = port->tcpc->enable_auto_vbus_discharge(port->tcpc, false);
 		tcpm_log_force(port, "Disable vbus discharge ret:%d", ret);
 	}
+	port->in_ams = false;
+	port->ams = NONE_AMS;
 	tcpm_unregister_altmodes(port);
 	tcpm_typec_disconnect(port);
 	port->attached = false;
@@ -3090,6 +3391,7 @@ static void run_state_machine(struct tcpm_port *port)
 	int ret;
 	enum typec_pwr_opmode opmode;
 	unsigned int msecs;
+	enum tcpm_state upcoming_state;
 
 	port->enter_state = port->state;
 	switch (port->state) {
@@ -3190,7 +3492,12 @@ static void run_state_machine(struct tcpm_port *port)
 		port->message_id = 0;
 		port->rx_msgid = -1;
 		port->explicit_contract = false;
-		tcpm_set_state(port, SRC_SEND_CAPABILITIES, 0);
+		/* SNK -> SRC POWER/FAST_ROLE_SWAP finished */
+		if (port->ams == POWER_ROLE_SWAP ||
+		    port->ams == FAST_ROLE_SWAP)
+			tcpm_ams_finish(port);
+		port->upcoming_state = SRC_SEND_CAPABILITIES;
+		tcpm_ams_start(port, POWER_NEGOTIATION);
 		break;
 	case SRC_SEND_CAPABILITIES:
 		port->caps_count++;
@@ -3272,6 +3579,19 @@ static void run_state_machine(struct tcpm_port *port)
 		tcpm_swap_complete(port, 0);
 		tcpm_typec_connect(port);
 
+		if (port->ams != NONE_AMS)
+			tcpm_ams_finish(port);
+		/*
+		 * If previous AMS is interrupted, switch to the upcoming
+		 * state.
+		 */
+		if (port->upcoming_state != INVALID_STATE) {
+			upcoming_state = port->upcoming_state;
+			port->upcoming_state = INVALID_STATE;
+			tcpm_set_state(port, upcoming_state, 0);
+			break;
+		}
+
 		tcpm_check_send_discover(port);
 		/*
 		 * 6.3.5
@@ -3389,6 +3709,12 @@ static void run_state_machine(struct tcpm_port *port)
 		port->message_id = 0;
 		port->rx_msgid = -1;
 		port->explicit_contract = false;
+
+		if (port->ams == POWER_ROLE_SWAP ||
+		    port->ams == FAST_ROLE_SWAP)
+			/* SRC -> SNK POWER/FAST_ROLE_SWAP finished */
+			tcpm_ams_finish(port);
+
 		tcpm_set_state(port, SNK_DISCOVERY, 0);
 		break;
 	case SNK_DISCOVERY:
@@ -3437,7 +3763,7 @@ static void run_state_machine(struct tcpm_port *port)
 		 */
 		if (port->vbus_never_low) {
 			port->vbus_never_low = false;
-			tcpm_set_state(port, SOFT_RESET_SEND,
+			tcpm_set_state(port, SNK_SOFT_RESET,
 				       PD_T_SINK_WAIT_CAP);
 		} else {
 			tcpm_set_state(port, hard_reset_state(port),
@@ -3490,9 +3816,23 @@ static void run_state_machine(struct tcpm_port *port)
 
 		tcpm_swap_complete(port, 0);
 		tcpm_typec_connect(port);
-		tcpm_check_send_discover(port);
 		mod_enable_frs_delayed_work(port, 0);
 		tcpm_pps_complete(port, port->pps_status);
+
+		if (port->ams != NONE_AMS)
+			tcpm_ams_finish(port);
+		/*
+		 * If previous AMS is interrupted, switch to the upcoming
+		 * state.
+		 */
+		if (port->upcoming_state != INVALID_STATE) {
+			upcoming_state = port->upcoming_state;
+			port->upcoming_state = INVALID_STATE;
+			tcpm_set_state(port, upcoming_state, 0);
+			break;
+		}
+
+		tcpm_check_send_discover(port);
 		power_supply_changed(port->psy);
 		break;
 
@@ -3513,8 +3853,14 @@ static void run_state_machine(struct tcpm_port *port)
 
 	/* Hard_Reset states */
 	case HARD_RESET_SEND:
-		tcpm_pd_transmit(port, TCPC_TX_HARD_RESET, NULL);
-		tcpm_set_state(port, HARD_RESET_START, 0);
+		if (port->ams != NONE_AMS)
+			tcpm_ams_finish(port);
+		/*
+		 * State machine will be directed to HARD_RESET_START,
+		 * thus set upcoming_state to INVALID_STATE.
+		 */
+		port->upcoming_state = INVALID_STATE;
+		tcpm_ams_start(port, HARD_RESET);
 		break;
 	case HARD_RESET_START:
 		port->sink_cap_done = false;
@@ -3558,6 +3904,8 @@ static void run_state_machine(struct tcpm_port *port)
 	case SRC_HARD_RESET_VBUS_ON:
 		tcpm_set_vconn(port, true);
 		tcpm_set_vbus(port, true);
+		if (port->ams == HARD_RESET)
+			tcpm_ams_finish(port);
 		port->tcpc->set_pd_rx(port->tcpc, true);
 		tcpm_set_attached_state(port, true);
 		tcpm_set_state(port, SRC_UNATTACHED, PD_T_PS_SOURCE_ON);
@@ -3579,6 +3927,8 @@ static void run_state_machine(struct tcpm_port *port)
 		tcpm_set_state(port, SNK_HARD_RESET_SINK_ON, PD_T_SAFE_0V);
 		break;
 	case SNK_HARD_RESET_WAIT_VBUS:
+		if (port->ams == HARD_RESET)
+			tcpm_ams_finish(port);
 		/* Assume we're disconnected if VBUS doesn't come back. */
 		tcpm_set_state(port, SNK_UNATTACHED,
 			       PD_T_SRC_RECOVER_MAX + PD_T_SRC_TURN_ON);
@@ -3606,6 +3956,8 @@ static void run_state_machine(struct tcpm_port *port)
 					       5000);
 			tcpm_set_charge(port, true);
 		}
+		if (port->ams == HARD_RESET)
+			tcpm_ams_finish(port);
 		tcpm_set_attached_state(port, true);
 		tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_USB, false, VSAFE5V);
 		tcpm_set_state(port, SNK_STARTUP, 0);
@@ -3616,10 +3968,19 @@ static void run_state_machine(struct tcpm_port *port)
 		port->message_id = 0;
 		port->rx_msgid = -1;
 		tcpm_pd_send_control(port, PD_CTRL_ACCEPT);
-		if (port->pwr_role == TYPEC_SOURCE)
-			tcpm_set_state(port, SRC_SEND_CAPABILITIES, 0);
-		else
+		if (port->pwr_role == TYPEC_SOURCE) {
+			port->upcoming_state = SRC_SEND_CAPABILITIES;
+			tcpm_ams_start(port, POWER_NEGOTIATION);
+		} else {
 			tcpm_set_state(port, SNK_WAIT_CAPABILITIES, 0);
+		}
+		break;
+	case SRC_SOFT_RESET_WAIT_SNK_TX:
+	case SNK_SOFT_RESET:
+		if (port->ams != NONE_AMS)
+			tcpm_ams_finish(port);
+		port->upcoming_state = SOFT_RESET_SEND;
+		tcpm_ams_start(port, SOFT_RESET_AMS);
 		break;
 	case SOFT_RESET_SEND:
 		port->message_id = 0;
@@ -3886,6 +4247,19 @@ static void run_state_machine(struct tcpm_port *port)
 			       tcpm_default_state(port),
 			       port->vbus_present ? PD_T_PS_SOURCE_OFF : 0);
 		break;
+
+	/* AMS intermediate state */
+	case AMS_START:
+		if (port->upcoming_state == INVALID_STATE) {
+			tcpm_set_state(port, port->pwr_role == TYPEC_SOURCE ?
+				       SRC_READY : SNK_READY, 0);
+			break;
+		}
+
+		upcoming_state = port->upcoming_state;
+		port->upcoming_state = INVALID_STATE;
+		tcpm_set_state(port, upcoming_state, 0);
+		break;
 	default:
 		WARN(1, "Unexpected port state %d\n", port->state);
 		break;
@@ -4313,6 +4687,8 @@ static void _tcpm_pd_hard_reset(struct tcpm_port *port)
 	if (port->bist_request == BDO_MODE_TESTDATA && port->tcpc->set_bist_data)
 		port->tcpc->set_bist_data(port->tcpc, false);
 
+	if (port->ams != NONE_AMS)
+		port->ams = NONE_AMS;
 	/*
 	 * If we keep receiving hard reset requests, executing the hard reset
 	 * must have failed. Revert to error recovery if that happens.
@@ -4363,10 +4739,16 @@ static void tcpm_pd_event_handler(struct kthread_work *work)
 				_tcpm_cc_change(port, cc1, cc2);
 		}
 		if (events & TCPM_FRS_EVENT) {
-			if (port->state == SNK_READY)
-				tcpm_set_state(port, FR_SWAP_SEND, 0);
-			else
+			if (port->state == SNK_READY) {
+				int ret;
+
+				port->upcoming_state = FR_SWAP_SEND;
+				ret = tcpm_ams_start(port, FAST_ROLE_SWAP);
+				if (ret == -EAGAIN)
+					port->upcoming_state = INVALID_STATE;
+			} else {
 				tcpm_log(port, "Discarding FRS_SIGNAL! Not in sink ready");
+			}
 		}
 		if (events & TCPM_SOURCING_VBUS) {
 			tcpm_log(port, "sourcing vbus");
@@ -4435,6 +4817,7 @@ EXPORT_SYMBOL_GPL(tcpm_sourcing_vbus);
 static void tcpm_enable_frs_work(struct kthread_work *work)
 {
 	struct tcpm_port *port = container_of(work, struct tcpm_port, enable_frs);
+	int ret;
 
 	mutex_lock(&port->lock);
 	/* Not FRS capable */
@@ -4449,9 +4832,14 @@ static void tcpm_enable_frs_work(struct kthread_work *work)
 	if (port->state != SNK_READY || port->vdm_state != VDM_STATE_DONE || port->send_discover)
 		goto resched;
 
-	tcpm_set_state(port, GET_SINK_CAP, 0);
-	port->sink_cap_done = true;
-
+	port->upcoming_state = GET_SINK_CAP;
+	ret = tcpm_ams_start(port, GET_SINK_CAPABILITIES);
+	if (ret == -EAGAIN) {
+		port->upcoming_state = INVALID_STATE;
+	} else {
+		port->sink_cap_done = true;
+		goto unlock;
+	}
 resched:
 	mod_enable_frs_delayed_work(port, GET_SINK_CAP_RETRY_MS);
 unlock:
@@ -4501,7 +4889,12 @@ static int tcpm_dr_set(struct typec_port *p, enum typec_data_role data)
 		port->non_pd_role_swap = true;
 		tcpm_set_state(port, PORT_RESET, 0);
 	} else {
-		tcpm_set_state(port, DR_SWAP_SEND, 0);
+		port->upcoming_state = DR_SWAP_SEND;
+		ret = tcpm_ams_start(port, DATA_ROLE_SWAP);
+		if (ret == -EAGAIN) {
+			port->upcoming_state = INVALID_STATE;
+			goto port_unlock;
+		}
 	}
 
 	port->swap_status = 0;
@@ -4547,10 +4940,16 @@ static int tcpm_pr_set(struct typec_port *p, enum typec_role role)
 		goto port_unlock;
 	}
 
+	port->upcoming_state = PR_SWAP_SEND;
+	ret = tcpm_ams_start(port, POWER_ROLE_SWAP);
+	if (ret == -EAGAIN) {
+		port->upcoming_state = INVALID_STATE;
+		goto port_unlock;
+	}
+
 	port->swap_status = 0;
 	port->swap_pending = true;
 	reinit_completion(&port->swap_complete);
-	tcpm_set_state(port, PR_SWAP_SEND, 0);
 	mutex_unlock(&port->lock);
 
 	if (!wait_for_completion_timeout(&port->swap_complete,
@@ -4586,10 +4985,16 @@ static int tcpm_vconn_set(struct typec_port *p, enum typec_role role)
 		goto port_unlock;
 	}
 
+	port->upcoming_state = VCONN_SWAP_SEND;
+	ret = tcpm_ams_start(port, VCONN_SWAP);
+	if (ret == -EAGAIN) {
+		port->upcoming_state = INVALID_STATE;
+		goto port_unlock;
+	}
+
 	port->swap_status = 0;
 	port->swap_pending = true;
 	reinit_completion(&port->swap_complete);
-	tcpm_set_state(port, VCONN_SWAP_SEND, 0);
 	mutex_unlock(&port->lock);
 
 	if (!wait_for_completion_timeout(&port->swap_complete,
@@ -4654,6 +5059,13 @@ static int tcpm_pps_set_op_curr(struct tcpm_port *port, u16 op_curr)
 		goto port_unlock;
 	}
 
+	port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES;
+	ret = tcpm_ams_start(port, POWER_NEGOTIATION);
+	if (ret == -EAGAIN) {
+		port->upcoming_state = INVALID_STATE;
+		goto port_unlock;
+	}
+
 	/* Round down operating current to align with PPS valid steps */
 	op_curr = op_curr - (op_curr % RDO_PROG_CURR_MA_STEP);
 
@@ -4661,7 +5073,6 @@ static int tcpm_pps_set_op_curr(struct tcpm_port *port, u16 op_curr)
 	port->pps_data.op_curr = op_curr;
 	port->pps_status = 0;
 	port->pps_pending = true;
-	tcpm_set_state(port, SNK_NEGOTIATE_PPS_CAPABILITIES, 0);
 	mutex_unlock(&port->lock);
 
 	if (!wait_for_completion_timeout(&port->pps_complete,
@@ -4710,6 +5121,13 @@ static int tcpm_pps_set_out_volt(struct tcpm_port *port, u16 out_volt)
 		goto port_unlock;
 	}
 
+	port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES;
+	ret = tcpm_ams_start(port, POWER_NEGOTIATION);
+	if (ret == -EAGAIN) {
+		port->upcoming_state = INVALID_STATE;
+		goto port_unlock;
+	}
+
 	/* Round down output voltage to align with PPS valid steps */
 	out_volt = out_volt - (out_volt % RDO_PROG_VOLT_MV_STEP);
 
@@ -4717,7 +5135,6 @@ static int tcpm_pps_set_out_volt(struct tcpm_port *port, u16 out_volt)
 	port->pps_data.out_volt = out_volt;
 	port->pps_status = 0;
 	port->pps_pending = true;
-	tcpm_set_state(port, SNK_NEGOTIATE_PPS_CAPABILITIES, 0);
 	mutex_unlock(&port->lock);
 
 	if (!wait_for_completion_timeout(&port->pps_complete,
@@ -4757,6 +5174,16 @@ static int tcpm_pps_activate(struct tcpm_port *port, bool activate)
 		goto port_unlock;
 	}
 
+	if (activate)
+		port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES;
+	else
+		port->upcoming_state = SNK_NEGOTIATE_CAPABILITIES;
+	ret = tcpm_ams_start(port, POWER_NEGOTIATION);
+	if (ret == -EAGAIN) {
+		port->upcoming_state = INVALID_STATE;
+		goto port_unlock;
+	}
+
 	reinit_completion(&port->pps_complete);
 	port->pps_status = 0;
 	port->pps_pending = true;
@@ -4765,9 +5192,6 @@ static int tcpm_pps_activate(struct tcpm_port *port, bool activate)
 	if (activate) {
 		port->pps_data.out_volt = port->supply_voltage;
 		port->pps_data.op_curr = port->current_limit;
-		tcpm_set_state(port, SNK_NEGOTIATE_PPS_CAPABILITIES, 0);
-	} else {
-		tcpm_set_state(port, SNK_NEGOTIATE_CAPABILITIES, 0);
 	}
 	mutex_unlock(&port->lock);
 
diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h
index bb9a782e1411..79599b90ba55 100644
--- a/include/linux/usb/pd.h
+++ b/include/linux/usb/pd.h
@@ -479,6 +479,7 @@ static inline unsigned int rdo_max_power(u32 rdo)
 #define PD_T_NEWSRC		250	/* Maximum of 275ms */
 #define PD_T_SWAP_SRC_START	20	/* Minimum of 20ms */
 #define PD_T_BIST_CONT_MODE	50	/* 30 - 60 ms */
+#define PD_T_SINK_TX		16	/* 16 - 20 ms */
 
 #define PD_T_DRP_TRY		100	/* 75 - 150 ms */
 #define PD_T_DRP_TRYWAIT	600	/* 400 - 800 ms */
diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h
index f4a18427f5c4..3af99f85e8b9 100644
--- a/include/linux/usb/tcpm.h
+++ b/include/linux/usb/tcpm.h
@@ -19,6 +19,10 @@ enum typec_cc_status {
 	TYPEC_CC_RP_3_0,
 };
 
+/* Collision Avoidance */
+#define SINK_TX_NG	TYPEC_CC_RP_1_5
+#define SINK_TX_OK	TYPEC_CC_RP_3_0
+
 enum typec_cc_polarity {
 	TYPEC_POLARITY_CC1,
 	TYPEC_POLARITY_CC2,
-- 
cgit v1.2.3


From 8dea75e11380fc59656fe965766ac192a831455f Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso@google.com>
Date: Thu, 14 Jan 2021 22:50:52 +0800
Subject: usb: typec: tcpm: Protocol Error handling

PD3.0 Spec 6.8.1 describes how to handle Protocol Error. There are
general rules defined in Table 6-61 which regulate incoming Message
handling. If the incoming Message is unexpected, unsupported, or
unrecognized, Protocol Error occurs. Follow the rules to handle these
situations. Also consider PD2.0 connection (PD2.0 Spec Table 6-36) for
backward compatibilities.

To know the types of AMS in all the recipient's states, identify those
AMS who are initiated by the port partner but not yet recorded in the
current code.

Besides, introduce a new state CHUNK_NOT_SUPP to delay the NOT_SUPPORTED
message after receiving a chunked message.

Tested-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Kyle Tso <kyletso@google.com>
Link: https://lore.kernel.org/r/20210114145053.1952756-3-kyletso@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpm.c | 335 ++++++++++++++++++++++++++++++------------
 include/linux/usb/pd.h        |   1 +
 2 files changed, 246 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 2b16d2764092..70922723da4b 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -143,7 +143,8 @@
 	S(PORT_RESET),				\
 	S(PORT_RESET_WAIT_OFF),			\
 						\
-	S(AMS_START)
+	S(AMS_START),				\
+	S(CHUNK_NOT_SUPP)
 
 #define FOREACH_AMS(S)				\
 	S(NONE_AMS),				\
@@ -433,6 +434,7 @@ struct tcpm_port {
 	/* Collision Avoidance and Atomic Message Sequence */
 	enum tcpm_state upcoming_state;
 	enum tcpm_ams ams;
+	enum tcpm_ams next_ams;
 	bool in_ams;
 
 #ifdef CONFIG_DEBUG_FS
@@ -1213,7 +1215,8 @@ static int tcpm_ams_start(struct tcpm_port *port, enum tcpm_ams ams)
 
 	tcpm_log(port, "AMS %s start", tcpm_ams_str[ams]);
 
-	if (!tcpm_ams_interruptible(port) && ams != HARD_RESET) {
+	if (!tcpm_ams_interruptible(port) &&
+	    !(ams == HARD_RESET || ams == SOFT_RESET_AMS)) {
 		port->upcoming_state = INVALID_STATE;
 		tcpm_log(port, "AMS %s not interruptible, aborting",
 			 tcpm_ams_str[port->ams]);
@@ -1231,11 +1234,10 @@ static int tcpm_ams_start(struct tcpm_port *port, enum tcpm_ams ams)
 			tcpm_set_state(port, HARD_RESET_START, 0);
 			return ret;
 		} else if (ams == SOFT_RESET_AMS) {
-			if (!port->explicit_contract) {
-				port->upcoming_state = INVALID_STATE;
+			if (!port->explicit_contract)
 				tcpm_set_cc(port, tcpm_rp_cc(port));
-				return ret;
-			}
+			tcpm_set_state(port, SOFT_RESET_SEND, 0);
+			return ret;
 		} else if (tcpm_vdm_ams(port)) {
 			/* tSinkTx is enforced in vdm_run_state_machine */
 			if (port->negotiated_rev >= PD_REV30)
@@ -1452,6 +1454,9 @@ static int tcpm_pd_svdm(struct tcpm_port *port, struct typec_altmode *adev,
 	case CMDT_INIT:
 		switch (cmd) {
 		case CMD_DISCOVER_IDENT:
+			if (PD_VDO_VID(p[0]) != USB_SID_PD)
+				break;
+
 			/* 6.4.4.3.1: Only respond as UFP (device) */
 			if (port->data_role == TYPEC_DEVICE &&
 			    port->nr_snk_vdo) {
@@ -1537,22 +1542,37 @@ static int tcpm_pd_svdm(struct tcpm_port *port, struct typec_altmode *adev,
 				return 0;
 			}
 			break;
+		case VDO_CMD_VENDOR(0) ... VDO_CMD_VENDOR(15):
+			break;
 		default:
+			/* Unrecognized SVDM */
+			response[0] = p[0] | VDO_CMDT(CMDT_RSP_NAK);
+			rlen = 1;
 			break;
 		}
 		break;
 	case CMDT_RSP_NAK:
 		tcpm_ams_finish(port);
 		switch (cmd) {
+		case CMD_DISCOVER_IDENT:
+		case CMD_DISCOVER_SVID:
+		case CMD_DISCOVER_MODES:
+		case VDO_CMD_VENDOR(0) ... VDO_CMD_VENDOR(15):
+			break;
 		case CMD_ENTER_MODE:
 			/* Back to USB Operation */
 			*adev_action = ADEV_NOTIFY_USB_AND_QUEUE_VDM;
 			return 0;
 		default:
+			/* Unrecognized SVDM */
+			response[0] = p[0] | VDO_CMDT(CMDT_RSP_NAK);
+			rlen = 1;
 			break;
 		}
 		break;
 	default:
+		response[0] = p[0] | VDO_CMDT(CMDT_RSP_NAK);
+		rlen = 1;
 		break;
 	}
 
@@ -1588,8 +1608,12 @@ static void tcpm_handle_vdm_request(struct tcpm_port *port,
 		port->vdm_state = VDM_STATE_DONE;
 	}
 
-	if (PD_VDO_SVDM(p[0]))
+	if (PD_VDO_SVDM(p[0])) {
 		rlen = tcpm_pd_svdm(port, adev, p, cnt, response, &adev_action);
+	} else {
+		if (port->negotiated_rev >= PD_REV30)
+			tcpm_queue_message(port, PD_MSG_CTRL_NOT_SUPP);
+	}
 
 	/*
 	 * We are done with any state stored in the port struct now, except
@@ -2039,6 +2063,71 @@ static int tcpm_set_auto_vbus_discharge_threshold(struct tcpm_port *port,
 	return ret;
 }
 
+static void tcpm_pd_handle_state(struct tcpm_port *port,
+				 enum tcpm_state state,
+				 enum tcpm_ams ams,
+				 unsigned int delay_ms)
+{
+	switch (port->state) {
+	case SRC_READY:
+	case SNK_READY:
+		port->ams = ams;
+		tcpm_set_state(port, state, delay_ms);
+		break;
+	/* 8.3.3.4.1.1 and 6.8.1 power transitioning */
+	case SNK_TRANSITION_SINK:
+	case SNK_TRANSITION_SINK_VBUS:
+	case SRC_TRANSITION_SUPPLY:
+		tcpm_set_state(port, HARD_RESET_SEND, 0);
+		break;
+	default:
+		if (!tcpm_ams_interruptible(port)) {
+			tcpm_set_state(port, port->pwr_role == TYPEC_SOURCE ?
+				       SRC_SOFT_RESET_WAIT_SNK_TX :
+				       SNK_SOFT_RESET,
+				       0);
+		} else {
+			/* process the Message 6.8.1 */
+			port->upcoming_state = state;
+			port->next_ams = ams;
+			tcpm_set_state(port, ready_state(port), delay_ms);
+		}
+		break;
+	}
+}
+
+static void tcpm_pd_handle_msg(struct tcpm_port *port,
+			       enum pd_msg_request message,
+			       enum tcpm_ams ams)
+{
+	switch (port->state) {
+	case SRC_READY:
+	case SNK_READY:
+		port->ams = ams;
+		tcpm_queue_message(port, message);
+		break;
+	/* PD 3.0 Spec 8.3.3.4.1.1 and 6.8.1 */
+	case SNK_TRANSITION_SINK:
+	case SNK_TRANSITION_SINK_VBUS:
+	case SRC_TRANSITION_SUPPLY:
+		tcpm_set_state(port, HARD_RESET_SEND, 0);
+		break;
+	default:
+		if (!tcpm_ams_interruptible(port)) {
+			tcpm_set_state(port, port->pwr_role == TYPEC_SOURCE ?
+				       SRC_SOFT_RESET_WAIT_SNK_TX :
+				       SNK_SOFT_RESET,
+				       0);
+		} else {
+			port->next_ams = ams;
+			tcpm_set_state(port, ready_state(port), 0);
+			/* 6.8.1 process the Message */
+			tcpm_queue_message(port, message);
+		}
+		break;
+	}
+}
+
 static void tcpm_pd_data_request(struct tcpm_port *port,
 				 const struct pd_message *msg)
 {
@@ -2052,9 +2141,6 @@ static void tcpm_pd_data_request(struct tcpm_port *port,
 
 	switch (type) {
 	case PD_DATA_SOURCE_CAP:
-		if (port->pwr_role != TYPEC_SINK)
-			break;
-
 		for (i = 0; i < cnt; i++)
 			port->source_caps[i] = le32_to_cpu(msg->payload[i]);
 
@@ -2070,12 +2156,26 @@ static void tcpm_pd_data_request(struct tcpm_port *port,
 		 * to comply with 6.2.1.1.5 of the USB PD 3.0 spec. We don't
 		 * support Rev 1.0 so just do nothing in that scenario.
 		 */
-		if (rev == PD_REV10)
+		if (rev == PD_REV10) {
+			if (port->ams == GET_SOURCE_CAPABILITIES)
+				tcpm_ams_finish(port);
 			break;
+		}
 
 		if (rev < PD_MAX_REV)
 			port->negotiated_rev = rev;
 
+		if (port->pwr_role == TYPEC_SOURCE) {
+			if (port->ams == GET_SOURCE_CAPABILITIES)
+				tcpm_pd_handle_state(port, SRC_READY, NONE_AMS, 0);
+			/* Unexpected Source Capabilities */
+			else
+				tcpm_pd_handle_msg(port,
+						   port->negotiated_rev < PD_REV30 ?
+						   PD_MSG_CTRL_REJECT :
+						   PD_MSG_CTRL_NOT_SUPP,
+						   NONE_AMS);
+		} else if (port->state == SNK_WAIT_CAPABILITIES) {
 		/*
 		 * This message may be received even if VBUS is not
 		 * present. This is quite unexpected; see USB PD
@@ -2089,30 +2189,48 @@ static void tcpm_pd_data_request(struct tcpm_port *port,
 		 * but be prepared to keep waiting for VBUS after it was
 		 * handled.
 		 */
-		tcpm_set_state(port, SNK_NEGOTIATE_CAPABILITIES, 0);
+			port->ams = POWER_NEGOTIATION;
+			tcpm_set_state(port, SNK_NEGOTIATE_CAPABILITIES, 0);
+		} else {
+			if (port->ams == GET_SOURCE_CAPABILITIES)
+				tcpm_ams_finish(port);
+			tcpm_pd_handle_state(port, SNK_NEGOTIATE_CAPABILITIES,
+					     POWER_NEGOTIATION, 0);
+		}
 		break;
 	case PD_DATA_REQUEST:
-		if (port->pwr_role != TYPEC_SOURCE ||
-		    cnt != 1) {
-			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
-			break;
-		}
-
 		/*
 		 * Adjust revision in subsequent message headers, as required,
 		 * to comply with 6.2.1.1.5 of the USB PD 3.0 spec. We don't
 		 * support Rev 1.0 so just reject in that scenario.
 		 */
 		if (rev == PD_REV10) {
-			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
+			tcpm_pd_handle_msg(port,
+					   port->negotiated_rev < PD_REV30 ?
+					   PD_MSG_CTRL_REJECT :
+					   PD_MSG_CTRL_NOT_SUPP,
+					   NONE_AMS);
 			break;
 		}
 
 		if (rev < PD_MAX_REV)
 			port->negotiated_rev = rev;
 
+		if (port->pwr_role != TYPEC_SOURCE || cnt != 1) {
+			tcpm_pd_handle_msg(port,
+					   port->negotiated_rev < PD_REV30 ?
+					   PD_MSG_CTRL_REJECT :
+					   PD_MSG_CTRL_NOT_SUPP,
+					   NONE_AMS);
+			break;
+		}
+
 		port->sink_request = le32_to_cpu(msg->payload[0]);
-		tcpm_set_state(port, SRC_NEGOTIATE_CAPABILITIES, 0);
+		if (port->state == SRC_SEND_CAPABILITIES)
+			tcpm_set_state(port, SRC_NEGOTIATE_CAPABILITIES, 0);
+		else
+			tcpm_pd_handle_state(port, SRC_NEGOTIATE_CAPABILITIES,
+					     POWER_NEGOTIATION, 0);
 		break;
 	case PD_DATA_SINK_CAP:
 		/* We don't do anything with this at the moment... */
@@ -2133,16 +2251,22 @@ static void tcpm_pd_data_request(struct tcpm_port *port,
 
 		port->nr_sink_caps = cnt;
 		port->sink_cap_done = true;
-		tcpm_set_state(port, SNK_READY, 0);
+		if (port->ams == GET_SINK_CAPABILITIES)
+			tcpm_pd_handle_state(port, ready_state(port), NONE_AMS, 0);
+		/* Unexpected Sink Capabilities */
+		else
+			tcpm_pd_handle_msg(port,
+					   port->negotiated_rev < PD_REV30 ?
+					   PD_MSG_CTRL_REJECT :
+					   PD_MSG_CTRL_NOT_SUPP,
+					   NONE_AMS);
 		break;
 	case PD_DATA_VENDOR_DEF:
 		tcpm_handle_vdm_request(port, msg->payload, cnt);
 		break;
 	case PD_DATA_BIST:
-		if (port->state == SRC_READY || port->state == SNK_READY) {
-			port->bist_request = le32_to_cpu(msg->payload[0]);
-			tcpm_set_state(port, BIST_RX, 0);
-		}
+		port->bist_request = le32_to_cpu(msg->payload[0]);
+		tcpm_pd_handle_state(port, BIST_RX, BIST, 0);
 		break;
 	case PD_DATA_ALERT:
 		tcpm_handle_alert(port, msg->payload, cnt);
@@ -2150,10 +2274,17 @@ static void tcpm_pd_data_request(struct tcpm_port *port,
 	case PD_DATA_BATT_STATUS:
 	case PD_DATA_GET_COUNTRY_INFO:
 		/* Currently unsupported */
-		tcpm_queue_message(port, PD_MSG_CTRL_NOT_SUPP);
+		tcpm_pd_handle_msg(port, port->negotiated_rev < PD_REV30 ?
+				   PD_MSG_CTRL_REJECT :
+				   PD_MSG_CTRL_NOT_SUPP,
+				   NONE_AMS);
 		break;
 	default:
-		tcpm_log(port, "Unhandled data message type %#x", type);
+		tcpm_pd_handle_msg(port, port->negotiated_rev < PD_REV30 ?
+				   PD_MSG_CTRL_REJECT :
+				   PD_MSG_CTRL_NOT_SUPP,
+				   NONE_AMS);
+		tcpm_log(port, "Unrecognized data message type %#x", type);
 		break;
 	}
 }
@@ -2178,26 +2309,10 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 	case PD_CTRL_PING:
 		break;
 	case PD_CTRL_GET_SOURCE_CAP:
-		switch (port->state) {
-		case SRC_READY:
-		case SNK_READY:
-			tcpm_queue_message(port, PD_MSG_DATA_SOURCE_CAP);
-			break;
-		default:
-			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
-			break;
-		}
+		tcpm_pd_handle_msg(port, PD_MSG_DATA_SOURCE_CAP, GET_SOURCE_CAPABILITIES);
 		break;
 	case PD_CTRL_GET_SINK_CAP:
-		switch (port->state) {
-		case SRC_READY:
-		case SNK_READY:
-			tcpm_queue_message(port, PD_MSG_DATA_SINK_CAP);
-			break;
-		default:
-			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
-			break;
-		}
+		tcpm_pd_handle_msg(port, PD_MSG_DATA_SINK_CAP, GET_SINK_CAPABILITIES);
 		break;
 	case PD_CTRL_GOTO_MIN:
 		break;
@@ -2236,6 +2351,11 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 			tcpm_set_state(port, FR_SWAP_SNK_SRC_NEW_SINK_READY, 0);
 			break;
 		default:
+			tcpm_pd_handle_state(port,
+					     port->pwr_role == TYPEC_SOURCE ?
+					     SRC_SOFT_RESET_WAIT_SNK_TX :
+					     SNK_SOFT_RESET,
+					     NONE_AMS, 0);
 			break;
 		}
 		break;
@@ -2282,6 +2402,11 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 			tcpm_set_state(port, ready_state(port), 0);
 			break;
 		default:
+			tcpm_pd_handle_state(port,
+					     port->pwr_role == TYPEC_SOURCE ?
+					     SRC_SOFT_RESET_WAIT_SNK_TX :
+					     SNK_SOFT_RESET,
+					     NONE_AMS, 0);
 			break;
 		}
 		break;
@@ -2298,8 +2423,6 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 			tcpm_set_state(port, SNK_TRANSITION_SINK, 0);
 			break;
 		case SOFT_RESET_SEND:
-			port->message_id = 0;
-			port->rx_msgid = -1;
 			if (port->ams == SOFT_RESET_AMS)
 				tcpm_ams_finish(port);
 			if (port->pwr_role == TYPEC_SOURCE) {
@@ -2322,57 +2445,45 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 			tcpm_set_state(port, FR_SWAP_SNK_SRC_TRANSITION_TO_OFF, 0);
 			break;
 		default:
+			tcpm_pd_handle_state(port,
+					     port->pwr_role == TYPEC_SOURCE ?
+					     SRC_SOFT_RESET_WAIT_SNK_TX :
+					     SNK_SOFT_RESET,
+					     NONE_AMS, 0);
 			break;
 		}
 		break;
 	case PD_CTRL_SOFT_RESET:
+		port->ams = SOFT_RESET_AMS;
 		tcpm_set_state(port, SOFT_RESET, 0);
 		break;
 	case PD_CTRL_DR_SWAP:
-		if (port->typec_caps.data != TYPEC_PORT_DRD) {
-			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
-			break;
-		}
 		/*
 		 * XXX
 		 * 6.3.9: If an alternate mode is active, a request to swap
 		 * alternate modes shall trigger a port reset.
 		 */
-		switch (port->state) {
-		case SRC_READY:
-		case SNK_READY:
-			tcpm_set_state(port, DR_SWAP_ACCEPT, 0);
-			break;
-		default:
-			tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
-			break;
-		}
+		if (port->typec_caps.data != TYPEC_PORT_DRD)
+			tcpm_pd_handle_msg(port,
+					   port->negotiated_rev < PD_REV30 ?
+					   PD_MSG_CTRL_REJECT :
+					   PD_MSG_CTRL_NOT_SUPP,
+					   NONE_AMS);
+		else
+			tcpm_pd_handle_state(port, DR_SWAP_ACCEPT, DATA_ROLE_SWAP, 0);
 		break;
 	case PD_CTRL_PR_SWAP:
-		if (port->port_type != TYPEC_PORT_DRP) {
-			tcpm_queue_message(port, PD_MSG_CTRL_REJECT);
-			break;
-		}
-		switch (port->state) {
-		case SRC_READY:
-		case SNK_READY:
-			tcpm_set_state(port, PR_SWAP_ACCEPT, 0);
-			break;
-		default:
-			tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
-			break;
-		}
+		if (port->port_type != TYPEC_PORT_DRP)
+			tcpm_pd_handle_msg(port,
+					   port->negotiated_rev < PD_REV30 ?
+					   PD_MSG_CTRL_REJECT :
+					   PD_MSG_CTRL_NOT_SUPP,
+					   NONE_AMS);
+		else
+			tcpm_pd_handle_state(port, PR_SWAP_ACCEPT, POWER_ROLE_SWAP, 0);
 		break;
 	case PD_CTRL_VCONN_SWAP:
-		switch (port->state) {
-		case SRC_READY:
-		case SNK_READY:
-			tcpm_set_state(port, VCONN_SWAP_ACCEPT, 0);
-			break;
-		default:
-			tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
-			break;
-		}
+		tcpm_pd_handle_state(port, VCONN_SWAP_ACCEPT, VCONN_SWAP, 0);
 		break;
 	case PD_CTRL_GET_SOURCE_CAP_EXT:
 	case PD_CTRL_GET_STATUS:
@@ -2380,10 +2491,19 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 	case PD_CTRL_GET_PPS_STATUS:
 	case PD_CTRL_GET_COUNTRY_CODES:
 		/* Currently not supported */
-		tcpm_queue_message(port, PD_MSG_CTRL_NOT_SUPP);
+		tcpm_pd_handle_msg(port,
+				   port->negotiated_rev < PD_REV30 ?
+				   PD_MSG_CTRL_REJECT :
+				   PD_MSG_CTRL_NOT_SUPP,
+				   NONE_AMS);
 		break;
 	default:
-		tcpm_log(port, "Unhandled ctrl message type %#x", type);
+		tcpm_pd_handle_msg(port,
+				   port->negotiated_rev < PD_REV30 ?
+				   PD_MSG_CTRL_REJECT :
+				   PD_MSG_CTRL_NOT_SUPP,
+				   NONE_AMS);
+		tcpm_log(port, "Unrecognized ctrl message type %#x", type);
 		break;
 	}
 }
@@ -2395,11 +2515,13 @@ static void tcpm_pd_ext_msg_request(struct tcpm_port *port,
 	unsigned int data_size = pd_ext_header_data_size_le(msg->ext_msg.header);
 
 	if (!(msg->ext_msg.header & PD_EXT_HDR_CHUNKED)) {
+		tcpm_pd_handle_msg(port, PD_MSG_CTRL_NOT_SUPP, NONE_AMS);
 		tcpm_log(port, "Unchunked extended messages unsupported");
 		return;
 	}
 
 	if (data_size > PD_EXT_MAX_CHUNK_DATA) {
+		tcpm_pd_handle_state(port, CHUNK_NOT_SUPP, NONE_AMS, PD_T_CHUNK_NOT_SUPP);
 		tcpm_log(port, "Chunk handling not yet supported");
 		return;
 	}
@@ -2412,16 +2534,18 @@ static void tcpm_pd_ext_msg_request(struct tcpm_port *port,
 		 */
 		if (msg->ext_msg.data[USB_PD_EXT_SDB_EVENT_FLAGS] &
 		    USB_PD_EXT_SDB_PPS_EVENTS)
-			tcpm_set_state(port, GET_PPS_STATUS_SEND, 0);
+			tcpm_pd_handle_state(port, GET_PPS_STATUS_SEND,
+					     GETTING_SOURCE_SINK_STATUS, 0);
+
 		else
-			tcpm_set_state(port, ready_state(port), 0);
+			tcpm_pd_handle_state(port, ready_state(port), NONE_AMS, 0);
 		break;
 	case PD_EXT_PPS_STATUS:
 		/*
 		 * For now the PPS status message is used to clear events
 		 * and nothing more.
 		 */
-		tcpm_set_state(port, ready_state(port), 0);
+		tcpm_pd_handle_state(port, ready_state(port), NONE_AMS, 0);
 		break;
 	case PD_EXT_SOURCE_CAP_EXT:
 	case PD_EXT_GET_BATT_CAP:
@@ -2435,10 +2559,11 @@ static void tcpm_pd_ext_msg_request(struct tcpm_port *port,
 	case PD_EXT_FW_UPDATE_RESPONSE:
 	case PD_EXT_COUNTRY_INFO:
 	case PD_EXT_COUNTRY_CODES:
-		tcpm_queue_message(port, PD_MSG_CTRL_NOT_SUPP);
+		tcpm_pd_handle_msg(port, PD_MSG_CTRL_NOT_SUPP, NONE_AMS);
 		break;
 	default:
-		tcpm_log(port, "Unhandled extended message type %#x", type);
+		tcpm_pd_handle_msg(port, PD_MSG_CTRL_NOT_SUPP, NONE_AMS);
+		tcpm_log(port, "Unrecognized extended message type %#x", type);
 		break;
 	}
 }
@@ -2551,7 +2676,12 @@ static bool tcpm_send_queued_message(struct tcpm_port *port)
 			tcpm_pd_send_control(port, PD_CTRL_NOT_SUPP);
 			break;
 		case PD_MSG_DATA_SINK_CAP:
-			tcpm_pd_send_sink_caps(port);
+			ret = tcpm_pd_send_sink_caps(port);
+			if (ret < 0) {
+				tcpm_log(port, "Unable to send snk caps, ret=%d", ret);
+				tcpm_set_state(port, SNK_SOFT_RESET, 0);
+			}
+			tcpm_ams_finish(port);
 			break;
 		case PD_MSG_DATA_SOURCE_CAP:
 			ret = tcpm_pd_send_source_caps(port);
@@ -2561,8 +2691,11 @@ static bool tcpm_send_queued_message(struct tcpm_port *port)
 					 ret);
 				tcpm_set_state(port, SOFT_RESET_SEND, 0);
 			} else if (port->pwr_role == TYPEC_SOURCE) {
+				tcpm_ams_finish(port);
 				tcpm_set_state(port, HARD_RESET_SEND,
 					       PD_T_SENDER_RESPONSE);
+			} else {
+				tcpm_ams_finish(port);
 			}
 			break;
 		default:
@@ -3581,6 +3714,11 @@ static void run_state_machine(struct tcpm_port *port)
 
 		if (port->ams != NONE_AMS)
 			tcpm_ams_finish(port);
+		if (port->next_ams != NONE_AMS) {
+			port->ams = port->next_ams;
+			port->next_ams = NONE_AMS;
+		}
+
 		/*
 		 * If previous AMS is interrupted, switch to the upcoming
 		 * state.
@@ -3821,6 +3959,11 @@ static void run_state_machine(struct tcpm_port *port)
 
 		if (port->ams != NONE_AMS)
 			tcpm_ams_finish(port);
+		if (port->next_ams != NONE_AMS) {
+			port->ams = port->next_ams;
+			port->next_ams = NONE_AMS;
+		}
+
 		/*
 		 * If previous AMS is interrupted, switch to the upcoming
 		 * state.
@@ -3968,6 +4111,7 @@ static void run_state_machine(struct tcpm_port *port)
 		port->message_id = 0;
 		port->rx_msgid = -1;
 		tcpm_pd_send_control(port, PD_CTRL_ACCEPT);
+		tcpm_ams_finish(port);
 		if (port->pwr_role == TYPEC_SOURCE) {
 			port->upcoming_state = SRC_SEND_CAPABILITIES;
 			tcpm_ams_start(port, POWER_NEGOTIATION);
@@ -4004,6 +4148,7 @@ static void run_state_machine(struct tcpm_port *port)
 		break;
 	case DR_SWAP_SEND_TIMEOUT:
 		tcpm_swap_complete(port, -ETIMEDOUT);
+		tcpm_ams_finish(port);
 		tcpm_set_state(port, ready_state(port), 0);
 		break;
 	case DR_SWAP_CHANGE_DR:
@@ -4016,6 +4161,7 @@ static void run_state_machine(struct tcpm_port *port)
 				       TYPEC_HOST);
 			port->send_discover = true;
 		}
+		tcpm_ams_finish(port);
 		tcpm_set_state(port, ready_state(port), 0);
 		break;
 
@@ -4143,6 +4289,7 @@ static void run_state_machine(struct tcpm_port *port)
 
 	case VCONN_SWAP_ACCEPT:
 		tcpm_pd_send_control(port, PD_CTRL_ACCEPT);
+		tcpm_ams_finish(port);
 		tcpm_set_state(port, VCONN_SWAP_START, 0);
 		break;
 	case VCONN_SWAP_SEND:
@@ -4260,6 +4407,12 @@ static void run_state_machine(struct tcpm_port *port)
 		port->upcoming_state = INVALID_STATE;
 		tcpm_set_state(port, upcoming_state, 0);
 		break;
+
+	/* Chunk state */
+	case CHUNK_NOT_SUPP:
+		tcpm_pd_send_control(port, PD_CTRL_NOT_SUPP);
+		tcpm_set_state(port, port->pwr_role == TYPEC_SOURCE ? SRC_READY : SNK_READY, 0);
+		break;
 	default:
 		WARN(1, "Unexpected port state %d\n", port->state);
 		break;
@@ -4689,6 +4842,8 @@ static void _tcpm_pd_hard_reset(struct tcpm_port *port)
 
 	if (port->ams != NONE_AMS)
 		port->ams = NONE_AMS;
+	if (port->hard_reset_count < PD_N_HARD_RESET_COUNT)
+		port->ams = HARD_RESET;
 	/*
 	 * If we keep receiving hard reset requests, executing the hard reset
 	 * must have failed. Revert to error recovery if that happens.
diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h
index 79599b90ba55..272454f9cd67 100644
--- a/include/linux/usb/pd.h
+++ b/include/linux/usb/pd.h
@@ -480,6 +480,7 @@ static inline unsigned int rdo_max_power(u32 rdo)
 #define PD_T_SWAP_SRC_START	20	/* Minimum of 20ms */
 #define PD_T_BIST_CONT_MODE	50	/* 30 - 60 ms */
 #define PD_T_SINK_TX		16	/* 16 - 20 ms */
+#define PD_T_CHUNK_NOT_SUPP	42	/* 40 - 50 ms */
 
 #define PD_T_DRP_TRY		100	/* 75 - 150 ms */
 #define PD_T_DRP_TRYWAIT	600	/* 400 - 800 ms */
-- 
cgit v1.2.3


From 51dfb6ca3728bd0a0a3c23776a12d2a15a1d2457 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Wed, 20 Jan 2021 23:58:44 +0300
Subject: regulator: consumer: Add missing stubs to regulator/consumer.h

Add missing stubs to regulator/consumer.h in order to fix COMPILE_TEST
of the kernel. In particular this should fix compile-testing of OPP core
because of a missing stub for regulator_sync_voltage().

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Link: https://lore.kernel.org/r/20210120205844.12658-1-digetx@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/consumer.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 2024944fd2f7..20e84a84fb77 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -331,6 +331,12 @@ regulator_get_exclusive(struct device *dev, const char *id)
 	return ERR_PTR(-ENODEV);
 }
 
+static inline struct regulator *__must_check
+devm_regulator_get_exclusive(struct device *dev, const char *id)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline struct regulator *__must_check
 regulator_get_optional(struct device *dev, const char *id)
 {
@@ -486,6 +492,11 @@ static inline int regulator_get_voltage(struct regulator *regulator)
 	return -EINVAL;
 }
 
+static inline int regulator_sync_voltage(struct regulator *regulator)
+{
+	return -EINVAL;
+}
+
 static inline int regulator_is_supported_voltage(struct regulator *regulator,
 				   int min_uV, int max_uV)
 {
@@ -578,6 +589,25 @@ static inline int devm_regulator_unregister_notifier(struct regulator *regulator
 	return 0;
 }
 
+static inline int regulator_suspend_enable(struct regulator_dev *rdev,
+					   suspend_state_t state)
+{
+	return -EINVAL;
+}
+
+static inline int regulator_suspend_disable(struct regulator_dev *rdev,
+					    suspend_state_t state)
+{
+	return -EINVAL;
+}
+
+static inline int regulator_set_suspend_voltage(struct regulator *regulator,
+						int min_uV, int max_uV,
+						suspend_state_t state)
+{
+	return -EINVAL;
+}
+
 static inline void *regulator_get_drvdata(struct regulator *regulator)
 {
 	return NULL;
-- 
cgit v1.2.3


From 742d33729a0df11c9d8d4625dbf21dd20cdefd44 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 20 Jan 2021 14:34:23 +0000
Subject: mm: Move immutable fields of 'struct vm_fault' into anonymous struct

'struct vm_fault' contains both information about the fault being
serviced alongside mutable fields contributing to the state of the
fault-handling logic. Unfortunately, the distinction between the two is
not clear-cut, and a number of callers end up manipulating the structure
temporarily before restoring it when returning.

Try to clean this up by moving the immutable fault information into an
anonymous struct, which will later be marked as 'const'. Ideally, the
'flags' field would be part of the new structure too, but it seems as
though the ->page_mkwrite() path is not ready for this yet.

Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/CAHk-=whYs9XsO88iqJzN6NC=D-dp2m0oYXuOoZ=eWnvv=5OA+w@mail.gmail.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/mm.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 251a2339befb..b4a5cb9bff7d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -517,11 +517,14 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags)
  * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
-	struct vm_area_struct *vma;	/* Target VMA */
-	unsigned int flags;		/* FAULT_FLAG_xxx flags */
-	gfp_t gfp_mask;			/* gfp mask to be used for allocations */
-	pgoff_t pgoff;			/* Logical page offset based on vma */
-	unsigned long address;		/* Faulting virtual address */
+	struct {
+		struct vm_area_struct *vma;	/* Target VMA */
+		gfp_t gfp_mask;			/* gfp mask to be used for allocations */
+		pgoff_t pgoff;			/* Logical page offset based on vma */
+		unsigned long address;		/* Faulting virtual address */
+	};
+	unsigned int flags;		/* FAULT_FLAG_xxx flags
+					 * XXX: should really be 'const' */
 	pmd_t *pmd;			/* Pointer to pmd entry matching
 					 * the 'address' */
 	pud_t *pud;			/* Pointer to pud entry matching
-- 
cgit v1.2.3


From 9d3af4b448a119ac81378d3bc775f1c4a2a7ff36 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 14 Jan 2021 15:24:19 +0000
Subject: mm: Pass 'address' to map to do_set_pte() and drop
 FAULT_FLAG_PREFAULT

Rather than modifying the 'address' field of the 'struct vm_fault'
passed to do_set_pte(), leave that to identify the real faulting address
and pass in the virtual address to be mapped by the new pte as a
separate argument.

This makes FAULT_FLAG_PREFAULT redundant, as a prefault entry can be
identified simply by comparing the new address parameter with the
faulting address, so remove the redundant flag at the same time.

Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/mm.h |  7 ++-----
 mm/filemap.c       | 21 +++++++--------------
 mm/memory.c        | 10 +++++-----
 3 files changed, 14 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b4a5cb9bff7d..e0f056753bef 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -434,7 +434,6 @@ extern pgprot_t protection_map[16];
  * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
  * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
  * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
- * @FAULT_FLAG_PREFAULT: Fault was a prefault.
  *
  * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
  * whether we would allow page faults to retry by specifying these two
@@ -465,7 +464,6 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_REMOTE			0x80
 #define FAULT_FLAG_INSTRUCTION  		0x100
 #define FAULT_FLAG_INTERRUPTIBLE		0x200
-#define FAULT_FLAG_PREFAULT			0x400
 
 /*
  * The default fault flags that should be used by most of the
@@ -503,8 +501,7 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags)
 	{ FAULT_FLAG_USER,		"USER" }, \
 	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
 	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
-	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }, \
-	{ FAULT_FLAG_PREFAULT,		"PREFAULT" }
+	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }
 
 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
@@ -995,7 +992,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 }
 
 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
-void do_set_pte(struct vm_fault *vmf, struct page *page);
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
 
 vm_fault_t finish_fault(struct vm_fault *vmf);
 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
diff --git a/mm/filemap.c b/mm/filemap.c
index a6dc97906c8e..fb7a8d9b5603 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3018,8 +3018,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	pgoff_t last_pgoff = start_pgoff;
-	unsigned long address = vmf->address;
-	unsigned long flags = vmf->flags;
+	unsigned long addr;
 	XA_STATE(xas, &mapping->i_pages, start_pgoff);
 	struct page *head, *page;
 	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
@@ -3035,8 +3034,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		goto out;
 	}
 
-	vmf->address = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
+	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
 	do {
 		page = find_subpage(head, xas.xa_index);
 		if (PageHWPoison(page))
@@ -3045,7 +3044,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		if (mmap_miss > 0)
 			mmap_miss--;
 
-		vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
+		addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
 		vmf->pte += xas.xa_index - last_pgoff;
 		last_pgoff = xas.xa_index;
 
@@ -3053,16 +3052,12 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 			goto unlock;
 
 		/* We're about to handle the fault */
-		if (vmf->address == address) {
-			vmf->flags &= ~FAULT_FLAG_PREFAULT;
+		if (vmf->address == addr)
 			ret = VM_FAULT_NOPAGE;
-		} else {
-			vmf->flags |= FAULT_FLAG_PREFAULT;
-		}
 
-		do_set_pte(vmf, page);
+		do_set_pte(vmf, page, addr);
 		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, vmf->address, vmf->pte);
+		update_mmu_cache(vma, addr, vmf->pte);
 		unlock_page(head);
 		continue;
 unlock:
@@ -3072,8 +3067,6 @@ unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
 	rcu_read_unlock();
-	vmf->flags = flags;
-	vmf->address = address;
 	WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
 	return ret;
 }
diff --git a/mm/memory.c b/mm/memory.c
index f0e7c589ca9d..7b1307873325 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3733,11 +3733,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 }
 #endif
 
-void do_set_pte(struct vm_fault *vmf, struct page *page)
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
-	bool prefault = vmf->flags & FAULT_FLAG_PREFAULT;
+	bool prefault = vmf->address != addr;
 	pte_t entry;
 
 	flush_icache_page(vma, page);
@@ -3753,13 +3753,13 @@ void do_set_pte(struct vm_fault *vmf, struct page *page)
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-		page_add_new_anon_rmap(page, vma, vmf->address, false);
+		page_add_new_anon_rmap(page, vma, addr, false);
 		lru_cache_add_inactive_or_unevictable(page, vma);
 	} else {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page, false);
 	}
-	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
 }
 
 /**
@@ -3819,7 +3819,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	ret = 0;
 	/* Re-check under ptl */
 	if (likely(pte_none(*vmf->pte)))
-		do_set_pte(vmf, page);
+		do_set_pte(vmf, page, vmf->address);
 	else
 		ret = VM_FAULT_NOPAGE;
 
-- 
cgit v1.2.3


From 5857c9209ce58f8e262889539ccdf63e73ad7a93 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 14 Jan 2021 15:44:09 +0000
Subject: mm: Mark anonymous struct field of 'struct vm_fault' as 'const'

The fields of this struct are only ever read after being initialised, so
mark it 'const' before somebody tries to modify it again. GCC will then
complain (with an error) about modification of these fields after they
have been initialised, although LLVM currently allows them without even
a warning:

https://bugs.llvm.org/show_bug.cgi?id=48755

Hopefully, future versions of LLVM will emit a warning.

Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e0f056753bef..7ff3d9817d38 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -514,7 +514,7 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags)
  * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
-	struct {
+	const struct {
 		struct vm_area_struct *vma;	/* Target VMA */
 		gfp_t gfp_mask;			/* gfp mask to be used for allocations */
 		pgoff_t pgoff;			/* Logical page offset based on vma */
-- 
cgit v1.2.3


From 464e96aeb16ab4e071d353f69ebbddfa08c8d731 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Fri, 27 Nov 2020 11:15:43 -0800
Subject: keys: remove trailing semicolon in macro definition

The macro use will already have a semicolon.

Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Ben Boeckel <mathstuf@gmail.com>
---
 include/linux/key.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index 0f2e24f13c2b..1b0837c975b9 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -360,7 +360,7 @@ static inline struct key *request_key(struct key_type *type,
  * completion of keys undergoing construction with a non-interruptible wait.
  */
 #define request_key_net(type, description, net, callout_info) \
-	request_key_tag(type, description, net->key_domain, callout_info);
+	request_key_tag(type, description, net->key_domain, callout_info)
 
 /**
  * request_key_net_rcu - Request a key for a net namespace under RCU conditions
@@ -372,7 +372,7 @@ static inline struct key *request_key(struct key_type *type,
  * network namespace are used.
  */
 #define request_key_net_rcu(type, description, net) \
-	request_key_rcu(type, description, net->key_domain);
+	request_key_rcu(type, description, net->key_domain)
 #endif /* CONFIG_NET */
 
 extern int wait_for_key_construction(struct key *key, bool intr);
-- 
cgit v1.2.3


From f14602caf4faef18999985bc87a414b552844ad2 Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@linux.microsoft.com>
Date: Fri, 20 Nov 2020 19:04:22 +0100
Subject: PKCS#7: Fix missing include
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add missing linux/types.h for size_t.

[DH: Changed from stddef.h]

Signed-off-by: Mickaël Salaün <mic@linux.microsoft.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Ben Boeckel <mathstuf@gmail.com>
---
 include/linux/verification.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/verification.h b/include/linux/verification.h
index 911ab7c2b1ab..a655923335ae 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -8,6 +8,8 @@
 #ifndef _LINUX_VERIFICATION_H
 #define _LINUX_VERIFICATION_H
 
+#include <linux/types.h>
+
 /*
  * Indicate that both builtin trusted keys and secondary trusted keys
  * should be used.
-- 
cgit v1.2.3


From 4993e1f9479a4161fd7d93e2b8b30b438f00cb0f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 20 Nov 2020 19:04:23 +0100
Subject: certs: Fix blacklist flag type confusion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KEY_FLAG_KEEP is not meant to be passed to keyring_alloc() or key_alloc(),
as these only take KEY_ALLOC_* flags.  KEY_FLAG_KEEP has the same value as
KEY_ALLOC_BYPASS_RESTRICTION, but fortunately only key_create_or_update()
uses it.  LSMs using the key_alloc hook don't check that flag.

KEY_FLAG_KEEP is then ignored but fortunately (again) the root user cannot
write to the blacklist keyring, so it is not possible to remove a key/hash
from it.

Fix this by adding a KEY_ALLOC_SET_KEEP flag that tells key_alloc() to set
KEY_FLAG_KEEP on the new key.  blacklist_init() can then, correctly, pass
this to keyring_alloc().

We can also use this in ima_mok_init() rather than setting the flag
manually.

Note that this doesn't fix an observable bug with the current
implementation but it is required to allow addition of new hashes to the
blacklist in the future without making it possible for them to be removed.

Fixes: 734114f8782f ("KEYS: Add a system blacklist keyring")
Reported-by: Mickaël Salaün <mic@linux.microsoft.com>
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Mickaël Salaün <mic@linux.microsoft.com>
cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: David Woodhouse <dwmw2@infradead.org>
---
 certs/blacklist.c                | 2 +-
 include/linux/key.h              | 1 +
 security/integrity/ima/ima_mok.c | 5 ++---
 security/keys/key.c              | 2 ++
 4 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/certs/blacklist.c b/certs/blacklist.c
index a888b934a1cd..029471947838 100644
--- a/certs/blacklist.c
+++ b/certs/blacklist.c
@@ -162,7 +162,7 @@ static int __init blacklist_init(void)
 			      KEY_USR_VIEW | KEY_USR_READ |
 			      KEY_USR_SEARCH,
 			      KEY_ALLOC_NOT_IN_QUOTA |
-			      KEY_FLAG_KEEP,
+			      KEY_ALLOC_SET_KEEP,
 			      NULL, NULL);
 	if (IS_ERR(blacklist_keyring))
 		panic("Can't allocate system blacklist keyring\n");
diff --git a/include/linux/key.h b/include/linux/key.h
index 1b0837c975b9..7febc4881363 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -289,6 +289,7 @@ extern struct key *key_alloc(struct key_type *type,
 #define KEY_ALLOC_BUILT_IN		0x0004	/* Key is built into kernel */
 #define KEY_ALLOC_BYPASS_RESTRICTION	0x0008	/* Override the check on restricted keyrings */
 #define KEY_ALLOC_UID_KEYRING		0x0010	/* allocating a user or user session keyring */
+#define KEY_ALLOC_SET_KEEP		0x0020	/* Set the KEEP flag on the key/keyring */
 
 extern void key_revoke(struct key *key);
 extern void key_invalidate(struct key *key);
diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c
index 36cadadbfba4..1e5c01916173 100644
--- a/security/integrity/ima/ima_mok.c
+++ b/security/integrity/ima/ima_mok.c
@@ -38,13 +38,12 @@ __init int ima_mok_init(void)
 				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				KEY_USR_VIEW | KEY_USR_READ |
 				KEY_USR_WRITE | KEY_USR_SEARCH,
-				KEY_ALLOC_NOT_IN_QUOTA,
+				KEY_ALLOC_NOT_IN_QUOTA |
+				KEY_ALLOC_SET_KEEP,
 				restriction, NULL);
 
 	if (IS_ERR(ima_blacklist_keyring))
 		panic("Can't allocate IMA blacklist keyring.");
-
-	set_bit(KEY_FLAG_KEEP, &ima_blacklist_keyring->flags);
 	return 0;
 }
 device_initcall(ima_mok_init);
diff --git a/security/keys/key.c b/security/keys/key.c
index ebe752b137aa..c45afdd1dfbb 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -303,6 +303,8 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 		key->flags |= 1 << KEY_FLAG_BUILTIN;
 	if (flags & KEY_ALLOC_UID_KEYRING)
 		key->flags |= 1 << KEY_FLAG_UID_KEYRING;
+	if (flags & KEY_ALLOC_SET_KEEP)
+		key->flags |= 1 << KEY_FLAG_KEEP;
 
 #ifdef KEY_DEBUGGING
 	key->magic = KEY_DEBUG_MAGIC;
-- 
cgit v1.2.3


From 4c9a3a6c9c5478ce6b4b9db6c964244474676bcb Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 14 Jan 2021 09:04:37 +0100
Subject: parport: fix a kernel-doc markup

The kernel-doc markup inside share.c is actually for
__parport_register_driver. The actual goal seems to be
to document parport_register_driver().

So, fix the existing markup and add a new one.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/dc0778af8c466cc667409ead05876a5cfd3cbece.1610610937.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 drivers/parport/share.c |  2 +-
 include/linux/parport.h | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/parport/share.c b/drivers/parport/share.c
index 7fec4fefe151..62f8407923d4 100644
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -243,7 +243,7 @@ static int port_detect(struct device *dev, void *dev_drv)
 }
 
 /**
- *	parport_register_driver - register a parallel port device driver
+ *	__parport_register_driver - register a parallel port device driver
  *	@drv: structure describing the driver
  *	@owner: owner module of drv
  *	@mod_name: module name string
diff --git a/include/linux/parport.h b/include/linux/parport.h
index 1fb508c19e83..f981f794c850 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -297,6 +297,37 @@ int __must_check __parport_register_driver(struct parport_driver *,
  * parport_register_driver must be a macro so that KBUILD_MODNAME can
  * be expanded
  */
+
+/**
+ *	parport_register_driver - register a parallel port device driver
+ *	@driver: structure describing the driver
+ *
+ *	This can be called by a parallel port device driver in order
+ *	to receive notifications about ports being found in the
+ *	system, as well as ports no longer available.
+ *
+ *	If devmodel is true then the new device model is used
+ *	for registration.
+ *
+ *	The @driver structure is allocated by the caller and must not be
+ *	deallocated until after calling parport_unregister_driver().
+ *
+ *	If using the non device model:
+ *	The driver's attach() function may block.  The port that
+ *	attach() is given will be valid for the duration of the
+ *	callback, but if the driver wants to take a copy of the
+ *	pointer it must call parport_get_port() to do so.  Calling
+ *	parport_register_device() on that port will do this for you.
+ *
+ *	The driver's detach() function may block.  The port that
+ *	detach() is given will be valid for the duration of the
+ *	callback, but if the driver wants to take a copy of the
+ *	pointer it must call parport_get_port() to do so.
+ *
+ *
+ *	Returns 0 on success. The non device model will always succeeds.
+ *	but the new device model can fail and will return the error code.
+ **/
 #define parport_register_driver(driver)             \
 	__parport_register_driver(driver, THIS_MODULE, KBUILD_MODNAME)
 
-- 
cgit v1.2.3


From e23bd83368af41ce420bd0eacb32817ccf7d3ca2 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 14 Jan 2021 09:04:41 +0100
Subject: firmware: stratix10-svc: fix kernel-doc markups

There are some common comments marked, instead, with kernel-doc
notation, which won't work.

While here, rename an identifier, in order to match the
function prototype below kernel-doc markup.

Acked-by: Richard Gong <richard.gong@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/02a1eb47767e01e875d8840805b8b2d4f3c6bdee.1610610937.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/firmware/intel/stratix10-svc-client.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index a93d85932eb9..ebc295647581 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -6,7 +6,7 @@
 #ifndef __STRATIX10_SVC_CLIENT_H
 #define __STRATIX10_SVC_CLIENT_H
 
-/**
+/*
  * Service layer driver supports client names
  *
  * fpga: for FPGA configuration
@@ -15,7 +15,7 @@
 #define SVC_CLIENT_FPGA			"fpga"
 #define SVC_CLIENT_RSU			"rsu"
 
-/**
+/*
  * Status of the sent command, in bit number
  *
  * SVC_STATUS_OK:
@@ -50,7 +50,7 @@
 #define SVC_STATUS_ERROR		5
 #define SVC_STATUS_NO_SUPPORT		6
 
-/**
+/*
  * Flag bit for COMMAND_RECONFIG
  *
  * COMMAND_RECONFIG_FLAG_PARTIAL:
@@ -58,7 +58,7 @@
  */
 #define COMMAND_RECONFIG_FLAG_PARTIAL	1
 
-/**
+/*
  * Timeout settings for service clients:
  * timeout value used in Stratix10 FPGA manager driver.
  * timeout value used in RSU driver
@@ -218,7 +218,7 @@ void stratix10_svc_free_memory(struct stratix10_svc_chan *chan, void *kaddr);
 int stratix10_svc_send(struct stratix10_svc_chan *chan, void *msg);
 
 /**
- * intel_svc_done() - complete service request
+ * stratix10_svc_done() - complete service request
  * @chan: service channel assigned to the client
  *
  * This function is used by service client to inform service layer that
-- 
cgit v1.2.3


From 3aa1141f9916af3258e2bc69e294e2cc07d2ddf1 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 14 Jan 2021 09:04:42 +0100
Subject: connector: fix a kernel-doc markup

A function has a different name between their prototype
and its kernel-doc markup.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/889cfb141a98ae06d5bc79b744786ec2e8f92d93.1610610937.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/connector.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/connector.h b/include/linux/connector.h
index 8ea860efea37..487350bb19c3 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -99,7 +99,7 @@ void cn_del_callback(const struct cb_id *id);
 int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 group, gfp_t gfp_mask);
 
 /**
- * cn_netlink_send_mult - Sends message to the specified groups.
+ * cn_netlink_send - Sends message to the specified groups.
  *
  * @msg:	message header(with attached data).
  * @portid:	destination port.
-- 
cgit v1.2.3


From 909782ad0a36e4e5f875a431e280cba7b9b046fa Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 14 Jan 2021 09:04:44 +0100
Subject: memblock: fix kernel-doc markups

Some identifiers have different names between their prototypes
and the kernel-doc markup.

Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/f3c65f61367993a607f9daf9dc1a3bdab1f0a040.1610610937.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/memblock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index b93c44b9121e..9cc6da7b513e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -272,7 +272,7 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
 				  unsigned long *out_spfn,
 				  unsigned long *out_epfn);
 /**
- * for_each_free_mem_range_in_zone - iterate through zone specific free
+ * for_each_free_mem_pfn_range_in_zone - iterate through zone specific free
  * memblock areas
  * @i: u64 used as loop variable
  * @zone: zone in which all of the memory blocks reside
@@ -292,7 +292,7 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
 	     __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
 
 /**
- * for_each_free_mem_range_in_zone_from - iterate through zone specific
+ * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific
  * free memblock areas from a given point
  * @i: u64 used as loop variable
  * @zone: zone in which all of the memory blocks reside
-- 
cgit v1.2.3


From 484cac791015e786f857c1cd98b7696aaf1e8a7a Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 14 Jan 2021 09:04:45 +0100
Subject: w1: fix a kernel-doc markup

A function has a different name between their prototype
and its kernel-doc markup.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/2dc136ff6290d7c8919599d21bee244f31647c8c.1610610937.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 include/linux/w1.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/w1.h b/include/linux/w1.h
index 949d3b10e531..9a2a0ef39018 100644
--- a/include/linux/w1.h
+++ b/include/linux/w1.h
@@ -280,7 +280,7 @@ int w1_register_family(struct w1_family *family);
 void w1_unregister_family(struct w1_family *family);
 
 /**
- * module_w1_driver() - Helper macro for registering a 1-Wire families
+ * module_w1_family() - Helper macro for registering a 1-Wire families
  * @__w1_family: w1_family struct
  *
  * Helper macro for 1-Wire families which do not do anything special in module
-- 
cgit v1.2.3


From 660d2062190db131d2feaf19914e90f868fe285c Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 13 Jan 2021 10:11:35 +0100
Subject: crypto - shash: reduce minimum alignment of shash_desc structure

Unlike many other structure types defined in the crypto API, the
'shash_desc' structure is permitted to live on the stack, which
implies its contents may not be accessed by DMA masters. (This is
due to the fact that the stack may be located in the vmalloc area,
which requires a different virtual-to-physical translation than the
one implemented by the DMA subsystem)

Our definition of CRYPTO_MINALIGN_ATTR is based on ARCH_KMALLOC_MINALIGN,
which may take DMA constraints into account on architectures that support
non-cache coherent DMA such as ARM and arm64. In this case, the value is
chosen to reflect the largest cacheline size in the system, in order to
ensure that explicit cache maintenance as required by non-coherent DMA
masters does not affect adjacent, unrelated slab allocations. On arm64,
this value is currently set at 128 bytes.

This means that applying CRYPTO_MINALIGN_ATTR to struct shash_desc is both
unnecessary (as it is never used for DMA), and undesirable, given that it
wastes stack space (on arm64, performing the alignment costs 112 bytes in
the worst case, and the hole between the 'tfm' and '__ctx' members takes
up another 120 bytes, resulting in an increased stack footprint of up to
232 bytes.) So instead, let's switch to the minimum SLAB alignment, which
does not take DMA constraints into account.

Note that this is a no-op for x86.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/crypto/hash.h  | 8 ++++----
 include/linux/crypto.h | 9 ++++++---
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index af2ff31ff619..13f8a6a54ca8 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -149,7 +149,7 @@ struct ahash_alg {
 
 struct shash_desc {
 	struct crypto_shash *tfm;
-	void *__ctx[] CRYPTO_MINALIGN_ATTR;
+	void *__ctx[] __aligned(ARCH_SLAB_MINALIGN);
 };
 
 #define HASH_MAX_DIGESTSIZE	 64
@@ -162,9 +162,9 @@ struct shash_desc {
 
 #define HASH_MAX_STATESIZE	512
 
-#define SHASH_DESC_ON_STACK(shash, ctx)				  \
-	char __##shash##_desc[sizeof(struct shash_desc) +	  \
-		HASH_MAX_DESCSIZE] CRYPTO_MINALIGN_ATTR; \
+#define SHASH_DESC_ON_STACK(shash, ctx)					     \
+	char __##shash##_desc[sizeof(struct shash_desc) + HASH_MAX_DESCSIZE] \
+		__aligned(__alignof__(struct shash_desc));		     \
 	struct shash_desc *shash = (struct shash_desc *)__##shash##_desc
 
 /**
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 9b55cd6b1f1b..da5e0d74bb2f 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -151,9 +151,12 @@
  * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual
  * declaration) is used to ensure that the crypto_tfm context structure is
  * aligned correctly for the given architecture so that there are no alignment
- * faults for C data types.  In particular, this is required on platforms such
- * as arm where pointers are 32-bit aligned but there are data types such as
- * u64 which require 64-bit alignment.
+ * faults for C data types.  On architectures that support non-cache coherent
+ * DMA, such as ARM or arm64, it also takes into account the minimal alignment
+ * that is required to ensure that the context struct member does not share any
+ * cachelines with the rest of the struct. This is needed to ensure that cache
+ * maintenance for non-coherent DMA (cache invalidation in particular) does not
+ * affect data that may be accessed by the CPU concurrently.
  */
 #define CRYPTO_MINALIGN ARCH_KMALLOC_MINALIGN
 
-- 
cgit v1.2.3


From 239319670e2a7c405eeb4b3e7721cf8bf8eef840 Mon Sep 17 00:00:00 2001
From: Ye Xiang <xiang.ye@intel.com>
Date: Tue, 15 Dec 2020 13:44:42 +0800
Subject: HID: hid-sensor-custom: Add custom sensor iio support

Currently custom sensors properties are not decoded and it is up to
user space to interpret.

Some manufacturers already standardized the meaning of some custom sensors.
They can be presented as a proper IIO sensor. We can identify these sensors
based on manufacturer and serial number property in the report.

This change is identifying hinge sensor when the manufacturer is "INTEL".
This creates a platform device so that a sensor driver can be loaded to
process these sensors.

Signed-off-by: Ye Xiang <xiang.ye@intel.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Link: https://lore.kernel.org/r/20201215054444.9324-2-xiang.ye@intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/hid/hid-sensor-custom.c | 143 ++++++++++++++++++++++++++++++++++++++++
 include/linux/hid-sensor-ids.h  |  14 ++++
 2 files changed, 157 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-sensor-custom.c b/drivers/hid/hid-sensor-custom.c
index 4d25577a8573..2628bc53ed80 100644
--- a/drivers/hid/hid-sensor-custom.c
+++ b/drivers/hid/hid-sensor-custom.c
@@ -4,6 +4,7 @@
  * Copyright (c) 2015, Intel Corporation.
  */
 
+#include <linux/ctype.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -21,6 +22,7 @@
 #define HID_CUSTOM_TOTAL_ATTRS		(HID_CUSTOM_MAX_CORE_ATTRS + 1)
 #define HID_CUSTOM_FIFO_SIZE		4096
 #define HID_CUSTOM_MAX_FEATURE_BYTES	64
+#define HID_SENSOR_USAGE_LENGTH (4 + 1)
 
 struct hid_sensor_custom_field {
 	int report_id;
@@ -50,6 +52,7 @@ struct hid_sensor_custom {
 	struct kfifo data_fifo;
 	unsigned long misc_opened;
 	wait_queue_head_t wait;
+	struct platform_device *custom_pdev;
 };
 
 /* Header for each sample to user space via dev interface */
@@ -746,11 +749,130 @@ static void hid_sensor_custom_dev_if_remove(struct hid_sensor_custom
 
 }
 
+/* luid defined in FW (e.g. ISH).  Maybe used to identify sensor. */
+static const char *const known_sensor_luid[] = { "020B000000000000" };
+
+static int get_luid_table_index(unsigned char *usage_str)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(known_sensor_luid); i++) {
+		if (!strncmp(usage_str, known_sensor_luid[i],
+			     strlen(known_sensor_luid[i])))
+			return i;
+	}
+
+	return -ENODEV;
+}
+
+static int get_known_custom_sensor_index(struct hid_sensor_hub_device *hsdev)
+{
+	struct hid_sensor_hub_attribute_info sensor_manufacturer = { 0 };
+	struct hid_sensor_hub_attribute_info sensor_luid_info = { 0 };
+	int report_size;
+	int ret;
+	static u16 w_buf[HID_CUSTOM_MAX_FEATURE_BYTES];
+	static char buf[HID_CUSTOM_MAX_FEATURE_BYTES];
+	int i;
+
+	memset(w_buf, 0, sizeof(w_buf));
+	memset(buf, 0, sizeof(buf));
+
+	/* get manufacturer info */
+	ret = sensor_hub_input_get_attribute_info(hsdev,
+			HID_FEATURE_REPORT, hsdev->usage,
+			HID_USAGE_SENSOR_PROP_MANUFACTURER, &sensor_manufacturer);
+	if (ret < 0)
+		return ret;
+
+	report_size =
+		sensor_hub_get_feature(hsdev, sensor_manufacturer.report_id,
+				       sensor_manufacturer.index, sizeof(w_buf),
+				       w_buf);
+	if (report_size <= 0) {
+		hid_err(hsdev->hdev,
+			"Failed to get sensor manufacturer info %d\n",
+			report_size);
+		return -ENODEV;
+	}
+
+	/* convert from wide char to char */
+	for (i = 0; i < ARRAY_SIZE(buf) - 1 && w_buf[i]; i++)
+		buf[i] = (char)w_buf[i];
+
+	/* ensure it's ISH sensor */
+	if (strncmp(buf, "INTEL", strlen("INTEL")))
+		return -ENODEV;
+
+	memset(w_buf, 0, sizeof(w_buf));
+	memset(buf, 0, sizeof(buf));
+
+	/* get real usage id */
+	ret = sensor_hub_input_get_attribute_info(hsdev,
+			HID_FEATURE_REPORT, hsdev->usage,
+			HID_USAGE_SENSOR_PROP_SERIAL_NUM, &sensor_luid_info);
+	if (ret < 0)
+		return ret;
+
+	report_size = sensor_hub_get_feature(hsdev, sensor_luid_info.report_id,
+					     sensor_luid_info.index, sizeof(w_buf),
+					     w_buf);
+	if (report_size <= 0) {
+		hid_err(hsdev->hdev, "Failed to get real usage info %d\n",
+			report_size);
+		return -ENODEV;
+	}
+
+	/* convert from wide char to char */
+	for (i = 0; i < ARRAY_SIZE(buf) - 1 && w_buf[i]; i++)
+		buf[i] = (char)w_buf[i];
+
+	if (strlen(buf) != strlen(known_sensor_luid[0]) + 5) {
+		hid_err(hsdev->hdev,
+			"%s luid length not match %zu != (%zu + 5)\n", __func__,
+			strlen(buf), strlen(known_sensor_luid[0]));
+		return -ENODEV;
+	}
+
+	/* get table index with luid (not matching 'LUID: ' in luid) */
+	return get_luid_table_index(&buf[5]);
+}
+
+static struct platform_device *
+hid_sensor_register_platform_device(struct platform_device *pdev,
+				    struct hid_sensor_hub_device *hsdev,
+				    int index)
+{
+	char real_usage[HID_SENSOR_USAGE_LENGTH] = { 0 };
+	struct platform_device *custom_pdev;
+	const char *dev_name;
+	char *c;
+
+	/* copy real usage id */
+	memcpy(real_usage, known_sensor_luid[index], 4);
+
+	/* usage id are all lowcase */
+	for (c = real_usage; *c != '\0'; c++)
+		*c = tolower(*c);
+
+	/* HID-SENSOR-INT-REAL_USAGE_ID */
+	dev_name = kasprintf(GFP_KERNEL, "HID-SENSOR-INT-%s", real_usage);
+	if (!dev_name)
+		return ERR_PTR(-ENOMEM);
+
+	custom_pdev = platform_device_register_data(pdev->dev.parent, dev_name,
+						    PLATFORM_DEVID_NONE, hsdev,
+						    sizeof(*hsdev));
+	kfree(dev_name);
+	return custom_pdev;
+}
+
 static int hid_sensor_custom_probe(struct platform_device *pdev)
 {
 	struct hid_sensor_custom *sensor_inst;
 	struct hid_sensor_hub_device *hsdev = pdev->dev.platform_data;
 	int ret;
+	int index;
 
 	sensor_inst = devm_kzalloc(&pdev->dev, sizeof(*sensor_inst),
 				   GFP_KERNEL);
@@ -764,6 +886,22 @@ static int hid_sensor_custom_probe(struct platform_device *pdev)
 	sensor_inst->pdev = pdev;
 	mutex_init(&sensor_inst->mutex);
 	platform_set_drvdata(pdev, sensor_inst);
+
+	index = get_known_custom_sensor_index(hsdev);
+	if (index >= 0 && index < ARRAY_SIZE(known_sensor_luid)) {
+		sensor_inst->custom_pdev =
+			hid_sensor_register_platform_device(pdev, hsdev, index);
+
+		ret = PTR_ERR_OR_ZERO(sensor_inst->custom_pdev);
+		if (ret) {
+			dev_err(&pdev->dev,
+				"register_platform_device failed\n");
+			return ret;
+		}
+
+		return 0;
+	}
+
 	ret = sensor_hub_register_callback(hsdev, hsdev->usage,
 					   &sensor_inst->callbacks);
 	if (ret < 0) {
@@ -802,6 +940,11 @@ static int hid_sensor_custom_remove(struct platform_device *pdev)
 	struct hid_sensor_custom *sensor_inst = platform_get_drvdata(pdev);
 	struct hid_sensor_hub_device *hsdev = pdev->dev.platform_data;
 
+	if (sensor_inst->custom_pdev) {
+		platform_device_unregister(sensor_inst->custom_pdev);
+		return 0;
+	}
+
 	hid_sensor_custom_dev_if_remove(sensor_inst);
 	hid_sensor_custom_remove_attributes(sensor_inst);
 	sysfs_remove_group(&sensor_inst->pdev->dev.kobj,
diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index d82a97e311d3..3bbdbccc5805 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -128,6 +128,10 @@
 #define HID_USAGE_SENSOR_UNITS_DEGREES_PER_SECOND		0x15
 
 /* Common selectors */
+#define HID_USAGE_SENSOR_PROP_DESC				0x200300
+#define HID_USAGE_SENSOR_PROP_FRIENDLY_NAME			0x200301
+#define HID_USAGE_SENSOR_PROP_SERIAL_NUM			0x200307
+#define HID_USAGE_SENSOR_PROP_MANUFACTURER			0x200305
 #define HID_USAGE_SENSOR_PROP_REPORT_INTERVAL			0x20030E
 #define HID_USAGE_SENSOR_PROP_SENSITIVITY_ABS			0x20030F
 #define HID_USAGE_SENSOR_PROP_SENSITIVITY_RANGE_PCT		0x200310
@@ -158,4 +162,14 @@
 #define HID_USAGE_SENSOR_PROP_REPORTING_STATE_NO_EVENTS_ENUM	0x200840
 #define HID_USAGE_SENSOR_PROP_REPORTING_STATE_ALL_EVENTS_ENUM	0x200841
 
+/* Custom Sensor (2000e1) */
+#define HID_USAGE_SENSOR_HINGE				        0x20020B
+#define HID_USAGE_SENSOR_DATA_FIELD_LOCATION			0x200400
+#define HID_USAGE_SENSOR_DATA_FIELE_TIME_SINCE_SYS_BOOT		0x20052B
+#define HID_USAGE_SENSOR_DATA_FIELD_CUSTOM_USAGE		0x200541
+#define HID_USAGE_SENSOR_DATA_FIELD_CUSTOM_VALUE_BASE           0x200543
+/* Custom Sensor data 28=>x>=0 */
+#define HID_USAGE_SENSOR_DATA_FIELD_CUSTOM_VALUE(x)                            \
+	(HID_USAGE_SENSOR_DATA_FIELD_CUSTOM_VALUE_BASE + (x))
+
 #endif
-- 
cgit v1.2.3


From 2f0df49c89acaa58571d509830bc481250699885 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 11 Dec 2020 16:37:54 -0500
Subject: jump_label: Do not profile branch annotations

While running my branch profiler that checks for incorrect "likely" and
"unlikely"s around the kernel, there's a large number of them that are
incorrect due to being "static_branches".

As static_branches are rather special, as they are likely or unlikely for
other reasons than normal annotations are used for, there's no reason to
have them be profiled.

Expose the "unlikely_notrace" and "likely_notrace" so that the
static_branch can use them, and have them be ignored by the branch
profilers.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201211163754.585174b9@gandalf.local.home
---
 include/linux/compiler.h   |  2 ++
 include/linux/jump_label.h | 12 ++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b8fe0c23cfff..df5b405e6305 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -76,6 +76,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 #else
 # define likely(x)	__builtin_expect(!!(x), 1)
 # define unlikely(x)	__builtin_expect(!!(x), 0)
+# define likely_notrace(x)	likely(x)
+# define unlikely_notrace(x)	unlikely(x)
 #endif
 
 /* Optimization barrier */
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 32809624d422..d92691262f51 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -261,14 +261,14 @@ static __always_inline void jump_label_init(void)
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
-	if (unlikely(static_key_count(key) > 0))
+	if (unlikely_notrace(static_key_count(key) > 0))
 		return true;
 	return false;
 }
 
 static __always_inline bool static_key_true(struct static_key *key)
 {
-	if (likely(static_key_count(key) > 0))
+	if (likely_notrace(static_key_count(key) > 0))
 		return true;
 	return false;
 }
@@ -460,7 +460,7 @@ extern bool ____wrong_branch_error(void);
 		branch = !arch_static_branch_jump(&(x)->key, true);		\
 	else									\
 		branch = ____wrong_branch_error();				\
-	likely(branch);								\
+	likely_notrace(branch);								\
 })
 
 #define static_branch_unlikely(x)						\
@@ -472,13 +472,13 @@ extern bool ____wrong_branch_error(void);
 		branch = arch_static_branch(&(x)->key, false);			\
 	else									\
 		branch = ____wrong_branch_error();				\
-	unlikely(branch);							\
+	unlikely_notrace(branch);							\
 })
 
 #else /* !CONFIG_JUMP_LABEL */
 
-#define static_branch_likely(x)		likely(static_key_enabled(&(x)->key))
-#define static_branch_unlikely(x)	unlikely(static_key_enabled(&(x)->key))
+#define static_branch_likely(x)		likely_notrace(static_key_enabled(&(x)->key))
+#define static_branch_unlikely(x)	unlikely_notrace(static_key_enabled(&(x)->key))
 
 #endif /* CONFIG_JUMP_LABEL */
 
-- 
cgit v1.2.3


From 997acaf6b4b59c6a9c259740312a69ea549cc684 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 11 Jan 2021 15:37:07 +0000
Subject: lockdep: report broken irq restoration

We generally expect local_irq_save() and local_irq_restore() to be
paired and sanely nested, and so local_irq_restore() expects to be
called with irqs disabled. Thus, within local_irq_restore() we only
trace irq flag changes when unmasking irqs.

This means that a sequence such as:

| local_irq_disable();
| local_irq_save(flags);
| local_irq_enable();
| local_irq_restore(flags);

... is liable to break things, as the local_irq_restore() would mask
irqs without tracing this change. Similar problems may exist for
architectures whose arch_irq_restore() function depends on being called
with irqs disabled.

We don't consider such sequences to be a good idea, so let's define
those as forbidden, and add tooling to detect such broken cases.

This patch adds debug code to WARN() when raw_local_irq_restore() is
called with irqs enabled. As raw_local_irq_restore() is expected to pair
with raw_local_irq_save(), it should never be called with irqs enabled.

To avoid the possibility of circular header dependencies between
irqflags.h and bug.h, the warning is handled in a separate C file.

The new code is all conditional on a new CONFIG_DEBUG_IRQFLAGS symbol
which is independent of CONFIG_TRACE_IRQFLAGS. As noted above such cases
will confuse lockdep, so CONFIG_DEBUG_LOCKDEP now selects
CONFIG_DEBUG_IRQFLAGS.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210111153707.10071-1-mark.rutland@arm.com
---
 include/linux/irqflags.h       | 12 ++++++++++++
 kernel/locking/Makefile        |  1 +
 kernel/locking/irqflag-debug.c | 11 +++++++++++
 lib/Kconfig.debug              |  8 ++++++++
 4 files changed, 32 insertions(+)
 create mode 100644 kernel/locking/irqflag-debug.c

(limited to 'include/linux')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 8de0e1373de7..600c10da321a 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -149,6 +149,17 @@ do {						\
 # define start_critical_timings() do { } while (0)
 #endif
 
+#ifdef CONFIG_DEBUG_IRQFLAGS
+extern void warn_bogus_irq_restore(void);
+#define raw_check_bogus_irq_restore()			\
+	do {						\
+		if (unlikely(!arch_irqs_disabled()))	\
+			warn_bogus_irq_restore();	\
+	} while (0)
+#else
+#define raw_check_bogus_irq_restore() do { } while (0)
+#endif
+
 /*
  * Wrap the arch provided IRQ routines to provide appropriate checks.
  */
@@ -162,6 +173,7 @@ do {						\
 #define raw_local_irq_restore(flags)			\
 	do {						\
 		typecheck(unsigned long, flags);	\
+		raw_check_bogus_irq_restore();		\
 		arch_local_irq_restore(flags);		\
 	} while (0)
 #define raw_local_save_flags(flags)			\
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 6d11cfb9b41f..8838f1d7c4a2 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -15,6 +15,7 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
 endif
 
+obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_LOCKDEP) += lockdep.o
 ifeq ($(CONFIG_PROC_FS),y)
diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c
new file mode 100644
index 000000000000..9603d207a4ca
--- /dev/null
+++ b/kernel/locking/irqflag-debug.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/irqflags.h>
+
+void warn_bogus_irq_restore(void)
+{
+	WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n");
+}
+EXPORT_SYMBOL(warn_bogus_irq_restore);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e6e58b26e888..78eadf615df8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1343,6 +1343,7 @@ config LOCKDEP_SMALL
 config DEBUG_LOCKDEP
 	bool "Lock dependency engine debugging"
 	depends on DEBUG_KERNEL && LOCKDEP
+	select DEBUG_IRQFLAGS
 	help
 	  If you say Y here, the lock dependency engine will do
 	  additional runtime checks to debug itself, at the price
@@ -1431,6 +1432,13 @@ config TRACE_IRQFLAGS_NMI
 	depends on TRACE_IRQFLAGS
 	depends on TRACE_IRQFLAGS_NMI_SUPPORT
 
+config DEBUG_IRQFLAGS
+	bool "Debug IRQ flag manipulation"
+	help
+	  Enables checks for potentially unsafe enabling or disabling of
+	  interrupts, such as calling raw_local_irq_restore() when interrupts
+	  are enabled.
+
 config STACKTRACE
 	bool "Stack backtrace support"
 	depends on STACKTRACE_SUPPORT
-- 
cgit v1.2.3


From 67e3242ee28052daa90e1fa193efc601939fde58 Mon Sep 17 00:00:00 2001
From: Lina Iyer <ilina@codeaurora.org>
Date: Wed, 20 Jan 2021 08:50:41 -0700
Subject: PM: domains: inform PM domain of a device's next wakeup

Some devices may have a predictable interrupt pattern while executing
usecases. An example would be the VSYNC interrupt associated with
display devices. A 60 Hz display could cause a interrupt every 16 ms. If
the device were in a PM domain, the domain would need to be powered up
for device to resume and handle the interrupt.

Entering a domain idle state saves power, only if the residency of the
idle state is met. Without knowing the idle duration of the domain, the
governor would just choose the deepest idle state that matches the QoS
requirements. The domain might be powered off just as the device is
expecting to wake up. If devices could inform PM frameworks of their
next event, the parent PM domain's idle duration can be determined.

So let's add the dev_pm_genpd_set_next_wakeup() API for the device to
inform PM domains of the impending wakeup. This information will be the
domain governor to determine the best idle state given the wakeup.

Signed-off-by: Lina Iyer <ilina@codeaurora.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 30 ++++++++++++++++++++++++++++++
 include/linux/pm_domain.h   |  6 ++++++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 9a14eedacb92..014033c7c287 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -423,6 +423,35 @@ int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state)
 }
 EXPORT_SYMBOL_GPL(dev_pm_genpd_set_performance_state);
 
+/**
+ * dev_pm_genpd_set_next_wakeup - Notify PM framework of an impending wakeup.
+ *
+ * @dev: Device to handle
+ * @next: impending interrupt/wakeup for the device
+ *
+ *
+ * Allow devices to inform of the next wakeup. It's assumed that the users
+ * guarantee that the genpd wouldn't be detached while this routine is getting
+ * called. Additionally, it's also assumed that @dev isn't runtime suspended
+ * (RPM_SUSPENDED)."
+ * Although devices are expected to update the next_wakeup after the end of
+ * their usecase as well, it is possible the devices themselves may not know
+ * about that, so stale @next will be ignored when powering off the domain.
+ */
+void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next)
+{
+	struct generic_pm_domain_data *gpd_data;
+	struct generic_pm_domain *genpd;
+
+	genpd = dev_to_genpd_safe(dev);
+	if (!genpd)
+		return;
+
+	gpd_data = to_gpd_data(dev->power.subsys_data->domain_data);
+	gpd_data->next_wakeup = next;
+}
+EXPORT_SYMBOL_GPL(dev_pm_genpd_set_next_wakeup);
+
 static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed)
 {
 	unsigned int state_idx = genpd->state_idx;
@@ -1465,6 +1494,7 @@ static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev)
 	gpd_data->td.constraint_changed = true;
 	gpd_data->td.effective_constraint_ns = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS;
 	gpd_data->nb.notifier_call = genpd_dev_pm_qos_notifier;
+	gpd_data->next_wakeup = KTIME_MAX;
 
 	spin_lock_irq(&dev->power.lock);
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 2ca919ae8d36..735583c0bc6d 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -9,6 +9,7 @@
 #define _LINUX_PM_DOMAIN_H
 
 #include <linux/device.h>
+#include <linux/ktime.h>
 #include <linux/mutex.h>
 #include <linux/pm.h>
 #include <linux/err.h>
@@ -191,6 +192,7 @@ struct generic_pm_domain_data {
 	struct notifier_block *power_nb;
 	int cpu;
 	unsigned int performance_state;
+	ktime_t	next_wakeup;
 	void *data;
 };
 
@@ -217,6 +219,7 @@ int pm_genpd_remove(struct generic_pm_domain *genpd);
 int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state);
 int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb);
 int dev_pm_genpd_remove_notifier(struct device *dev);
+void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next);
 
 extern struct dev_power_governor simple_qos_governor;
 extern struct dev_power_governor pm_domain_always_on_gov;
@@ -275,6 +278,9 @@ static inline int dev_pm_genpd_remove_notifier(struct device *dev)
 	return -EOPNOTSUPP;
 }
 
+static inline void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next)
+{ }
+
 #define simple_qos_governor		(*(struct dev_power_governor *)(NULL))
 #define pm_domain_always_on_gov		(*(struct dev_power_governor *)(NULL))
 #endif
-- 
cgit v1.2.3


From c79aa080fb0f60a0e24c87014dc9c2f373e1379b Mon Sep 17 00:00:00 2001
From: Lina Iyer <ilina@codeaurora.org>
Date: Wed, 20 Jan 2021 08:50:42 -0700
Subject: PM: domains: use device's next wakeup to determine domain idle state

Currently, a PM domain's idle state is determined based on whether the
QoS requirements are met. However, even entering an idle state may waste
power if the minimum residency requirements aren't fulfilled.

CPU PM domains use the next timer wakeup for the CPUs in the domain to
determine the sleep duration of the domain. This is compared with the
idle state residencies to determine the optimal idle state. For other PM
domains, determining the sleep length is not that straight forward. But
if the device's next_event is available, we can use that to determine
the sleep duration of the PM domain.

Let's update the domain governor logic to check for idle state residency
based on the next wakeup of devices as well as QoS constraints. But
since, not all domains may contain devices capable of specifying the
next wakeup, let's enable this additional check only if specified by the
domain's flags when initializing the domain.

Signed-off-by: Lina Iyer <ilina@codeaurora.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain_governor.c | 102 +++++++++++++++++++++++++++++++----
 include/linux/pm_domain.h            |   6 +++
 2 files changed, 99 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c
index 490ed7deb99a..c6c218758f0b 100644
--- a/drivers/base/power/domain_governor.c
+++ b/drivers/base/power/domain_governor.c
@@ -117,6 +117,55 @@ static bool default_suspend_ok(struct device *dev)
 	return td->cached_suspend_ok;
 }
 
+static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t now)
+{
+	ktime_t domain_wakeup = KTIME_MAX;
+	ktime_t next_wakeup;
+	struct pm_domain_data *pdd;
+	struct gpd_link *link;
+
+	if (!(genpd->flags & GENPD_FLAG_MIN_RESIDENCY))
+		return;
+
+	/*
+	 * Devices that have a predictable wakeup pattern, may specify
+	 * their next wakeup. Let's find the next wakeup from all the
+	 * devices attached to this domain and from all the sub-domains.
+	 * It is possible that component's a next wakeup may have become
+	 * stale when we read that here. We will ignore to ensure the domain
+	 * is able to enter its optimal idle state.
+	 */
+	list_for_each_entry(pdd, &genpd->dev_list, list_node) {
+		next_wakeup = to_gpd_data(pdd)->next_wakeup;
+		if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
+			if (ktime_before(next_wakeup, domain_wakeup))
+				domain_wakeup = next_wakeup;
+	}
+
+	list_for_each_entry(link, &genpd->parent_links, parent_node) {
+		next_wakeup = link->child->next_wakeup;
+		if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
+			if (ktime_before(next_wakeup, domain_wakeup))
+				domain_wakeup = next_wakeup;
+	}
+
+	genpd->next_wakeup = domain_wakeup;
+}
+
+static bool next_wakeup_allows_state(struct generic_pm_domain *genpd,
+				     unsigned int state, ktime_t now)
+{
+	ktime_t domain_wakeup = genpd->next_wakeup;
+	s64 idle_time_ns, min_sleep_ns;
+
+	min_sleep_ns = genpd->states[state].power_off_latency_ns +
+		       genpd->states[state].residency_ns;
+
+	idle_time_ns = ktime_to_ns(ktime_sub(domain_wakeup, now));
+
+	return idle_time_ns >= min_sleep_ns;
+}
+
 static bool __default_power_down_ok(struct dev_pm_domain *pd,
 				     unsigned int state)
 {
@@ -201,16 +250,41 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd,
 }
 
 /**
- * default_power_down_ok - Default generic PM domain power off governor routine.
+ * _default_power_down_ok - Default generic PM domain power off governor routine.
  * @pd: PM domain to check.
  *
  * This routine must be executed under the PM domain's lock.
  */
-static bool default_power_down_ok(struct dev_pm_domain *pd)
+static bool _default_power_down_ok(struct dev_pm_domain *pd, ktime_t now)
 {
 	struct generic_pm_domain *genpd = pd_to_genpd(pd);
+	int state_idx = genpd->state_count - 1;
 	struct gpd_link *link;
 
+	/*
+	 * Find the next wakeup from devices that can determine their own wakeup
+	 * to find when the domain would wakeup and do it for every device down
+	 * the hierarchy. It is not worth while to sleep if the state's residency
+	 * cannot be met.
+	 */
+	update_domain_next_wakeup(genpd, now);
+	if ((genpd->flags & GENPD_FLAG_MIN_RESIDENCY) && (genpd->next_wakeup != KTIME_MAX)) {
+		/* Let's find out the deepest domain idle state, the devices prefer */
+		while (state_idx >= 0) {
+			if (next_wakeup_allows_state(genpd, state_idx, now)) {
+				genpd->max_off_time_changed = true;
+				break;
+			}
+			state_idx--;
+		}
+
+		if (state_idx < 0) {
+			state_idx = 0;
+			genpd->cached_power_down_ok = false;
+			goto done;
+		}
+	}
+
 	if (!genpd->max_off_time_changed) {
 		genpd->state_idx = genpd->cached_power_down_state_idx;
 		return genpd->cached_power_down_ok;
@@ -228,21 +302,30 @@ static bool default_power_down_ok(struct dev_pm_domain *pd)
 	genpd->max_off_time_ns = -1;
 	genpd->max_off_time_changed = false;
 	genpd->cached_power_down_ok = true;
-	genpd->state_idx = genpd->state_count - 1;
 
-	/* Find a state to power down to, starting from the deepest. */
-	while (!__default_power_down_ok(pd, genpd->state_idx)) {
-		if (genpd->state_idx == 0) {
+	/*
+	 * Find a state to power down to, starting from the state
+	 * determined by the next wakeup.
+	 */
+	while (!__default_power_down_ok(pd, state_idx)) {
+		if (state_idx == 0) {
 			genpd->cached_power_down_ok = false;
 			break;
 		}
-		genpd->state_idx--;
+		state_idx--;
 	}
 
+done:
+	genpd->state_idx = state_idx;
 	genpd->cached_power_down_state_idx = genpd->state_idx;
 	return genpd->cached_power_down_ok;
 }
 
+static bool default_power_down_ok(struct dev_pm_domain *pd)
+{
+	return _default_power_down_ok(pd, ktime_get());
+}
+
 static bool always_on_power_down_ok(struct dev_pm_domain *domain)
 {
 	return false;
@@ -254,11 +337,12 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd)
 	struct generic_pm_domain *genpd = pd_to_genpd(pd);
 	struct cpuidle_device *dev;
 	ktime_t domain_wakeup, next_hrtimer;
+	ktime_t now = ktime_get();
 	s64 idle_duration_ns;
 	int cpu, i;
 
 	/* Validate dev PM QoS constraints. */
-	if (!default_power_down_ok(pd))
+	if (!_default_power_down_ok(pd, now))
 		return false;
 
 	if (!(genpd->flags & GENPD_FLAG_CPU_DOMAIN))
@@ -280,7 +364,7 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd)
 	}
 
 	/* The minimum idle duration is from now - until the next wakeup. */
-	idle_duration_ns = ktime_to_ns(ktime_sub(domain_wakeup, ktime_get()));
+	idle_duration_ns = ktime_to_ns(ktime_sub(domain_wakeup, now));
 	if (idle_duration_ns <= 0)
 		return false;
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 735583c0bc6d..dfcfbcecc34b 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -56,6 +56,10 @@
  *
  * GENPD_FLAG_RPM_ALWAYS_ON:	Instructs genpd to always keep the PM domain
  *				powered on except for system suspend.
+ *
+ * GENPD_FLAG_MIN_RESIDENCY:	Enable the genpd governor to consider its
+ *				components' next wakeup when determining the
+ *				optimal idle state.
  */
 #define GENPD_FLAG_PM_CLK	 (1U << 0)
 #define GENPD_FLAG_IRQ_SAFE	 (1U << 1)
@@ -63,6 +67,7 @@
 #define GENPD_FLAG_ACTIVE_WAKEUP (1U << 3)
 #define GENPD_FLAG_CPU_DOMAIN	 (1U << 4)
 #define GENPD_FLAG_RPM_ALWAYS_ON (1U << 5)
+#define GENPD_FLAG_MIN_RESIDENCY (1U << 6)
 
 enum gpd_status {
 	GENPD_STATE_ON = 0,	/* PM domain is on */
@@ -130,6 +135,7 @@ struct generic_pm_domain {
 				     unsigned int state);
 	struct gpd_dev_ops dev_ops;
 	s64 max_off_time_ns;	/* Maximum allowed "suspended" time. */
+	ktime_t next_wakeup;	/* Maintained by the domain governor */
 	bool max_off_time_changed;
 	bool cached_power_down_ok;
 	bool cached_power_down_state_idx;
-- 
cgit v1.2.3


From f3196bb0f14c0ffb5089c15668bda196c98d3900 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Fri, 11 Dec 2020 22:12:16 -0800
Subject: net/mlx5: Introduce vhca state event notifier

vhca state events indicates change in the state of the vhca that may
occur due to a SF allocation, deallocation or enabling/disabling the
SF HCA.

Introduce vhca state event handler which will be used by SF devlink
port manager and SF hardware id allocator in subsequent patches
to act on the event.

This enables single entity to subscribe, query and rearm the event
for a function.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Vu Pham <vuhuong@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig    |   9 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   4 +
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |   4 +
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/events.c   |   7 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  16 ++
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   2 +
 .../mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h    |  82 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h    |  45 +++++
 .../ethernet/mellanox/mlx5/core/sf/vhca_event.c    | 189 +++++++++++++++++++++
 .../ethernet/mellanox/mlx5/core/sf/vhca_event.h    |  57 +++++++
 include/linux/mlx5/driver.h                        |   4 +
 12 files changed, 422 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 6e4d7bb7fea2..d6c48582e7a8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -203,3 +203,12 @@ config MLX5_SW_STEERING
 	default y
 	help
 	Build support for software-managed steering in the NIC.
+
+config MLX5_SF
+	bool "Mellanox Technologies subfunction device support using auxiliary device"
+	depends on MLX5_CORE && MLX5_CORE_EN
+	default n
+	help
+	Build support for subfuction device in the NIC. A Mellanox subfunction
+	device can support RDMA, netdevice and vdpa device.
+	It is similar to a SRIOV VF but it doesn't require SRIOV support.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 134bd038ae8a..22ef2ebbee96 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -86,3 +86,7 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o
 					steering/dr_ste_v0.o \
 					steering/dr_cmd.o steering/dr_fw.o \
 					steering/dr_action.o steering/fs_dr.o
+#
+# SF device
+#
+mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 50c7b9ee80c3..47dcc3ac2cf0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -464,6 +464,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_ALLOC_MEMIC:
 	case MLX5_CMD_OP_MODIFY_XRQ:
 	case MLX5_CMD_OP_RELEASE_XRQ_ERROR:
+	case MLX5_CMD_OP_QUERY_VHCA_STATE:
+	case MLX5_CMD_OP_MODIFY_VHCA_STATE:
 		*status = MLX5_DRIVER_STATUS_ABORTED;
 		*synd = MLX5_DRIVER_SYND;
 		return -EIO;
@@ -657,6 +659,8 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(DESTROY_UMEM);
 	MLX5_COMMAND_STR_CASE(RELEASE_XRQ_ERROR);
 	MLX5_COMMAND_STR_CASE(MODIFY_XRQ);
+	MLX5_COMMAND_STR_CASE(QUERY_VHCA_STATE);
+	MLX5_COMMAND_STR_CASE(MODIFY_VHCA_STATE);
 	default: return "unknown command opcode";
 	}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index fc0afa03d407..421febebc658 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -595,6 +595,9 @@ static void gather_async_events_mask(struct mlx5_core_dev *dev, u64 mask[4])
 		async_event_mask |=
 			(1ull << MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED);
 
+	if (MLX5_CAP_GEN_MAX(dev, vhca_state))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_VHCA_STATE_CHANGE);
+
 	mask[0] = async_event_mask;
 
 	if (MLX5_CAP_GEN(dev, event_cap))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 3ce17c3d7a00..5523d218e5fb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -110,6 +110,8 @@ static const char *eqe_type_str(u8 type)
 		return "MLX5_EVENT_TYPE_CMD";
 	case MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED:
 		return "MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED";
+	case MLX5_EVENT_TYPE_VHCA_STATE_CHANGE:
+		return "MLX5_EVENT_TYPE_VHCA_STATE_CHANGE";
 	case MLX5_EVENT_TYPE_PAGE_REQUEST:
 		return "MLX5_EVENT_TYPE_PAGE_REQUEST";
 	case MLX5_EVENT_TYPE_PAGE_FAULT:
@@ -403,3 +405,8 @@ int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, voi
 {
 	return atomic_notifier_call_chain(&events->nh, event, data);
 }
+
+void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work)
+{
+	queue_work(dev->priv.events->wq, work);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index ca6f2fc39ea0..b16f57befe52 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -73,6 +73,7 @@
 #include "ecpf.h"
 #include "lib/hv_vhca.h"
 #include "diag/rsc_dump.h"
+#include "sf/vhca_event.h"
 
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
@@ -567,6 +568,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx)
 	if (MLX5_CAP_GEN_MAX(dev, mkey_by_name))
 		MLX5_SET(cmd_hca_cap, set_hca_cap, mkey_by_name, 1);
 
+	mlx5_vhca_state_cap_handle(dev, set_hca_cap);
+
 	return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
 }
 
@@ -884,6 +887,12 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 		goto err_eswitch_cleanup;
 	}
 
+	err = mlx5_vhca_event_init(dev);
+	if (err) {
+		mlx5_core_err(dev, "Failed to init vhca event notifier %d\n", err);
+		goto err_fpga_cleanup;
+	}
+
 	dev->dm = mlx5_dm_create(dev);
 	if (IS_ERR(dev->dm))
 		mlx5_core_warn(dev, "Failed to init device memory%d\n", err);
@@ -894,6 +903,8 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 
 	return 0;
 
+err_fpga_cleanup:
+	mlx5_fpga_cleanup(dev);
 err_eswitch_cleanup:
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
 err_sriov_cleanup:
@@ -925,6 +936,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_hv_vhca_destroy(dev->hv_vhca);
 	mlx5_fw_tracer_destroy(dev->tracer);
 	mlx5_dm_cleanup(dev);
+	mlx5_vhca_event_cleanup(dev);
 	mlx5_fpga_cleanup(dev);
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
 	mlx5_sriov_cleanup(dev);
@@ -1129,6 +1141,8 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 		goto err_sriov;
 	}
 
+	mlx5_vhca_event_start(dev);
+
 	err = mlx5_ec_init(dev);
 	if (err) {
 		mlx5_core_err(dev, "Failed to init embedded CPU\n");
@@ -1146,6 +1160,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 err_sriov:
 	mlx5_ec_cleanup(dev);
 err_ec:
+	mlx5_vhca_event_stop(dev);
 	mlx5_cleanup_fs(dev);
 err_fs:
 	mlx5_accel_tls_cleanup(dev);
@@ -1173,6 +1188,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
 {
 	mlx5_sriov_detach(dev);
 	mlx5_ec_cleanup(dev);
+	mlx5_vhca_event_stop(dev);
 	mlx5_cleanup_fs(dev);
 	mlx5_accel_ipsec_cleanup(dev);
 	mlx5_accel_tls_cleanup(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 0a0302ce7144..a33b7496d748 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -259,4 +259,6 @@ void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state);
 
 void mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup);
 int mlx5_load_one(struct mlx5_core_dev *dev, bool boot);
+
+void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work);
 #endif /* __MLX5_CORE_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h
new file mode 100644
index 000000000000..1daf5a122ba3
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+
+#ifndef __MLX5_IFC_VHCA_EVENT_H__
+#define __MLX5_IFC_VHCA_EVENT_H__
+
+enum mlx5_ifc_vhca_state {
+	MLX5_VHCA_STATE_INVALID = 0x0,
+	MLX5_VHCA_STATE_ALLOCATED = 0x1,
+	MLX5_VHCA_STATE_ACTIVE = 0x2,
+	MLX5_VHCA_STATE_IN_USE = 0x3,
+	MLX5_VHCA_STATE_TEARDOWN_REQUEST = 0x4,
+};
+
+struct mlx5_ifc_vhca_state_context_bits {
+	u8         arm_change_event[0x1];
+	u8         reserved_at_1[0xb];
+	u8         vhca_state[0x4];
+	u8         reserved_at_10[0x10];
+
+	u8         sw_function_id[0x20];
+
+	u8         reserved_at_40[0x80];
+};
+
+struct mlx5_ifc_query_vhca_state_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+
+	struct mlx5_ifc_vhca_state_context_bits vhca_state_context;
+};
+
+struct mlx5_ifc_query_vhca_state_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         embedded_cpu_function[0x1];
+	u8         reserved_at_41[0xf];
+	u8         function_id[0x10];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_vhca_state_field_select_bits {
+	u8         reserved_at_0[0x1e];
+	u8         sw_function_id[0x1];
+	u8         arm_change_event[0x1];
+};
+
+struct mlx5_ifc_modify_vhca_state_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_modify_vhca_state_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         embedded_cpu_function[0x1];
+	u8         reserved_at_41[0xf];
+	u8         function_id[0x10];
+
+	struct mlx5_ifc_vhca_state_field_select_bits vhca_state_field_select;
+
+	struct mlx5_ifc_vhca_state_context_bits vhca_state_context;
+};
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
new file mode 100644
index 000000000000..623191679b49
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+
+#ifndef __MLX5_SF_H__
+#define __MLX5_SF_H__
+
+#include <linux/mlx5/driver.h>
+
+static inline u16 mlx5_sf_start_function_id(const struct mlx5_core_dev *dev)
+{
+	return MLX5_CAP_GEN(dev, sf_base_id);
+}
+
+#ifdef CONFIG_MLX5_SF
+
+static inline bool mlx5_sf_supported(const struct mlx5_core_dev *dev)
+{
+	return MLX5_CAP_GEN(dev, sf);
+}
+
+static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev)
+{
+	if (!mlx5_sf_supported(dev))
+		return 0;
+	if (MLX5_CAP_GEN(dev, max_num_sf))
+		return MLX5_CAP_GEN(dev, max_num_sf);
+	else
+		return 1 << MLX5_CAP_GEN(dev, log_max_sf);
+}
+
+#else
+
+static inline bool mlx5_sf_supported(const struct mlx5_core_dev *dev)
+{
+	return false;
+}
+
+static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev)
+{
+	return 0;
+}
+
+#endif
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
new file mode 100644
index 000000000000..af2f2dd9db25
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_ifc_vhca_event.h"
+#include "mlx5_core.h"
+#include "vhca_event.h"
+#include "ecpf.h"
+
+struct mlx5_vhca_state_notifier {
+	struct mlx5_core_dev *dev;
+	struct mlx5_nb nb;
+	struct blocking_notifier_head n_head;
+};
+
+struct mlx5_vhca_event_work {
+	struct work_struct work;
+	struct mlx5_vhca_state_notifier *notifier;
+	struct mlx5_vhca_state_event event;
+};
+
+int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id,
+			      bool ecpu, u32 *out, u32 outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_vhca_state_in)] = {};
+
+	MLX5_SET(query_vhca_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_STATE);
+	MLX5_SET(query_vhca_state_in, in, function_id, function_id);
+	MLX5_SET(query_vhca_state_in, in, embedded_cpu_function, ecpu);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+
+static int mlx5_cmd_modify_vhca_state(struct mlx5_core_dev *dev, u16 function_id,
+				      bool ecpu, u32 *in, u32 inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_vhca_state_out)] = {};
+
+	MLX5_SET(modify_vhca_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VHCA_STATE);
+	MLX5_SET(modify_vhca_state_in, in, function_id, function_id);
+	MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, ecpu);
+
+	return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+}
+
+int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, bool ecpu, u32 sw_fn_id)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_vhca_state_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(modify_vhca_state_in)] = {};
+
+	MLX5_SET(modify_vhca_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VHCA_STATE);
+	MLX5_SET(modify_vhca_state_in, in, function_id, function_id);
+	MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, ecpu);
+	MLX5_SET(modify_vhca_state_in, in, vhca_state_field_select.sw_function_id, 1);
+	MLX5_SET(modify_vhca_state_in, in, vhca_state_context.sw_function_id, sw_fn_id);
+
+	return mlx5_cmd_exec_inout(dev, modify_vhca_state, in, out);
+}
+
+int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id, bool ecpu)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_vhca_state_in)] = {};
+
+	MLX5_SET(modify_vhca_state_in, in, vhca_state_context.arm_change_event, 1);
+	MLX5_SET(modify_vhca_state_in, in, vhca_state_field_select.arm_change_event, 1);
+
+	return mlx5_cmd_modify_vhca_state(dev, function_id, ecpu, in, sizeof(in));
+}
+
+static void
+mlx5_vhca_event_notify(struct mlx5_core_dev *dev, struct mlx5_vhca_state_event *event)
+{
+	u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {};
+	int err;
+
+	err = mlx5_cmd_query_vhca_state(dev, event->function_id, event->ecpu, out, sizeof(out));
+	if (err)
+		return;
+
+	event->sw_function_id = MLX5_GET(query_vhca_state_out, out,
+					 vhca_state_context.sw_function_id);
+	event->new_vhca_state = MLX5_GET(query_vhca_state_out, out,
+					 vhca_state_context.vhca_state);
+
+	mlx5_vhca_event_arm(dev, event->function_id, event->ecpu);
+
+	blocking_notifier_call_chain(&dev->priv.vhca_state_notifier->n_head, 0, event);
+}
+
+static void mlx5_vhca_state_work_handler(struct work_struct *_work)
+{
+	struct mlx5_vhca_event_work *work = container_of(_work, struct mlx5_vhca_event_work, work);
+	struct mlx5_vhca_state_notifier *notifier = work->notifier;
+	struct mlx5_core_dev *dev = notifier->dev;
+
+	mlx5_vhca_event_notify(dev, &work->event);
+}
+
+static int
+mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, void *data)
+{
+	struct mlx5_vhca_state_notifier *notifier =
+				mlx5_nb_cof(nb, struct mlx5_vhca_state_notifier, nb);
+	struct mlx5_vhca_event_work *work;
+	struct mlx5_eqe *eqe = data;
+
+	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work)
+		return NOTIFY_DONE;
+	INIT_WORK(&work->work, &mlx5_vhca_state_work_handler);
+	work->notifier = notifier;
+	work->event.function_id = be16_to_cpu(eqe->data.vhca_state.function_id);
+	work->event.ecpu = be16_to_cpu(eqe->data.vhca_state.ec_function);
+	mlx5_events_work_enqueue(notifier->dev, &work->work);
+	return NOTIFY_OK;
+}
+
+void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap)
+{
+	if (!mlx5_vhca_event_supported(dev))
+		return;
+
+	MLX5_SET(cmd_hca_cap, set_hca_cap, vhca_state, 1);
+	MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_allocated, 1);
+	MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_active, 1);
+	MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_in_use, 1);
+	MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_teardown_request, 1);
+}
+
+int mlx5_vhca_event_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_vhca_state_notifier *notifier;
+
+	if (!mlx5_vhca_event_supported(dev))
+		return 0;
+
+	notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
+	if (!notifier)
+		return -ENOMEM;
+
+	dev->priv.vhca_state_notifier = notifier;
+	notifier->dev = dev;
+	BLOCKING_INIT_NOTIFIER_HEAD(&notifier->n_head);
+	MLX5_NB_INIT(&notifier->nb, mlx5_vhca_state_change_notifier, VHCA_STATE_CHANGE);
+	return 0;
+}
+
+void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_vhca_event_supported(dev))
+		return;
+
+	kfree(dev->priv.vhca_state_notifier);
+	dev->priv.vhca_state_notifier = NULL;
+}
+
+void mlx5_vhca_event_start(struct mlx5_core_dev *dev)
+{
+	struct mlx5_vhca_state_notifier *notifier;
+
+	if (!dev->priv.vhca_state_notifier)
+		return;
+
+	notifier = dev->priv.vhca_state_notifier;
+	mlx5_eq_notifier_register(dev, &notifier->nb);
+}
+
+void mlx5_vhca_event_stop(struct mlx5_core_dev *dev)
+{
+	struct mlx5_vhca_state_notifier *notifier;
+
+	if (!dev->priv.vhca_state_notifier)
+		return;
+
+	notifier = dev->priv.vhca_state_notifier;
+	mlx5_eq_notifier_unregister(dev, &notifier->nb);
+}
+
+int mlx5_vhca_event_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb)
+{
+	if (!dev->priv.vhca_state_notifier)
+		return -EOPNOTSUPP;
+	return blocking_notifier_chain_register(&dev->priv.vhca_state_notifier->n_head, nb);
+}
+
+void mlx5_vhca_event_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb)
+{
+	blocking_notifier_chain_unregister(&dev->priv.vhca_state_notifier->n_head, nb);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h
new file mode 100644
index 000000000000..1fe1ec6f4d4b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+
+#ifndef __MLX5_VHCA_EVENT_H__
+#define __MLX5_VHCA_EVENT_H__
+
+#ifdef CONFIG_MLX5_SF
+
+struct mlx5_vhca_state_event {
+	u16 function_id;
+	u16 sw_function_id;
+	u8 new_vhca_state;
+	bool ecpu;
+};
+
+static inline bool mlx5_vhca_event_supported(const struct mlx5_core_dev *dev)
+{
+	return MLX5_CAP_GEN_MAX(dev, vhca_state);
+}
+
+void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap);
+int mlx5_vhca_event_init(struct mlx5_core_dev *dev);
+void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev);
+void mlx5_vhca_event_start(struct mlx5_core_dev *dev);
+void mlx5_vhca_event_stop(struct mlx5_core_dev *dev);
+int mlx5_vhca_event_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb);
+void mlx5_vhca_event_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb);
+int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, bool ecpu, u32 sw_fn_id);
+int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id, bool ecpu);
+int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id,
+			      bool ecpu, u32 *out, u32 outlen);
+#else
+
+static inline void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap)
+{
+}
+
+static inline int mlx5_vhca_event_init(struct mlx5_core_dev *dev)
+{
+	return 0;
+}
+
+static inline void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev)
+{
+}
+
+static inline void mlx5_vhca_event_start(struct mlx5_core_dev *dev)
+{
+}
+
+static inline void mlx5_vhca_event_stop(struct mlx5_core_dev *dev)
+{
+}
+
+#endif
+
+#endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f93bfe7473aa..ffba0786051e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -507,6 +507,7 @@ struct mlx5_devcom;
 struct mlx5_fw_reset;
 struct mlx5_eq_table;
 struct mlx5_irq_table;
+struct mlx5_vhca_state_notifier;
 
 struct mlx5_rate_limit {
 	u32			rate;
@@ -603,6 +604,9 @@ struct mlx5_priv {
 
 	struct mlx5_bfreg_data		bfregs;
 	struct mlx5_uars_page	       *uar;
+#ifdef CONFIG_MLX5_SF
+	struct mlx5_vhca_state_notifier *vhca_state_notifier;
+#endif
 };
 
 enum mlx5_device_state {
-- 
cgit v1.2.3


From 90d010b8634b89a97ca3b7aa6a88fd566fc77717 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Fri, 11 Dec 2020 22:12:17 -0800
Subject: net/mlx5: SF, Add auxiliary device support

Introduce API to add and delete an auxiliary device for an SF.
Each SF has its own dedicated window in the PCI BAR 2.

SF device is similar to PCI PF and VF that supports multiple class of
devices such as net, rdma and vdpa.

SF device will be added or removed in subsequent patch during SF
devlink port function state change command.

A subfunction device exposes user supplied subfunction number which will
be further used by systemd/udev to have deterministic name for its
netdevice and rdma device.

An mlx5 subfunction auxiliary device example:

$ devlink dev eswitch set pci/0000:06:00.0 mode switchdev

$ devlink port show
pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false

$ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88
pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false
  function:
    hw_addr 00:00:00:00:00:00 state inactive opstate detached

$ devlink port show ens2f0npf0sf88
pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false
  function:
    hw_addr 00:00:00:00:88:88 state inactive opstate detached

$ devlink port function set ens2f0npf0sf88 hw_addr 00:00:00:00:88:88 state active

On activation,

$ ls -l /sys/bus/auxiliary/devices/
mlx5_core.sf.4 -> ../../../devices/pci0000:00/0000:00:03.0/0000:06:00.0/mlx5_core.sf.4

$ cat /sys/bus/auxiliary/devices/mlx5_core.sf.4/sfnum
88

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Vu Pham <vuhuong@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../device_drivers/ethernet/mellanox/mlx5.rst      |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   4 +
 .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.c   | 265 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.h   |  35 +++
 include/linux/mlx5/driver.h                        |   2 +
 6 files changed, 312 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h

(limited to 'include/linux')

diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst
index e9b65035cd47..a5eb22793bb9 100644
--- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst
+++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst
@@ -97,6 +97,11 @@ Enabling the driver and kconfig options
 
 |   Provides low-level InfiniBand/RDMA and `RoCE <https://community.mellanox.com/s/article/recommended-network-configuration-examples-for-roce-deployment>`_ support.
 
+**CONFIG_MLX5_SF=(y/n)**
+
+|   Build support for subfunction.
+|   Subfunctons are more light weight than PCI SRIOV VFs. Choosing this option
+|   will enable support for creating subfunction devices.
 
 **External options** ( Choose if the corresponding mlx5 feature is required )
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 22ef2ebbee96..f5b2e9101348 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -89,4 +89,4 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o
 #
 # SF device
 #
-mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o
+mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index b16f57befe52..26b5502712a6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -74,6 +74,7 @@
 #include "lib/hv_vhca.h"
 #include "diag/rsc_dump.h"
 #include "sf/vhca_event.h"
+#include "sf/dev/dev.h"
 
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
@@ -1155,6 +1156,8 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 		goto err_sriov;
 	}
 
+	mlx5_sf_dev_table_create(dev);
+
 	return 0;
 
 err_sriov:
@@ -1186,6 +1189,7 @@ err_irq_table:
 
 static void mlx5_unload(struct mlx5_core_dev *dev)
 {
+	mlx5_sf_dev_table_destroy(dev);
 	mlx5_sriov_detach(dev);
 	mlx5_ec_cleanup(dev);
 	mlx5_vhca_event_stop(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
new file mode 100644
index 000000000000..4a8eeb7c853e
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/device.h>
+#include "mlx5_core.h"
+#include "dev.h"
+#include "sf/vhca_event.h"
+#include "sf/sf.h"
+#include "sf/mlx5_ifc_vhca_event.h"
+#include "ecpf.h"
+
+struct mlx5_sf_dev_table {
+	struct xarray devices;
+	unsigned int max_sfs;
+	phys_addr_t base_address;
+	u64 sf_bar_length;
+	struct notifier_block nb;
+	struct mlx5_core_dev *dev;
+};
+
+static bool mlx5_sf_dev_supported(const struct mlx5_core_dev *dev)
+{
+	return MLX5_CAP_GEN(dev, sf) && mlx5_vhca_event_supported(dev);
+}
+
+static ssize_t sfnum_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct auxiliary_device *adev = container_of(dev, struct auxiliary_device, dev);
+	struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", sf_dev->sfnum);
+}
+static DEVICE_ATTR_RO(sfnum);
+
+static struct attribute *sf_device_attrs[] = {
+	&dev_attr_sfnum.attr,
+	NULL,
+};
+
+static const struct attribute_group sf_attr_group = {
+	.attrs = sf_device_attrs,
+};
+
+static const struct attribute_group *sf_attr_groups[2] = {
+	&sf_attr_group,
+	NULL
+};
+
+static void mlx5_sf_dev_release(struct device *device)
+{
+	struct auxiliary_device *adev = container_of(device, struct auxiliary_device, dev);
+	struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev);
+
+	mlx5_adev_idx_free(adev->id);
+	kfree(sf_dev);
+}
+
+static void mlx5_sf_dev_remove(struct mlx5_sf_dev *sf_dev)
+{
+	auxiliary_device_delete(&sf_dev->adev);
+	auxiliary_device_uninit(&sf_dev->adev);
+}
+
+static void mlx5_sf_dev_add(struct mlx5_core_dev *dev, u16 sf_index, u32 sfnum)
+{
+	struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table;
+	struct mlx5_sf_dev *sf_dev;
+	struct pci_dev *pdev;
+	int err;
+	int id;
+
+	id = mlx5_adev_idx_alloc();
+	if (id < 0) {
+		err = id;
+		goto add_err;
+	}
+
+	sf_dev = kzalloc(sizeof(*sf_dev), GFP_KERNEL);
+	if (!sf_dev) {
+		mlx5_adev_idx_free(id);
+		err = -ENOMEM;
+		goto add_err;
+	}
+	pdev = dev->pdev;
+	sf_dev->adev.id = id;
+	sf_dev->adev.name = MLX5_SF_DEV_ID_NAME;
+	sf_dev->adev.dev.release = mlx5_sf_dev_release;
+	sf_dev->adev.dev.parent = &pdev->dev;
+	sf_dev->adev.dev.groups = sf_attr_groups;
+	sf_dev->sfnum = sfnum;
+	sf_dev->parent_mdev = dev;
+
+	if (!table->max_sfs) {
+		mlx5_adev_idx_free(id);
+		kfree(sf_dev);
+		err = -EOPNOTSUPP;
+		goto add_err;
+	}
+	sf_dev->bar_base_addr = table->base_address + (sf_index * table->sf_bar_length);
+
+	err = auxiliary_device_init(&sf_dev->adev);
+	if (err) {
+		mlx5_adev_idx_free(id);
+		kfree(sf_dev);
+		goto add_err;
+	}
+
+	err = auxiliary_device_add(&sf_dev->adev);
+	if (err) {
+		put_device(&sf_dev->adev.dev);
+		goto add_err;
+	}
+
+	err = xa_insert(&table->devices, sf_index, sf_dev, GFP_KERNEL);
+	if (err)
+		goto xa_err;
+	return;
+
+xa_err:
+	mlx5_sf_dev_remove(sf_dev);
+add_err:
+	mlx5_core_err(dev, "SF DEV: fail device add for index=%d sfnum=%d err=%d\n",
+		      sf_index, sfnum, err);
+}
+
+static void mlx5_sf_dev_del(struct mlx5_core_dev *dev, struct mlx5_sf_dev *sf_dev, u16 sf_index)
+{
+	struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table;
+
+	xa_erase(&table->devices, sf_index);
+	mlx5_sf_dev_remove(sf_dev);
+}
+
+static int
+mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_code, void *data)
+{
+	struct mlx5_sf_dev_table *table = container_of(nb, struct mlx5_sf_dev_table, nb);
+	const struct mlx5_vhca_state_event *event = data;
+	struct mlx5_sf_dev *sf_dev;
+	u16 sf_index;
+
+	sf_index = event->function_id - MLX5_CAP_GEN(table->dev, sf_base_id);
+	sf_dev = xa_load(&table->devices, sf_index);
+	switch (event->new_vhca_state) {
+	case MLX5_VHCA_STATE_ALLOCATED:
+		if (sf_dev)
+			mlx5_sf_dev_del(table->dev, sf_dev, sf_index);
+		break;
+	case MLX5_VHCA_STATE_TEARDOWN_REQUEST:
+		if (sf_dev)
+			mlx5_sf_dev_del(table->dev, sf_dev, sf_index);
+		else
+			mlx5_core_err(table->dev,
+				      "SF DEV: teardown state for invalid dev index=%d fn_id=0x%x\n",
+				      sf_index, event->sw_function_id);
+		break;
+	case MLX5_VHCA_STATE_ACTIVE:
+		if (!sf_dev)
+			mlx5_sf_dev_add(table->dev, sf_index, event->sw_function_id);
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
+static int mlx5_sf_dev_vhca_arm_all(struct mlx5_sf_dev_table *table)
+{
+	struct mlx5_core_dev *dev = table->dev;
+	u16 max_functions;
+	u16 function_id;
+	int err = 0;
+	bool ecpu;
+	int i;
+
+	max_functions = mlx5_sf_max_functions(dev);
+	function_id = MLX5_CAP_GEN(dev, sf_base_id);
+	ecpu = mlx5_read_embedded_cpu(dev);
+	/* Arm the vhca context as the vhca event notifier */
+	for (i = 0; i < max_functions; i++) {
+		err = mlx5_vhca_event_arm(dev, function_id, ecpu);
+		if (err)
+			return err;
+
+		function_id++;
+	}
+	return 0;
+}
+
+void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_sf_dev_table *table;
+	unsigned int max_sfs;
+	int err;
+
+	if (!mlx5_sf_dev_supported(dev) || !mlx5_vhca_event_supported(dev))
+		return;
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table) {
+		err = -ENOMEM;
+		goto table_err;
+	}
+
+	table->nb.notifier_call = mlx5_sf_dev_state_change_handler;
+	table->dev = dev;
+	if (MLX5_CAP_GEN(dev, max_num_sf))
+		max_sfs = MLX5_CAP_GEN(dev, max_num_sf);
+	else
+		max_sfs = 1 << MLX5_CAP_GEN(dev, log_max_sf);
+	table->sf_bar_length = 1 << (MLX5_CAP_GEN(dev, log_min_sf_size) + 12);
+	table->base_address = pci_resource_start(dev->pdev, 2);
+	table->max_sfs = max_sfs;
+	xa_init(&table->devices);
+	dev->priv.sf_dev_table = table;
+
+	err = mlx5_vhca_event_notifier_register(dev, &table->nb);
+	if (err)
+		goto vhca_err;
+	err = mlx5_sf_dev_vhca_arm_all(table);
+	if (err)
+		goto arm_err;
+	mlx5_core_dbg(dev, "SF DEV: max sf devices=%d\n", max_sfs);
+	return;
+
+arm_err:
+	mlx5_vhca_event_notifier_unregister(dev, &table->nb);
+vhca_err:
+	table->max_sfs = 0;
+	kfree(table);
+	dev->priv.sf_dev_table = NULL;
+table_err:
+	mlx5_core_err(dev, "SF DEV table create err = %d\n", err);
+}
+
+static void mlx5_sf_dev_destroy_all(struct mlx5_sf_dev_table *table)
+{
+	struct mlx5_sf_dev *sf_dev;
+	unsigned long index;
+
+	xa_for_each(&table->devices, index, sf_dev) {
+		xa_erase(&table->devices, index);
+		mlx5_sf_dev_remove(sf_dev);
+	}
+}
+
+void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev)
+{
+	struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table;
+
+	if (!table)
+		return;
+
+	mlx5_vhca_event_notifier_unregister(dev, &table->nb);
+
+	/* Now that event handler is not running, it is safe to destroy
+	 * the sf device without race.
+	 */
+	mlx5_sf_dev_destroy_all(table);
+
+	WARN_ON(!xa_empty(&table->devices));
+	kfree(table);
+	dev->priv.sf_dev_table = NULL;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
new file mode 100644
index 000000000000..a6fb7289ba2c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+
+#ifndef __MLX5_SF_DEV_H__
+#define __MLX5_SF_DEV_H__
+
+#ifdef CONFIG_MLX5_SF
+
+#include <linux/auxiliary_bus.h>
+
+#define MLX5_SF_DEV_ID_NAME "sf"
+
+struct mlx5_sf_dev {
+	struct auxiliary_device adev;
+	struct mlx5_core_dev *parent_mdev;
+	phys_addr_t bar_base_addr;
+	u32 sfnum;
+};
+
+void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev);
+void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev);
+
+#else
+
+static inline void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev)
+{
+}
+
+static inline void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev)
+{
+}
+
+#endif
+
+#endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ffba0786051e..08e5fbe97df0 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -508,6 +508,7 @@ struct mlx5_fw_reset;
 struct mlx5_eq_table;
 struct mlx5_irq_table;
 struct mlx5_vhca_state_notifier;
+struct mlx5_sf_dev_table;
 
 struct mlx5_rate_limit {
 	u32			rate;
@@ -606,6 +607,7 @@ struct mlx5_priv {
 	struct mlx5_uars_page	       *uar;
 #ifdef CONFIG_MLX5_SF
 	struct mlx5_vhca_state_notifier *vhca_state_notifier;
+	struct mlx5_sf_dev_table *sf_dev_table;
 #endif
 };
 
-- 
cgit v1.2.3


From 1958fc2f0712ae771bfd2351b55e5a5b6b0bcfa4 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Fri, 11 Dec 2020 22:12:18 -0800
Subject: net/mlx5: SF, Add auxiliary device driver

Add auxiliary device driver for mlx5 subfunction auxiliary device.

A mlx5 subfunction is similar to PCI PF and VF. For a subfunction
an auxiliary device is created.

As a result, when mlx5 SF auxiliary device binds to the driver,
its netdev and rdma device are created, they appear as

$ ls -l /sys/bus/auxiliary/devices/
mlx5_core.sf.4 -> ../../../devices/pci0000:00/0000:00:03.0/0000:06:00.0/mlx5_core.sf.4

$ ls -l /sys/class/net/eth1/device
/sys/class/net/eth1/device -> ../../../mlx5_core.sf.4

$ cat /sys/bus/auxiliary/devices/mlx5_core.sf.4/sfnum
88

$ devlink dev show
pci/0000:06:00.0
auxiliary/mlx5_core.sf.4

$ devlink port show auxiliary/mlx5_core.sf.4/1
auxiliary/mlx5_core.sf.4/1: type eth netdev p0sf88 flavour virtual port 0 splittable false

$ rdma link show mlx5_0/1
link mlx5_0/1 state ACTIVE physical_state LINK_UP netdev p0sf88

$ rdma dev show
8: rocep6s0f1: node_type ca fw 16.29.0550 node_guid 248a:0703:00b3:d113 sys_image_guid 248a:0703:00b3:d112
13: mlx5_0: node_type ca fw 16.29.0550 node_guid 0000:00ff:fe00:8888 sys_image_guid 248a:0703:00b3:d112

In future, devlink device instance name will adapt to have sfnum
annotation using either an alias or as devlink instance name described
in RFC [1].

[1] https://lore.kernel.org/netdev/20200519092258.GF4655@nanopsycho/

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Vu Pham <vuhuong@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/devlink.c  |  12 +++
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  12 ++-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  10 ++
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c  |  20 ++++
 .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.c   |  10 ++
 .../net/ethernet/mellanox/mlx5/core/sf/dev/dev.h   |  20 ++++
 .../ethernet/mellanox/mlx5/core/sf/dev/driver.c    | 101 +++++++++++++++++++++
 include/linux/mlx5/driver.h                        |   4 +-
 10 files changed, 187 insertions(+), 6 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index f5b2e9101348..43fa5efd403c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -89,4 +89,4 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o
 #
 # SF device
 #
-mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o
+mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 3261d0dc1104..9afe918c5827 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -7,6 +7,7 @@
 #include "fw_reset.h"
 #include "fs_core.h"
 #include "eswitch.h"
+#include "sf/dev/dev.h"
 
 static int mlx5_devlink_flash_update(struct devlink *devlink,
 				     struct devlink_flash_update_params *params,
@@ -127,6 +128,17 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change,
 				    struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	bool sf_dev_allocated;
+
+	sf_dev_allocated = mlx5_sf_dev_allocated(dev);
+	if (sf_dev_allocated) {
+		/* Reload results in deleting SF device which further results in
+		 * unregistering devlink instance while holding devlink_mutext.
+		 * Hence, do not support reload.
+		 */
+		NL_SET_ERR_MSG_MOD(extack, "reload is unsupported when SFs are allocated\n");
+		return -EOPNOTSUPP;
+	}
 
 	switch (action) {
 	case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 421febebc658..174dfbc996c6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -467,7 +467,7 @@ int mlx5_eq_table_init(struct mlx5_core_dev *dev)
 	for (i = 0; i < MLX5_EVENT_TYPE_MAX; i++)
 		ATOMIC_INIT_NOTIFIER_HEAD(&eq_table->nh[i]);
 
-	eq_table->irq_table = dev->priv.irq_table;
+	eq_table->irq_table = mlx5_irq_table_get(dev);
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 26b5502712a6..ab68320e5516 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -84,7 +84,6 @@ unsigned int mlx5_core_debug_mask;
 module_param_named(debug_mask, mlx5_core_debug_mask, uint, 0644);
 MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec time, 3 = both. Default=0");
 
-#define MLX5_DEFAULT_PROF	2
 static unsigned int prof_sel = MLX5_DEFAULT_PROF;
 module_param_named(prof_sel, prof_sel, uint, 0444);
 MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2");
@@ -1303,7 +1302,7 @@ out:
 	mutex_unlock(&dev->intf_state_mutex);
 }
 
-static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx)
+int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx)
 {
 	struct mlx5_priv *priv = &dev->priv;
 	int err;
@@ -1353,7 +1352,7 @@ err_health_init:
 	return err;
 }
 
-static void mlx5_mdev_uninit(struct mlx5_core_dev *dev)
+void mlx5_mdev_uninit(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
 
@@ -1696,6 +1695,10 @@ static int __init init(void)
 	if (err)
 		goto err_debug;
 
+	err = mlx5_sf_driver_register();
+	if (err)
+		goto err_sf;
+
 #ifdef CONFIG_MLX5_CORE_EN
 	err = mlx5e_init();
 	if (err) {
@@ -1706,6 +1709,8 @@ static int __init init(void)
 
 	return 0;
 
+err_sf:
+	pci_unregister_driver(&mlx5_core_driver);
 err_debug:
 	mlx5_unregister_debugfs();
 	return err;
@@ -1716,6 +1721,7 @@ static void __exit cleanup(void)
 #ifdef CONFIG_MLX5_CORE_EN
 	mlx5e_cleanup();
 #endif
+	mlx5_sf_driver_unregister();
 	pci_unregister_driver(&mlx5_core_driver);
 	mlx5_unregister_debugfs();
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index a33b7496d748..3754ef98554f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -117,6 +117,8 @@ enum mlx5_semaphore_space_address {
 	MLX5_SEMAPHORE_SW_RESET         = 0x20,
 };
 
+#define MLX5_DEFAULT_PROF       2
+
 int mlx5_query_hca_caps(struct mlx5_core_dev *dev);
 int mlx5_query_board_id(struct mlx5_core_dev *dev);
 int mlx5_cmd_init(struct mlx5_core_dev *dev);
@@ -176,6 +178,7 @@ struct cpumask *
 mlx5_irq_get_affinity_mask(struct mlx5_irq_table *irq_table, int vecidx);
 struct cpu_rmap *mlx5_irq_get_rmap(struct mlx5_irq_table *table);
 int mlx5_irq_get_num_comp(struct mlx5_irq_table *table);
+struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev);
 
 int mlx5_events_init(struct mlx5_core_dev *dev);
 void mlx5_events_cleanup(struct mlx5_core_dev *dev);
@@ -257,6 +260,13 @@ enum {
 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev);
 void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state);
 
+static inline bool mlx5_core_is_sf(const struct mlx5_core_dev *dev)
+{
+	return dev->coredev_type == MLX5_COREDEV_SF;
+}
+
+int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx);
+void mlx5_mdev_uninit(struct mlx5_core_dev *dev);
 void mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup);
 int mlx5_load_one(struct mlx5_core_dev *dev, bool boot);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 6fd974920394..a61e09aff152 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -30,6 +30,9 @@ int mlx5_irq_table_init(struct mlx5_core_dev *dev)
 {
 	struct mlx5_irq_table *irq_table;
 
+	if (mlx5_core_is_sf(dev))
+		return 0;
+
 	irq_table = kvzalloc(sizeof(*irq_table), GFP_KERNEL);
 	if (!irq_table)
 		return -ENOMEM;
@@ -40,6 +43,9 @@ int mlx5_irq_table_init(struct mlx5_core_dev *dev)
 
 void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
 {
+	if (mlx5_core_is_sf(dev))
+		return;
+
 	kvfree(dev->priv.irq_table);
 }
 
@@ -268,6 +274,9 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev)
 	int nvec;
 	int err;
 
+	if (mlx5_core_is_sf(dev))
+		return 0;
+
 	nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
 	       MLX5_IRQ_VEC_COMP_BASE;
 	nvec = min_t(int, nvec, num_eqs);
@@ -319,6 +328,9 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
 	struct mlx5_irq_table *table = dev->priv.irq_table;
 	int i;
 
+	if (mlx5_core_is_sf(dev))
+		return;
+
 	/* free_irq requires that affinity and rmap will be cleared
 	 * before calling it. This is why there is asymmetry with set_rmap
 	 * which should be called after alloc_irq but before request_irq.
@@ -332,3 +344,11 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
 	kfree(table->irq);
 }
 
+struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev)
+{
+#ifdef CONFIG_MLX5_SF
+	if (mlx5_core_is_sf(dev))
+		return dev->priv.parent_mdev->priv.irq_table;
+#endif
+	return dev->priv.irq_table;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
index 4a8eeb7c853e..b265f27b2166 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
@@ -24,6 +24,16 @@ static bool mlx5_sf_dev_supported(const struct mlx5_core_dev *dev)
 	return MLX5_CAP_GEN(dev, sf) && mlx5_vhca_event_supported(dev);
 }
 
+bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev)
+{
+	struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table;
+
+	if (!mlx5_sf_dev_supported(dev))
+		return false;
+
+	return !xa_empty(&table->devices);
+}
+
 static ssize_t sfnum_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	struct auxiliary_device *adev = container_of(dev, struct auxiliary_device, dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
index a6fb7289ba2c..4de02902aef1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h
@@ -13,6 +13,7 @@
 struct mlx5_sf_dev {
 	struct auxiliary_device adev;
 	struct mlx5_core_dev *parent_mdev;
+	struct mlx5_core_dev *mdev;
 	phys_addr_t bar_base_addr;
 	u32 sfnum;
 };
@@ -20,6 +21,11 @@ struct mlx5_sf_dev {
 void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev);
 void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev);
 
+int mlx5_sf_driver_register(void);
+void mlx5_sf_driver_unregister(void);
+
+bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev);
+
 #else
 
 static inline void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev)
@@ -30,6 +36,20 @@ static inline void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev)
 {
 }
 
+static inline int mlx5_sf_driver_register(void)
+{
+	return 0;
+}
+
+static inline void mlx5_sf_driver_unregister(void)
+{
+}
+
+static inline bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev)
+{
+	return 0;
+}
+
 #endif
 
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
new file mode 100644
index 000000000000..daf63a8115e0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/device.h>
+#include "mlx5_core.h"
+#include "dev.h"
+#include "devlink.h"
+
+static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id)
+{
+	struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev);
+	struct mlx5_core_dev *mdev;
+	struct devlink *devlink;
+	int err;
+
+	devlink = mlx5_devlink_alloc();
+	if (!devlink)
+		return -ENOMEM;
+
+	mdev = devlink_priv(devlink);
+	mdev->device = &adev->dev;
+	mdev->pdev = sf_dev->parent_mdev->pdev;
+	mdev->bar_addr = sf_dev->bar_base_addr;
+	mdev->iseg_base = sf_dev->bar_base_addr;
+	mdev->coredev_type = MLX5_COREDEV_SF;
+	mdev->priv.parent_mdev = sf_dev->parent_mdev;
+	mdev->priv.adev_idx = adev->id;
+	sf_dev->mdev = mdev;
+
+	err = mlx5_mdev_init(mdev, MLX5_DEFAULT_PROF);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_mdev_init on err=%d\n", err);
+		goto mdev_err;
+	}
+
+	mdev->iseg = ioremap(mdev->iseg_base, sizeof(*mdev->iseg));
+	if (!mdev->iseg) {
+		mlx5_core_warn(mdev, "remap error\n");
+		goto remap_err;
+	}
+
+	err = mlx5_load_one(mdev, true);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_load_one err=%d\n", err);
+		goto load_one_err;
+	}
+	return 0;
+
+load_one_err:
+	iounmap(mdev->iseg);
+remap_err:
+	mlx5_mdev_uninit(mdev);
+mdev_err:
+	mlx5_devlink_free(devlink);
+	return err;
+}
+
+static void mlx5_sf_dev_remove(struct auxiliary_device *adev)
+{
+	struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev);
+	struct devlink *devlink;
+
+	devlink = priv_to_devlink(sf_dev->mdev);
+	mlx5_unload_one(sf_dev->mdev, true);
+	iounmap(sf_dev->mdev->iseg);
+	mlx5_mdev_uninit(sf_dev->mdev);
+	mlx5_devlink_free(devlink);
+}
+
+static void mlx5_sf_dev_shutdown(struct auxiliary_device *adev)
+{
+	struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev);
+
+	mlx5_unload_one(sf_dev->mdev, false);
+}
+
+static const struct auxiliary_device_id mlx5_sf_dev_id_table[] = {
+	{ .name = MLX5_ADEV_NAME "." MLX5_SF_DEV_ID_NAME, },
+	{ },
+};
+
+MODULE_DEVICE_TABLE(auxiliary, mlx5_sf_dev_id_table);
+
+static struct auxiliary_driver mlx5_sf_driver = {
+	.name = MLX5_SF_DEV_ID_NAME,
+	.probe = mlx5_sf_dev_probe,
+	.remove = mlx5_sf_dev_remove,
+	.shutdown = mlx5_sf_dev_shutdown,
+	.id_table = mlx5_sf_dev_id_table,
+};
+
+int mlx5_sf_driver_register(void)
+{
+	return auxiliary_driver_register(&mlx5_sf_driver);
+}
+
+void mlx5_sf_driver_unregister(void)
+{
+	auxiliary_driver_unregister(&mlx5_sf_driver);
+}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 08e5fbe97df0..48e3638b1185 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -193,7 +193,8 @@ enum port_state_policy {
 
 enum mlx5_coredev_type {
 	MLX5_COREDEV_PF,
-	MLX5_COREDEV_VF
+	MLX5_COREDEV_VF,
+	MLX5_COREDEV_SF,
 };
 
 struct mlx5_field_desc {
@@ -608,6 +609,7 @@ struct mlx5_priv {
 #ifdef CONFIG_MLX5_SF
 	struct mlx5_vhca_state_notifier *vhca_state_notifier;
 	struct mlx5_sf_dev_table *sf_dev_table;
+	struct mlx5_core_dev *parent_mdev;
 #endif
 };
 
-- 
cgit v1.2.3


From 8f01054186683fe0986d54d584aa13723d51edce Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Fri, 11 Dec 2020 22:12:21 -0800
Subject: net/mlx5: SF, Add port add delete functionality

To handle SF port management outside of the eswitch as independent
software layer, introduce eswitch notifier APIs so that mlx5 upper
layer who wish to support sf port management in switchdev mode can
perform its task whenever eswitch mode is set to switchdev or before
eswitch is disabled.

Initialize sf port table on such eswitch event.

Add SF port add and delete functionality in switchdev mode.
Destroy all SF ports when eswitch is disabled.
Expose SF port add and delete to user via devlink commands.

$ devlink dev eswitch set pci/0000:06:00.0 mode switchdev

$ devlink port show
pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false

$ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88
pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false
  function:
    hw_addr 00:00:00:00:00:00 state inactive opstate detached

$ devlink port show ens2f0npf0sf88
pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false
  function:
    hw_addr 00:00:00:00:00:00 state inactive opstate detached

or by its unique port index:
$ devlink port show pci/0000:06:00.0/32768
pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false
  function:
    hw_addr 00:00:00:00:00:00 state inactive opstate detached

$ devlink port show ens2f0npf0sf88 -jp
{
    "port": {
        "pci/0000:06:00.0/32768": {
            "type": "eth",
            "netdev": "ens2f0npf0sf88",
            "flavour": "pcisf",
            "controller": 0,
            "pfnum": 0,
            "sfnum": 88,
            "external": false,
            "splittable": false,
            "function": {
                "hw_addr": "00:00:00:00:00:00",
                "state": "inactive",
                "opstate": "detached"
            }
        }
    }
}

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Vu Pham <vuhuong@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig    |  10 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |   4 +
 drivers/net/ethernet/mellanox/mlx5/core/devlink.c  |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  25 ++
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  12 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  18 ++
 drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c   |  27 ++
 .../net/ethernet/mellanox/mlx5/core/sf/devlink.c   | 316 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/sf/hw_table.c  | 125 ++++++++
 drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h  |  17 ++
 drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h    |  37 +++
 include/linux/mlx5/driver.h                        |   6 +
 13 files changed, 607 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index d6c48582e7a8..ad45d20f9d44 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -212,3 +212,13 @@ config MLX5_SF
 	Build support for subfuction device in the NIC. A Mellanox subfunction
 	device can support RDMA, netdevice and vdpa device.
 	It is similar to a SRIOV VF but it doesn't require SRIOV support.
+
+config MLX5_SF_MANAGER
+	bool
+	depends on MLX5_SF && MLX5_ESWITCH
+	default y
+	help
+	Build support for subfuction port in the NIC. A Mellanox subfunction
+	port is managed through devlink.  A subfunction supports RDMA, netdevice
+	and vdpa device. It is similar to a SRIOV VF but it doesn't require
+	SRIOV support.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 43fa5efd403c..40ed31574c80 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -90,3 +90,8 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o
 # SF device
 #
 mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o
+
+#
+# SF manager
+#
+mlx5_core-$(CONFIG_MLX5_SF_MANAGER) += sf/cmd.o sf/hw_table.o sf/devlink.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 47dcc3ac2cf0..e8cecd50558d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -333,6 +333,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_DEALLOC_MEMIC:
 	case MLX5_CMD_OP_PAGE_FAULT_RESUME:
 	case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS:
+	case MLX5_CMD_OP_DEALLOC_SF:
 		return MLX5_CMD_STAT_OK;
 
 	case MLX5_CMD_OP_QUERY_HCA_CAP:
@@ -466,6 +467,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_RELEASE_XRQ_ERROR:
 	case MLX5_CMD_OP_QUERY_VHCA_STATE:
 	case MLX5_CMD_OP_MODIFY_VHCA_STATE:
+	case MLX5_CMD_OP_ALLOC_SF:
 		*status = MLX5_DRIVER_STATUS_ABORTED;
 		*synd = MLX5_DRIVER_SYND;
 		return -EIO;
@@ -661,6 +663,8 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(MODIFY_XRQ);
 	MLX5_COMMAND_STR_CASE(QUERY_VHCA_STATE);
 	MLX5_COMMAND_STR_CASE(MODIFY_VHCA_STATE);
+	MLX5_COMMAND_STR_CASE(ALLOC_SF);
+	MLX5_COMMAND_STR_CASE(DEALLOC_SF);
 	default: return "unknown command opcode";
 	}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 9afe918c5827..d4c0cdf5edd9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -8,6 +8,7 @@
 #include "fs_core.h"
 #include "eswitch.h"
 #include "sf/dev/dev.h"
+#include "sf/sf.h"
 
 static int mlx5_devlink_flash_update(struct devlink *devlink,
 				     struct devlink_flash_update_params *params,
@@ -190,6 +191,10 @@ static const struct devlink_ops mlx5_devlink_ops = {
 	.eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get,
 	.port_function_hw_addr_get = mlx5_devlink_port_function_hw_addr_get,
 	.port_function_hw_addr_set = mlx5_devlink_port_function_hw_addr_set,
+#endif
+#ifdef CONFIG_MLX5_SF_MANAGER
+	.port_new = mlx5_devlink_sf_port_new,
+	.port_del = mlx5_devlink_sf_port_del,
 #endif
 	.flash_update = mlx5_devlink_flash_update,
 	.info_get = mlx5_devlink_info_get,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 538d2e44a589..820305b1664e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1599,6 +1599,15 @@ mlx5_eswitch_update_num_of_vfs(struct mlx5_eswitch *esw, int num_vfs)
 	kvfree(out);
 }
 
+static void mlx5_esw_mode_change_notify(struct mlx5_eswitch *esw, u16 mode)
+{
+	struct mlx5_esw_event_info info = {};
+
+	info.new_mode = mode;
+
+	blocking_notifier_call_chain(&esw->n_head, 0, &info);
+}
+
 /**
  * mlx5_eswitch_enable_locked - Enable eswitch
  * @esw:	Pointer to eswitch
@@ -1659,6 +1668,8 @@ int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int mode, int num_vfs)
 		 mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
 		 esw->esw_funcs.num_vfs, esw->enabled_vports);
 
+	mlx5_esw_mode_change_notify(esw, mode);
+
 	return 0;
 
 abort:
@@ -1715,6 +1726,11 @@ void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw, bool clear_vf)
 		 esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
 		 esw->esw_funcs.num_vfs, esw->enabled_vports);
 
+	/* Notify eswitch users that it is exiting from current mode.
+	 * So that it can do necessary cleanup before the eswitch is disabled.
+	 */
+	mlx5_esw_mode_change_notify(esw, MLX5_ESWITCH_NONE);
+
 	mlx5_eswitch_event_handlers_unregister(esw);
 
 	if (esw->mode == MLX5_ESWITCH_LEGACY)
@@ -1815,6 +1831,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
 
 	dev->priv.eswitch = esw;
+	BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head);
 	return 0;
 abort:
 	if (esw->work_queue)
@@ -2506,4 +2523,12 @@ bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
 		dev1->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS);
 }
 
+int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&esw->n_head, nb);
+}
 
+void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *nb)
+{
+	blocking_notifier_chain_unregister(&esw->n_head, nb);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 54514b04808d..479d2ac2cd85 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -278,6 +278,7 @@ struct mlx5_eswitch {
 	struct {
 		u32             large_group_num;
 	}  params;
+	struct blocking_notifier_head n_head;
 };
 
 void esw_offloads_disable(struct mlx5_eswitch *esw);
@@ -733,6 +734,17 @@ int mlx5_esw_offloads_sf_vport_enable(struct mlx5_eswitch *esw, struct devlink_p
 				      u16 vport_num, u32 sfnum);
 void mlx5_esw_offloads_sf_vport_disable(struct mlx5_eswitch *esw, u16 vport_num);
 
+/**
+ * mlx5_esw_event_info - Indicates eswitch mode changed/changing.
+ *
+ * @new_mode: New mode of eswitch.
+ */
+struct mlx5_esw_event_info {
+	u16 new_mode;
+};
+
+int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *n);
+void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *n);
 #else  /* CONFIG_MLX5_ESWITCH */
 /* eswitch API stubs */
 static inline int  mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index ab68320e5516..d6bd09dd7490 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -893,6 +893,18 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 		goto err_fpga_cleanup;
 	}
 
+	err = mlx5_sf_hw_table_init(dev);
+	if (err) {
+		mlx5_core_err(dev, "Failed to init SF HW table %d\n", err);
+		goto err_sf_hw_table_cleanup;
+	}
+
+	err = mlx5_sf_table_init(dev);
+	if (err) {
+		mlx5_core_err(dev, "Failed to init SF table %d\n", err);
+		goto err_sf_table_cleanup;
+	}
+
 	dev->dm = mlx5_dm_create(dev);
 	if (IS_ERR(dev->dm))
 		mlx5_core_warn(dev, "Failed to init device memory%d\n", err);
@@ -903,6 +915,10 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 
 	return 0;
 
+err_sf_table_cleanup:
+	mlx5_sf_hw_table_cleanup(dev);
+err_sf_hw_table_cleanup:
+	mlx5_vhca_event_cleanup(dev);
 err_fpga_cleanup:
 	mlx5_fpga_cleanup(dev);
 err_eswitch_cleanup:
@@ -936,6 +952,8 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_hv_vhca_destroy(dev->hv_vhca);
 	mlx5_fw_tracer_destroy(dev->tracer);
 	mlx5_dm_cleanup(dev);
+	mlx5_sf_table_cleanup(dev);
+	mlx5_sf_hw_table_cleanup(dev);
 	mlx5_vhca_event_cleanup(dev);
 	mlx5_fpga_cleanup(dev);
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c
new file mode 100644
index 000000000000..0bc3075f34fa
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+
+#include <linux/mlx5/driver.h>
+#include "priv.h"
+
+int mlx5_cmd_alloc_sf(struct mlx5_core_dev *dev, u16 function_id)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_sf_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_sf_in)] = {};
+
+	MLX5_SET(alloc_sf_in, in, opcode, MLX5_CMD_OP_ALLOC_SF);
+	MLX5_SET(alloc_sf_in, in, function_id, function_id);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_dealloc_sf(struct mlx5_core_dev *dev, u16 function_id)
+{
+	u32 out[MLX5_ST_SZ_DW(dealloc_sf_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(dealloc_sf_in)] = {};
+
+	MLX5_SET(dealloc_sf_in, in, opcode, MLX5_CMD_OP_DEALLOC_SF);
+	MLX5_SET(dealloc_sf_in, in, function_id, function_id);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
new file mode 100644
index 000000000000..7ad0d210ec30
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+
+#include <linux/mlx5/driver.h>
+#include "eswitch.h"
+#include "priv.h"
+
+struct mlx5_sf {
+	struct devlink_port dl_port;
+	unsigned int port_index;
+	u16 id;
+};
+
+struct mlx5_sf_table {
+	struct mlx5_core_dev *dev; /* To refer from notifier context. */
+	struct xarray port_indices; /* port index based lookup. */
+	refcount_t refcount;
+	struct completion disable_complete;
+	struct notifier_block esw_nb;
+};
+
+static struct mlx5_sf *
+mlx5_sf_lookup_by_index(struct mlx5_sf_table *table, unsigned int port_index)
+{
+	return xa_load(&table->port_indices, port_index);
+}
+
+static int mlx5_sf_id_insert(struct mlx5_sf_table *table, struct mlx5_sf *sf)
+{
+	return xa_insert(&table->port_indices, sf->port_index, sf, GFP_KERNEL);
+}
+
+static void mlx5_sf_id_erase(struct mlx5_sf_table *table, struct mlx5_sf *sf)
+{
+	xa_erase(&table->port_indices, sf->port_index);
+}
+
+static struct mlx5_sf *
+mlx5_sf_alloc(struct mlx5_sf_table *table, u32 sfnum, struct netlink_ext_ack *extack)
+{
+	unsigned int dl_port_index;
+	struct mlx5_sf *sf;
+	u16 hw_fn_id;
+	int id_err;
+	int err;
+
+	id_err = mlx5_sf_hw_table_sf_alloc(table->dev, sfnum);
+	if (id_err < 0) {
+		err = id_err;
+		goto id_err;
+	}
+
+	sf = kzalloc(sizeof(*sf), GFP_KERNEL);
+	if (!sf) {
+		err = -ENOMEM;
+		goto alloc_err;
+	}
+	sf->id = id_err;
+	hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sf->id);
+	dl_port_index = mlx5_esw_vport_to_devlink_port_index(table->dev, hw_fn_id);
+	sf->port_index = dl_port_index;
+
+	err = mlx5_sf_id_insert(table, sf);
+	if (err)
+		goto insert_err;
+
+	return sf;
+
+insert_err:
+	kfree(sf);
+alloc_err:
+	mlx5_sf_hw_table_sf_free(table->dev, id_err);
+id_err:
+	if (err == -EEXIST)
+		NL_SET_ERR_MSG_MOD(extack, "SF already exist. Choose different sfnum");
+	return ERR_PTR(err);
+}
+
+static void mlx5_sf_free(struct mlx5_sf_table *table, struct mlx5_sf *sf)
+{
+	mlx5_sf_id_erase(table, sf);
+	mlx5_sf_hw_table_sf_free(table->dev, sf->id);
+	kfree(sf);
+}
+
+static struct mlx5_sf_table *mlx5_sf_table_try_get(struct mlx5_core_dev *dev)
+{
+	struct mlx5_sf_table *table = dev->priv.sf_table;
+
+	if (!table)
+		return NULL;
+
+	return refcount_inc_not_zero(&table->refcount) ? table : NULL;
+}
+
+static void mlx5_sf_table_put(struct mlx5_sf_table *table)
+{
+	if (refcount_dec_and_test(&table->refcount))
+		complete(&table->disable_complete);
+}
+
+static int mlx5_sf_add(struct mlx5_core_dev *dev, struct mlx5_sf_table *table,
+		       const struct devlink_port_new_attrs *new_attr,
+		       struct netlink_ext_ack *extack,
+		       unsigned int *new_port_index)
+{
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	struct mlx5_sf *sf;
+	u16 hw_fn_id;
+	int err;
+
+	sf = mlx5_sf_alloc(table, new_attr->sfnum, extack);
+	if (IS_ERR(sf))
+		return PTR_ERR(sf);
+
+	hw_fn_id = mlx5_sf_sw_to_hw_id(dev, sf->id);
+	err = mlx5_esw_offloads_sf_vport_enable(esw, &sf->dl_port, hw_fn_id, new_attr->sfnum);
+	if (err)
+		goto esw_err;
+	*new_port_index = sf->port_index;
+	return 0;
+
+esw_err:
+	mlx5_sf_free(table, sf);
+	return err;
+}
+
+static void mlx5_sf_del(struct mlx5_core_dev *dev, struct mlx5_sf_table *table, struct mlx5_sf *sf)
+{
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	u16 hw_fn_id;
+
+	hw_fn_id = mlx5_sf_sw_to_hw_id(dev, sf->id);
+	mlx5_esw_offloads_sf_vport_disable(esw, hw_fn_id);
+	mlx5_sf_free(table, sf);
+}
+
+static int
+mlx5_sf_new_check_attr(struct mlx5_core_dev *dev, const struct devlink_port_new_attrs *new_attr,
+		       struct netlink_ext_ack *extack)
+{
+	if (new_attr->flavour != DEVLINK_PORT_FLAVOUR_PCI_SF) {
+		NL_SET_ERR_MSG_MOD(extack, "Driver supports only SF port addition");
+		return -EOPNOTSUPP;
+	}
+	if (new_attr->port_index_valid) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Driver does not support user defined port index assignment");
+		return -EOPNOTSUPP;
+	}
+	if (!new_attr->sfnum_valid) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "User must provide unique sfnum. Driver does not support auto assignment");
+		return -EOPNOTSUPP;
+	}
+	if (new_attr->controller_valid && new_attr->controller) {
+		NL_SET_ERR_MSG_MOD(extack, "External controller is unsupported");
+		return -EOPNOTSUPP;
+	}
+	if (new_attr->pfnum != PCI_FUNC(dev->pdev->devfn)) {
+		NL_SET_ERR_MSG_MOD(extack, "Invalid pfnum supplied");
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
+int mlx5_devlink_sf_port_new(struct devlink *devlink,
+			     const struct devlink_port_new_attrs *new_attr,
+			     struct netlink_ext_ack *extack,
+			     unsigned int *new_port_index)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_sf_table *table;
+	int err;
+
+	err = mlx5_sf_new_check_attr(dev, new_attr, extack);
+	if (err)
+		return err;
+
+	table = mlx5_sf_table_try_get(dev);
+	if (!table) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Port add is only supported in eswitch switchdev mode or SF ports are disabled.");
+		return -EOPNOTSUPP;
+	}
+	err = mlx5_sf_add(dev, table, new_attr, extack, new_port_index);
+	mlx5_sf_table_put(table);
+	return err;
+}
+
+int mlx5_devlink_sf_port_del(struct devlink *devlink, unsigned int port_index,
+			     struct netlink_ext_ack *extack)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_sf_table *table;
+	struct mlx5_sf *sf;
+	int err = 0;
+
+	table = mlx5_sf_table_try_get(dev);
+	if (!table) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Port del is only supported in eswitch switchdev mode or SF ports are disabled.");
+		return -EOPNOTSUPP;
+	}
+	sf = mlx5_sf_lookup_by_index(table, port_index);
+	if (!sf) {
+		err = -ENODEV;
+		goto sf_err;
+	}
+
+	mlx5_sf_del(dev, table, sf);
+sf_err:
+	mlx5_sf_table_put(table);
+	return err;
+}
+
+static void mlx5_sf_destroy_all(struct mlx5_sf_table *table)
+{
+	struct mlx5_core_dev *dev = table->dev;
+	unsigned long index;
+	struct mlx5_sf *sf;
+
+	xa_for_each(&table->port_indices, index, sf)
+		mlx5_sf_del(dev, table, sf);
+}
+
+static void mlx5_sf_table_enable(struct mlx5_sf_table *table)
+{
+	if (!mlx5_sf_max_functions(table->dev))
+		return;
+
+	init_completion(&table->disable_complete);
+	refcount_set(&table->refcount, 1);
+}
+
+static void mlx5_sf_table_disable(struct mlx5_sf_table *table)
+{
+	if (!mlx5_sf_max_functions(table->dev))
+		return;
+
+	if (!refcount_read(&table->refcount))
+		return;
+
+	/* Balances with refcount_set; drop the reference so that new user cmd cannot start. */
+	mlx5_sf_table_put(table);
+	wait_for_completion(&table->disable_complete);
+
+	/* At this point, no new user commands can start.
+	 * It is safe to destroy all user created SFs.
+	 */
+	mlx5_sf_destroy_all(table);
+}
+
+static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, void *data)
+{
+	struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, esw_nb);
+	const struct mlx5_esw_event_info *mode = data;
+
+	switch (mode->new_mode) {
+	case MLX5_ESWITCH_OFFLOADS:
+		mlx5_sf_table_enable(table);
+		break;
+	case MLX5_ESWITCH_NONE:
+		mlx5_sf_table_disable(table);
+		break;
+	default:
+		break;
+	};
+
+	return 0;
+}
+
+static bool mlx5_sf_table_supported(const struct mlx5_core_dev *dev)
+{
+	return dev->priv.eswitch && MLX5_ESWITCH_MANAGER(dev) && mlx5_sf_supported(dev);
+}
+
+int mlx5_sf_table_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_sf_table *table;
+	int err;
+
+	if (!mlx5_sf_table_supported(dev))
+		return 0;
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	table->dev = dev;
+	xa_init(&table->port_indices);
+	dev->priv.sf_table = table;
+	table->esw_nb.notifier_call = mlx5_sf_esw_event;
+	err = mlx5_esw_event_notifier_register(dev->priv.eswitch, &table->esw_nb);
+	if (err)
+		goto reg_err;
+	return 0;
+
+reg_err:
+	kfree(table);
+	dev->priv.sf_table = NULL;
+	return err;
+}
+
+void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
+{
+	struct mlx5_sf_table *table = dev->priv.sf_table;
+
+	if (!table)
+		return;
+
+	mlx5_esw_event_notifier_unregister(dev->priv.eswitch, &table->esw_nb);
+	WARN_ON(refcount_read(&table->refcount));
+	WARN_ON(!xa_empty(&table->port_indices));
+	kfree(table);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
new file mode 100644
index 000000000000..c7757f399e8a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+#include <linux/mlx5/driver.h>
+#include "vhca_event.h"
+#include "priv.h"
+#include "sf.h"
+#include "ecpf.h"
+
+struct mlx5_sf_hw {
+	u32 usr_sfnum;
+	u8 allocated: 1;
+};
+
+struct mlx5_sf_hw_table {
+	struct mlx5_core_dev *dev;
+	struct mlx5_sf_hw *sfs;
+	int max_local_functions;
+	u8 ecpu: 1;
+};
+
+u16 mlx5_sf_sw_to_hw_id(const struct mlx5_core_dev *dev, u16 sw_id)
+{
+	return sw_id + mlx5_sf_start_function_id(dev);
+}
+
+int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum)
+{
+	struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table;
+	int sw_id = -ENOSPC;
+	u16 hw_fn_id;
+	int err;
+	int i;
+
+	if (!table->max_local_functions)
+		return -EOPNOTSUPP;
+
+	/* Check if sf with same sfnum already exists or not. */
+	for (i = 0; i < table->max_local_functions; i++) {
+		if (table->sfs[i].allocated && table->sfs[i].usr_sfnum == usr_sfnum)
+			return -EEXIST;
+	}
+
+	/* Find the free entry and allocate the entry from the array */
+	for (i = 0; i < table->max_local_functions; i++) {
+		if (!table->sfs[i].allocated) {
+			table->sfs[i].usr_sfnum = usr_sfnum;
+			table->sfs[i].allocated = true;
+			sw_id = i;
+			break;
+		}
+	}
+	if (sw_id == -ENOSPC) {
+		err = -ENOSPC;
+		goto err;
+	}
+
+	hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sw_id);
+	err = mlx5_cmd_alloc_sf(table->dev, hw_fn_id);
+	if (err)
+		goto err;
+
+	err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, table->ecpu, usr_sfnum);
+	if (err)
+		goto vhca_err;
+
+	return sw_id;
+
+vhca_err:
+	mlx5_cmd_dealloc_sf(table->dev, hw_fn_id);
+err:
+	table->sfs[i].allocated = false;
+	return err;
+}
+
+void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u16 id)
+{
+	struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table;
+	u16 hw_fn_id;
+
+	hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, id);
+	mlx5_cmd_dealloc_sf(table->dev, hw_fn_id);
+	table->sfs[id].allocated = false;
+}
+
+int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_sf_hw_table *table;
+	struct mlx5_sf_hw *sfs;
+	int max_functions;
+
+	if (!mlx5_sf_supported(dev))
+		return 0;
+
+	max_functions = mlx5_sf_max_functions(dev);
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	sfs = kcalloc(max_functions, sizeof(*sfs), GFP_KERNEL);
+	if (!sfs)
+		goto table_err;
+
+	table->dev = dev;
+	table->sfs = sfs;
+	table->max_local_functions = max_functions;
+	table->ecpu = mlx5_read_embedded_cpu(dev);
+	dev->priv.sf_hw_table = table;
+	mlx5_core_dbg(dev, "SF HW table: max sfs = %d\n", max_functions);
+	return 0;
+
+table_err:
+	kfree(table);
+	return -ENOMEM;
+}
+
+void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev)
+{
+	struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table;
+
+	if (!table)
+		return;
+
+	kfree(table->sfs);
+	kfree(table);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h
new file mode 100644
index 000000000000..7f3622375a9c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020 Mellanox Technologies Ltd */
+
+#ifndef __MLX5_SF_PRIV_H__
+#define __MLX5_SF_PRIV_H__
+
+#include <linux/mlx5/driver.h>
+
+int mlx5_cmd_alloc_sf(struct mlx5_core_dev *dev, u16 function_id);
+int mlx5_cmd_dealloc_sf(struct mlx5_core_dev *dev, u16 function_id);
+
+u16 mlx5_sf_sw_to_hw_id(const struct mlx5_core_dev *dev, u16 sw_id);
+
+int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum);
+void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u16 id);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
index 623191679b49..31278dc42e72 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
@@ -42,4 +42,41 @@ static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev)
 
 #endif
 
+#ifdef CONFIG_MLX5_SF_MANAGER
+
+int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev);
+void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev);
+
+int mlx5_sf_table_init(struct mlx5_core_dev *dev);
+void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev);
+
+int mlx5_devlink_sf_port_new(struct devlink *devlink,
+			     const struct devlink_port_new_attrs *add_attr,
+			     struct netlink_ext_ack *extack,
+			     unsigned int *new_port_index);
+int mlx5_devlink_sf_port_del(struct devlink *devlink, unsigned int port_index,
+			     struct netlink_ext_ack *extack);
+
+#else
+
+static inline int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev)
+{
+	return 0;
+}
+
+static inline void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev)
+{
+}
+
+static inline int mlx5_sf_table_init(struct mlx5_core_dev *dev)
+{
+	return 0;
+}
+
+static inline void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
+{
+}
+
+#endif
+
 #endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 48e3638b1185..7e357c7f0d5e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -510,6 +510,8 @@ struct mlx5_eq_table;
 struct mlx5_irq_table;
 struct mlx5_vhca_state_notifier;
 struct mlx5_sf_dev_table;
+struct mlx5_sf_hw_table;
+struct mlx5_sf_table;
 
 struct mlx5_rate_limit {
 	u32			rate;
@@ -611,6 +613,10 @@ struct mlx5_priv {
 	struct mlx5_sf_dev_table *sf_dev_table;
 	struct mlx5_core_dev *parent_mdev;
 #endif
+#ifdef CONFIG_MLX5_SF_MANAGER
+	struct mlx5_sf_hw_table *sf_hw_table;
+	struct mlx5_sf_table *sf_table;
+#endif
 };
 
 enum mlx5_device_state {
-- 
cgit v1.2.3


From 8e7f37f2aaa56b723a24f6872817cf9c6410b613 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 7 Dec 2020 17:41:02 -0800
Subject: mm: Add mem_dump_obj() to print source of memory block

There are kernel facilities such as per-CPU reference counts that give
error messages in generic handlers or callbacks, whose messages are
unenlightening.  In the case of per-CPU reference-count underflow, this
is not a problem when creating a new use of this facility because in that
case the bug is almost certainly in the code implementing that new use.
However, trouble arises when deploying across many systems, which might
exercise corner cases that were not seen during development and testing.
Here, it would be really nice to get some kind of hint as to which of
several uses the underflow was caused by.

This commit therefore exposes a mem_dump_obj() function that takes
a pointer to memory (which must still be allocated if it has been
dynamically allocated) and prints available information on where that
memory came from.  This pointer can reference the middle of the block as
well as the beginning of the block, as needed by things like RCU callback
functions and timer handlers that might not know where the beginning of
the memory block is.  These functions and handlers can use mem_dump_obj()
to print out better hints as to where the problem might lie.

The information printed can depend on kernel configuration.  For example,
the allocation return address can be printed only for slab and slub,
and even then only when the necessary debug has been enabled.  For slab,
build with CONFIG_DEBUG_SLAB=y, and either use sizes with ample space
to the next power of two or use the SLAB_STORE_USER when creating the
kmem_cache structure.  For slub, build with CONFIG_SLUB_DEBUG=y and
boot with slub_debug=U, or pass SLAB_STORE_USER to kmem_cache_create()
if more focused use is desired.  Also for slub, use CONFIG_STACKTRACE
to enable printing of the allocation-time stack trace.

Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>
Reported-by: Andrii Nakryiko <andrii@kernel.org>
[ paulmck: Convert to printing and change names per Joonsoo Kim. ]
[ paulmck: Move slab definition per Stephen Rothwell and kbuild test robot. ]
[ paulmck: Handle CONFIG_MMU=n case where vmalloc() is kmalloc(). ]
[ paulmck: Apply Vlastimil Babka feedback on slab.c kmem_provenance(). ]
[ paulmck: Extract more info from !SLUB_DEBUG per Joonsoo Kim. ]
[ paulmck: Explicitly check for small pointers per Naresh Kamboju. ]
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/mm.h   |  2 ++
 include/linux/slab.h |  2 ++
 mm/slab.c            | 20 ++++++++++++++
 mm/slab.h            | 12 +++++++++
 mm/slab_common.c     | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/slob.c            |  6 +++++
 mm/slub.c            | 40 ++++++++++++++++++++++++++++
 mm/util.c            | 24 +++++++++++++++++
 8 files changed, 181 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5299b90a6c40..af7d050900e7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3169,5 +3169,7 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
 
 extern int sysctl_nr_trim_pages;
 
+void mem_dump_obj(void *object);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index be4ba5867ac5..7ae604076767 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -186,6 +186,8 @@ void kfree(const void *);
 void kfree_sensitive(const void *);
 size_t __ksize(const void *);
 size_t ksize(const void *);
+bool kmem_valid_obj(void *object);
+void kmem_dump_obj(void *object);
 
 #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
 void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
diff --git a/mm/slab.c b/mm/slab.c
index d7c8da9319c7..dcc55e78f353 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3635,6 +3635,26 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
 EXPORT_SYMBOL(__kmalloc_node_track_caller);
 #endif /* CONFIG_NUMA */
 
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+{
+	struct kmem_cache *cachep;
+	unsigned int objnr;
+	void *objp;
+
+	kpp->kp_ptr = object;
+	kpp->kp_page = page;
+	cachep = page->slab_cache;
+	kpp->kp_slab_cache = cachep;
+	objp = object - obj_offset(cachep);
+	kpp->kp_data_offset = obj_offset(cachep);
+	page = virt_to_head_page(objp);
+	objnr = obj_to_index(cachep, page, objp);
+	objp = index_to_obj(cachep, page, objnr);
+	kpp->kp_objp = objp;
+	if (DEBUG && cachep->flags & SLAB_STORE_USER)
+		kpp->kp_ret = *dbg_userword(cachep, objp);
+}
+
 /**
  * __do_kmalloc - allocate memory
  * @size: how many bytes of memory are required.
diff --git a/mm/slab.h b/mm/slab.h
index 1a756a359fa8..ecad9b57bc44 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -615,4 +615,16 @@ static inline bool slab_want_init_on_free(struct kmem_cache *c)
 	return false;
 }
 
+#define KS_ADDRS_COUNT 16
+struct kmem_obj_info {
+	void *kp_ptr;
+	struct page *kp_page;
+	void *kp_objp;
+	unsigned long kp_data_offset;
+	struct kmem_cache *kp_slab_cache;
+	void *kp_ret;
+	void *kp_stack[KS_ADDRS_COUNT];
+};
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
+
 #endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e981c80d216c..adbace4256ef 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -537,6 +537,81 @@ bool slab_is_available(void)
 	return slab_state >= UP;
 }
 
+/**
+ * kmem_valid_obj - does the pointer reference a valid slab object?
+ * @object: pointer to query.
+ *
+ * Return: %true if the pointer is to a not-yet-freed object from
+ * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
+ * is to an already-freed object, and %false otherwise.
+ */
+bool kmem_valid_obj(void *object)
+{
+	struct page *page;
+
+	/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
+	if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
+		return false;
+	page = virt_to_head_page(object);
+	return PageSlab(page);
+}
+
+/**
+ * kmem_dump_obj - Print available slab provenance information
+ * @object: slab object for which to find provenance information.
+ *
+ * This function uses pr_cont(), so that the caller is expected to have
+ * printed out whatever preamble is appropriate.  The provenance information
+ * depends on the type of object and on how much debugging is enabled.
+ * For a slab-cache object, the fact that it is a slab object is printed,
+ * and, if available, the slab name, return address, and stack trace from
+ * the allocation of that object.
+ *
+ * This function will splat if passed a pointer to a non-slab object.
+ * If you are not sure what type of object you have, you should instead
+ * use mem_dump_obj().
+ */
+void kmem_dump_obj(void *object)
+{
+	char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
+	int i;
+	struct page *page;
+	unsigned long ptroffset;
+	struct kmem_obj_info kp = { };
+
+	if (WARN_ON_ONCE(!virt_addr_valid(object)))
+		return;
+	page = virt_to_head_page(object);
+	if (WARN_ON_ONCE(!PageSlab(page))) {
+		pr_cont(" non-slab memory.\n");
+		return;
+	}
+	kmem_obj_info(&kp, object, page);
+	if (kp.kp_slab_cache)
+		pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
+	else
+		pr_cont(" slab%s", cp);
+	if (kp.kp_objp)
+		pr_cont(" start %px", kp.kp_objp);
+	if (kp.kp_data_offset)
+		pr_cont(" data offset %lu", kp.kp_data_offset);
+	if (kp.kp_objp) {
+		ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
+		pr_cont(" pointer offset %lu", ptroffset);
+	}
+	if (kp.kp_slab_cache && kp.kp_slab_cache->usersize)
+		pr_cont(" size %u", kp.kp_slab_cache->usersize);
+	if (kp.kp_ret)
+		pr_cont(" allocated at %pS\n", kp.kp_ret);
+	else
+		pr_cont("\n");
+	for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
+		if (!kp.kp_stack[i])
+			break;
+		pr_info("    %pS\n", kp.kp_stack[i]);
+	}
+}
+
 #ifndef CONFIG_SLOB
 /* Create a cache during boot when no slab services are available yet */
 void __init create_boot_cache(struct kmem_cache *s, const char *name,
diff --git a/mm/slob.c b/mm/slob.c
index 8d4bfa46247f..ef87ada8705d 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -461,6 +461,12 @@ out:
 	spin_unlock_irqrestore(&slob_lock, flags);
 }
 
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+{
+	kpp->kp_ptr = object;
+	kpp->kp_page = page;
+}
+
 /*
  * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
  */
diff --git a/mm/slub.c b/mm/slub.c
index 0c8b43a5b3b0..3c1a84316fd7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3919,6 +3919,46 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	return 0;
 }
 
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+{
+	void *base;
+	int __maybe_unused i;
+	unsigned int objnr;
+	void *objp;
+	void *objp0;
+	struct kmem_cache *s = page->slab_cache;
+	struct track __maybe_unused *trackp;
+
+	kpp->kp_ptr = object;
+	kpp->kp_page = page;
+	kpp->kp_slab_cache = s;
+	base = page_address(page);
+	objp0 = kasan_reset_tag(object);
+#ifdef CONFIG_SLUB_DEBUG
+	objp = restore_red_left(s, objp0);
+#else
+	objp = objp0;
+#endif
+	objnr = obj_to_index(s, page, objp);
+	kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
+	objp = base + s->size * objnr;
+	kpp->kp_objp = objp;
+	if (WARN_ON_ONCE(objp < base || objp >= base + page->objects * s->size || (objp - base) % s->size) ||
+	    !(s->flags & SLAB_STORE_USER))
+		return;
+#ifdef CONFIG_SLUB_DEBUG
+	trackp = get_track(s, objp, TRACK_ALLOC);
+	kpp->kp_ret = (void *)trackp->addr;
+#ifdef CONFIG_STACKTRACE
+	for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
+		kpp->kp_stack[i] = (void *)trackp->addrs[i];
+		if (!kpp->kp_stack[i])
+			break;
+	}
+#endif
+#endif
+}
+
 /********************************************************************
  *		Kmalloc subsystem
  *******************************************************************/
diff --git a/mm/util.c b/mm/util.c
index 8c9b7d1e7c49..da46f9d6c382 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -982,3 +982,27 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
 	kunmap_atomic(addr1);
 	return ret;
 }
+
+/**
+ * mem_dump_obj - Print available provenance information
+ * @object: object for which to find provenance information.
+ *
+ * This function uses pr_cont(), so that the caller is expected to have
+ * printed out whatever preamble is appropriate.  The provenance information
+ * depends on the type of object and on how much debugging is enabled.
+ * For example, for a slab-cache object, the slab name is printed, and,
+ * if available, the return address and stack trace from the allocation
+ * of that object.
+ */
+void mem_dump_obj(void *object)
+{
+	if (!virt_addr_valid(object)) {
+		pr_cont(" non-paged (local) memory.\n");
+		return;
+	}
+	if (kmem_valid_obj(object)) {
+		kmem_dump_obj(object);
+		return;
+	}
+	pr_cont(" non-slab memory.\n");
+}
-- 
cgit v1.2.3


From 98f180837a896ecedf8f7e12af22b57f271d43c9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 8 Dec 2020 16:13:57 -0800
Subject: mm: Make mem_dump_obj() handle vmalloc() memory

This commit adds vmalloc() support to mem_dump_obj().  Note that the
vmalloc_dump_obj() function combines the checking and dumping, in
contrast with the split between kmem_valid_obj() and kmem_dump_obj().
The reason for the difference is that the checking in the vmalloc()
case involves acquiring a global lock, and redundant acquisitions of
global locks should be avoided, even on not-so-fast paths.

Note that this change causes on-stack variables to be reported as
vmalloc() storage from kernel_clone() or similar, depending on the degree
of inlining that your compiler does.  This is likely more helpful than
the earlier "non-paged (local) memory".

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: <linux-mm@kvack.org>
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/vmalloc.h |  6 ++++++
 mm/util.c               | 14 ++++++++------
 mm/vmalloc.c            | 12 ++++++++++++
 3 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 80c0181c411d..c18f4751a704 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -246,4 +246,10 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 int register_vmap_purge_notifier(struct notifier_block *nb);
 int unregister_vmap_purge_notifier(struct notifier_block *nb);
 
+#ifdef CONFIG_MMU
+bool vmalloc_dump_obj(void *object);
+#else
+static inline bool vmalloc_dump_obj(void *object) { return false; }
+#endif
+
 #endif /* _LINUX_VMALLOC_H */
diff --git a/mm/util.c b/mm/util.c
index 92f23d2c1277..54870226cea6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -996,18 +996,20 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
  */
 void mem_dump_obj(void *object)
 {
+	if (kmem_valid_obj(object)) {
+		kmem_dump_obj(object);
+		return;
+	}
+	if (vmalloc_dump_obj(object))
+		return;
 	if (!virt_addr_valid(object)) {
 		if (object == NULL)
 			pr_cont(" NULL pointer.\n");
 		else if (object == ZERO_SIZE_PTR)
 			pr_cont(" zero-size pointer.\n");
 		else
-			pr_cont(" non-paged (local) memory.\n");
-		return;
-	}
-	if (kmem_valid_obj(object)) {
-		kmem_dump_obj(object);
+			pr_cont(" non-paged memory.\n");
 		return;
 	}
-	pr_cont(" non-slab memory.\n");
+	pr_cont(" non-slab/vmalloc memory.\n");
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4d88fe5a277a..c274ea4f14ea 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3448,6 +3448,18 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 }
 #endif	/* CONFIG_SMP */
 
+bool vmalloc_dump_obj(void *object)
+{
+	struct vm_struct *vm;
+	void *objp = (void *)PAGE_ALIGN((unsigned long)object);
+
+	vm = find_vm_area(objp);
+	if (!vm)
+		return false;
+	pr_cont(" vmalloc allocated at %pS\n", vm->caller);
+	return true;
+}
+
 #ifdef CONFIG_PROC_FS
 static void *s_start(struct seq_file *m, loff_t *pos)
 	__acquires(&vmap_purge_lock)
-- 
cgit v1.2.3


From e7ed11ee945438b737e2ae2370e35591e16ec371 Mon Sep 17 00:00:00 2001
From: Yousuk Seung <ysseung@google.com>
Date: Wed, 20 Jan 2021 12:41:55 -0800
Subject: tcp: add TTL to SCM_TIMESTAMPING_OPT_STATS

This patch adds TCP_NLA_TTL to SCM_TIMESTAMPING_OPT_STATS that exports
the time-to-live or hop limit of the latest incoming packet with
SCM_TSTAMP_ACK. The value exported may not be from the packet that acks
the sequence when incoming packets are aggregated. Exporting the
time-to-live or hop limit value of incoming packets helps to estimate
the hop count of the path of the flow that may change over time.

Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Link: https://lore.kernel.org/r/20210120204155.552275-1-ysseung@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h   |  2 +-
 include/linux/tcp.h      |  3 ++-
 include/uapi/linux/tcp.h |  1 +
 net/core/dev.c           |  2 +-
 net/core/skbuff.c        |  6 ++++--
 net/ipv4/tcp.c           | 18 +++++++++++++++++-
 net/ipv4/tcp_input.c     | 16 ++++++++--------
 7 files changed, 34 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 186dad231e30..9313b5aaf45b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3859,7 +3859,7 @@ static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
 void skb_complete_tx_timestamp(struct sk_buff *skb,
 			       struct skb_shared_hwtstamps *hwtstamps);
 
-void __skb_tstamp_tx(struct sk_buff *orig_skb,
+void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb,
 		     struct skb_shared_hwtstamps *hwtstamps,
 		     struct sock *sk, int tstype);
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2f87377e9af7..48d8a363319e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -496,7 +496,8 @@ static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn)
 }
 
 struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
-					       const struct sk_buff *orig_skb);
+					       const struct sk_buff *orig_skb,
+					       const struct sk_buff *ack_skb);
 
 static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
 {
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 768e93bd5b51..16dfa40bdac3 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -314,6 +314,7 @@ enum {
 	TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */
 	TCP_NLA_BYTES_NOTSENT,	/* Bytes in write queue not yet sent */
 	TCP_NLA_EDT,		/* Earliest departure time (CLOCK_MONOTONIC) */
+	TCP_NLA_TTL,		/* TTL or hop limit of a packet received */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/core/dev.c b/net/core/dev.c
index d9ce02e95992..6df3f1bcdc68 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4084,7 +4084,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 	skb_reset_mac_header(skb);
 
 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
-		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
+		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
 
 	/* Disable soft irqs for various locks below. Also
 	 * stops preemption for RCU.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 145503d3f06b..2af12f7e170c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4721,6 +4721,7 @@ err:
 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
 
 void __skb_tstamp_tx(struct sk_buff *orig_skb,
+		     const struct sk_buff *ack_skb,
 		     struct skb_shared_hwtstamps *hwtstamps,
 		     struct sock *sk, int tstype)
 {
@@ -4743,7 +4744,8 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
 		    sk->sk_protocol == IPPROTO_TCP &&
 		    sk->sk_type == SOCK_STREAM) {
-			skb = tcp_get_timestamping_opt_stats(sk, orig_skb);
+			skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
+							     ack_skb);
 			opt_stats = true;
 		} else
 #endif
@@ -4772,7 +4774,7 @@ EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
 void skb_tstamp_tx(struct sk_buff *orig_skb,
 		   struct skb_shared_hwtstamps *hwtstamps)
 {
-	return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
+	return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
 			       SCM_TSTAMP_SND);
 }
 EXPORT_SYMBOL_GPL(skb_tstamp_tx);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 856ae516ac18..a1a17b64f8cd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3767,11 +3767,24 @@ static size_t tcp_opt_stats_get_size(void)
 		nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
 		nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
 		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
+		nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
 		0;
 }
 
+/* Returns TTL or hop limit of an incoming packet from skb. */
+static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return ip_hdr(skb)->ttl;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		return ipv6_hdr(skb)->hop_limit;
+	else
+		return 0;
+}
+
 struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
-					       const struct sk_buff *orig_skb)
+					       const struct sk_buff *orig_skb,
+					       const struct sk_buff *ack_skb)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *stats;
@@ -3827,6 +3840,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 		    max_t(int, 0, tp->write_seq - tp->snd_nxt));
 	nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
 			  TCP_NLA_PAD);
+	if (ack_skb)
+		nla_put_u8(stats, TCP_NLA_TTL,
+			   tcp_skb_ttl_or_hop_limit(ack_skb));
 
 	return stats;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a7dfca0a38cd..d4f66aba9fd8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3145,7 +3145,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 }
 
 static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
-			   u32 prior_snd_una)
+			   const struct sk_buff *ack_skb, u32 prior_snd_una)
 {
 	const struct skb_shared_info *shinfo;
 
@@ -3157,7 +3157,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 	if (!before(shinfo->tskey, prior_snd_una) &&
 	    before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
 		tcp_skb_tsorted_save(skb) {
-			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+			__skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK);
 		} tcp_skb_tsorted_restore(skb);
 	}
 }
@@ -3166,8 +3166,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  * is before the ack sequence we can discard it as it's confirmed to have
  * arrived at the other end.
  */
-static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
-			       u32 prior_snd_una,
+static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
+			       u32 prior_fack, u32 prior_snd_una,
 			       struct tcp_sacktag_state *sack, bool ece_ack)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3256,7 +3256,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 		if (!fully_acked)
 			break;
 
-		tcp_ack_tstamp(sk, skb, prior_snd_una);
+		tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
 
 		next = skb_rb_next(skb);
 		if (unlikely(skb == tp->retransmit_skb_hint))
@@ -3274,7 +3274,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
 		tp->snd_up = tp->snd_una;
 
 	if (skb) {
-		tcp_ack_tstamp(sk, skb, prior_snd_una);
+		tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
 		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
 			flag |= FLAG_SACK_RENEGING;
 	}
@@ -3809,8 +3809,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		goto no_queue;
 
 	/* See if we can take anything off of the retransmit queue. */
-	flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
-				    flag & FLAG_ECE);
+	flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una,
+				    &sack_state, flag & FLAG_ECE);
 
 	tcp_rack_update_reo_wnd(sk, &rs);
 
-- 
cgit v1.2.3


From d03b195b5aa015f6c11988b86a3625f8d5dbac52 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Tue, 19 Jan 2021 14:08:13 +0200
Subject: sch_htb: Hierarchical QoS hardware offload

HTB doesn't scale well because of contention on a single lock, and it
also consumes CPU. This patch adds support for offloading HTB to
hardware that supports hierarchical rate limiting.

In the offload mode, HTB passes control commands to the driver using
ndo_setup_tc. The driver has to replicate the whole hierarchy of classes
and their settings (rate, ceil) in the NIC. Every modification of the
HTB tree caused by the admin results in ndo_setup_tc being called.

After this setup, the HTB algorithm is done completely in the NIC. An SQ
(send queue) is created for every leaf class and attached to the
hierarchy, so that the NIC can calculate and obey aggregated rate
limits, too. In the future, it can be changed, so that multiple SQs will
back a single leaf class.

ndo_select_queue is responsible for selecting the right queue that
serves the traffic class of each packet.

The data path works as follows: a packet is classified by clsact, the
driver selects a hardware queue according to its class, and the packet
is enqueued into this queue's qdisc.

This solution addresses two main problems of scaling HTB:

1. Contention by flow classification. Currently the filters are attached
to the HTB instance as follows:

    # tc filter add dev eth0 parent 1:0 protocol ip flower dst_port 80
    classid 1:10

It's possible to move classification to clsact egress hook, which is
thread-safe and lock-free:

    # tc filter add dev eth0 egress protocol ip flower dst_port 80
    action skbedit priority 1:10

This way classification still happens in software, but the lock
contention is eliminated, and it happens before selecting the TX queue,
allowing the driver to translate the class to the corresponding hardware
queue in ndo_select_queue.

Note that this is already compatible with non-offloaded HTB and doesn't
require changes to the kernel nor iproute2.

2. Contention by handling packets. HTB is not multi-queue, it attaches
to a whole net device, and handling of all packets takes the same lock.
When HTB is offloaded, it registers itself as a multi-queue qdisc,
similarly to mq: HTB is attached to the netdev, and each queue has its
own qdisc.

Some features of HTB may be not supported by some particular hardware,
for example, the maximum number of classes may be limited, the
granularity of rate and ceil parameters may be different, etc. - so, the
offload is not enabled by default, a new parameter is used to enable it:

    # tc qdisc replace dev eth0 root handle 1: htb offload

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h            |   1 +
 include/net/pkt_cls.h                |  36 +++
 include/uapi/linux/pkt_sched.h       |   1 +
 net/sched/sch_htb.c                  | 501 +++++++++++++++++++++++++++++++++--
 tools/include/uapi/linux/pkt_sched.h |   1 +
 5 files changed, 512 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ef517254367d..9e8572533d8e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -858,6 +858,7 @@ enum tc_setup_type {
 	TC_SETUP_QDISC_ETS,
 	TC_SETUP_QDISC_TBF,
 	TC_SETUP_QDISC_FIFO,
+	TC_SETUP_QDISC_HTB,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 0f2a9c44171c..255e4f4b521f 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -783,6 +783,42 @@ struct tc_mq_qopt_offload {
 	};
 };
 
+enum tc_htb_command {
+	/* Root */
+	TC_HTB_CREATE, /* Initialize HTB offload. */
+	TC_HTB_DESTROY, /* Destroy HTB offload. */
+
+	/* Classes */
+	/* Allocate qid and create leaf. */
+	TC_HTB_LEAF_ALLOC_QUEUE,
+	/* Convert leaf to inner, preserve and return qid, create new leaf. */
+	TC_HTB_LEAF_TO_INNER,
+	/* Delete leaf, while siblings remain. */
+	TC_HTB_LEAF_DEL,
+	/* Delete leaf, convert parent to leaf, preserving qid. */
+	TC_HTB_LEAF_DEL_LAST,
+	/* TC_HTB_LEAF_DEL_LAST, but delete driver data on hardware errors. */
+	TC_HTB_LEAF_DEL_LAST_FORCE,
+	/* Modify parameters of a node. */
+	TC_HTB_NODE_MODIFY,
+
+	/* Class qdisc */
+	TC_HTB_LEAF_QUERY_QUEUE, /* Query qid by classid. */
+};
+
+struct tc_htb_qopt_offload {
+	struct netlink_ext_ack *extack;
+	enum tc_htb_command command;
+	u16 classid;
+	u32 parent_classid;
+	u16 qid;
+	u16 moved_qid;
+	u64 rate;
+	u64 ceil;
+};
+
+#define TC_HTB_CLASSID_ROOT U32_MAX
+
 enum tc_red_command {
 	TC_RED_REPLACE,
 	TC_RED_DESTROY,
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 9e7c2c607845..79a699f106b1 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -434,6 +434,7 @@ enum {
 	TCA_HTB_RATE64,
 	TCA_HTB_CEIL64,
 	TCA_HTB_PAD,
+	TCA_HTB_OFFLOAD,
 	__TCA_HTB_MAX,
 };
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index a8fc97b05bd8..d1b60fe3d311 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -174,6 +174,11 @@ struct htb_sched {
 	int			row_mask[TC_HTB_MAXDEPTH];
 
 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
+
+	struct Qdisc		**direct_qdiscs;
+	unsigned int            num_direct_qdiscs;
+
+	bool			offload;
 };
 
 /* find class in global hash table using given handle */
@@ -957,7 +962,7 @@ static void htb_reset(struct Qdisc *sch)
 			if (cl->level)
 				memset(&cl->inner, 0, sizeof(cl->inner));
 			else {
-				if (cl->leaf.q)
+				if (cl->leaf.q && !q->offload)
 					qdisc_reset(cl->leaf.q);
 			}
 			cl->prio_activity = 0;
@@ -980,6 +985,7 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
+	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
 };
 
 static void htb_work_func(struct work_struct *work)
@@ -992,12 +998,27 @@ static void htb_work_func(struct work_struct *work)
 	rcu_read_unlock();
 }
 
+static void htb_set_lockdep_class_child(struct Qdisc *q)
+{
+	static struct lock_class_key child_key;
+
+	lockdep_set_class(qdisc_lock(q), &child_key);
+}
+
+static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
+{
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
+}
+
 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
 		    struct netlink_ext_ack *extack)
 {
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_htb_qopt_offload offload_opt;
 	struct htb_sched *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_HTB_MAX + 1];
 	struct tc_htb_glob *gopt;
+	unsigned int ntx;
 	int err;
 
 	qdisc_watchdog_init(&q->watchdog, sch);
@@ -1022,9 +1043,26 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
 	if (gopt->version != HTB_VER >> 16)
 		return -EINVAL;
 
+	q->offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
+
+	if (q->offload) {
+		if (sch->parent != TC_H_ROOT)
+			return -EOPNOTSUPP;
+
+		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+			return -EOPNOTSUPP;
+
+		q->num_direct_qdiscs = dev->real_num_tx_queues;
+		q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
+					   sizeof(*q->direct_qdiscs),
+					   GFP_KERNEL);
+		if (!q->direct_qdiscs)
+			return -ENOMEM;
+	}
+
 	err = qdisc_class_hash_init(&q->clhash);
 	if (err < 0)
-		return err;
+		goto err_free_direct_qdiscs;
 
 	qdisc_skb_head_init(&q->direct_queue);
 
@@ -1037,7 +1075,107 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
 		q->rate2quantum = 1;
 	q->defcls = gopt->defcls;
 
+	if (!q->offload)
+		return 0;
+
+	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
+		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
+		struct Qdisc *qdisc;
+
+		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
+					  TC_H_MAKE(sch->handle, 0), extack);
+		if (!qdisc) {
+			err = -ENOMEM;
+			goto err_free_qdiscs;
+		}
+
+		htb_set_lockdep_class_child(qdisc);
+		q->direct_qdiscs[ntx] = qdisc;
+		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+	}
+
+	sch->flags |= TCQ_F_MQROOT;
+
+	offload_opt = (struct tc_htb_qopt_offload) {
+		.command = TC_HTB_CREATE,
+		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
+		.classid = TC_H_MIN(q->defcls),
+		.extack = extack,
+	};
+	err = htb_offload(dev, &offload_opt);
+	if (err)
+		goto err_free_qdiscs;
+
 	return 0;
+
+err_free_qdiscs:
+	/* TC_HTB_CREATE call failed, avoid any further calls to the driver. */
+	q->offload = false;
+
+	for (ntx = 0; ntx < q->num_direct_qdiscs && q->direct_qdiscs[ntx];
+	     ntx++)
+		qdisc_put(q->direct_qdiscs[ntx]);
+
+	qdisc_class_hash_destroy(&q->clhash);
+	/* Prevent use-after-free and double-free when htb_destroy gets called.
+	 */
+	q->clhash.hash = NULL;
+	q->clhash.hashsize = 0;
+
+err_free_direct_qdiscs:
+	kfree(q->direct_qdiscs);
+	q->direct_qdiscs = NULL;
+	return err;
+}
+
+static void htb_attach_offload(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct htb_sched *q = qdisc_priv(sch);
+	unsigned int ntx;
+
+	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
+		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
+
+		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+		qdisc_put(old);
+		qdisc_hash_add(qdisc, false);
+	}
+	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
+		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
+		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
+
+		qdisc_put(old);
+	}
+
+	kfree(q->direct_qdiscs);
+	q->direct_qdiscs = NULL;
+}
+
+static void htb_attach_software(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned int ntx;
+
+	/* Resemble qdisc_graft behavior. */
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
+		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
+
+		qdisc_refcount_inc(sch);
+
+		qdisc_put(old);
+	}
+}
+
+static void htb_attach(struct Qdisc *sch)
+{
+	struct htb_sched *q = qdisc_priv(sch);
+
+	if (q->offload)
+		htb_attach_offload(sch);
+	else
+		htb_attach_software(sch);
 }
 
 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -1046,6 +1184,11 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct nlattr *nest;
 	struct tc_htb_glob gopt;
 
+	if (q->offload)
+		sch->flags |= TCQ_F_OFFLOADED;
+	else
+		sch->flags &= ~TCQ_F_OFFLOADED;
+
 	sch->qstats.overlimits = q->overlimits;
 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
 	 * no change can happen on the qdisc parameters.
@@ -1063,6 +1206,8 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
 		goto nla_put_failure;
+	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
+		goto nla_put_failure;
 
 	return nla_nest_end(skb, nest);
 
@@ -1144,19 +1289,97 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
 }
 
+static struct netdev_queue *
+htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_htb_qopt_offload offload_opt;
+	int err;
+
+	offload_opt = (struct tc_htb_qopt_offload) {
+		.command = TC_HTB_LEAF_QUERY_QUEUE,
+		.classid = TC_H_MIN(tcm->tcm_parent),
+	};
+	err = htb_offload(dev, &offload_opt);
+	if (err || offload_opt.qid >= dev->num_tx_queues)
+		return NULL;
+	return netdev_get_tx_queue(dev, offload_opt.qid);
+}
+
+static struct Qdisc *
+htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
+{
+	struct net_device *dev = dev_queue->dev;
+	struct Qdisc *old_q;
+
+	if (dev->flags & IFF_UP)
+		dev_deactivate(dev);
+	old_q = dev_graft_qdisc(dev_queue, new_q);
+	if (new_q)
+		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+
+	return old_q;
+}
+
+static void htb_offload_move_qdisc(struct Qdisc *sch, u16 qid_old, u16 qid_new)
+{
+	struct netdev_queue *queue_old, *queue_new;
+	struct net_device *dev = qdisc_dev(sch);
+	struct Qdisc *qdisc;
+
+	queue_old = netdev_get_tx_queue(dev, qid_old);
+	queue_new = netdev_get_tx_queue(dev, qid_new);
+
+	if (dev->flags & IFF_UP)
+		dev_deactivate(dev);
+	qdisc = dev_graft_qdisc(queue_old, NULL);
+	qdisc->dev_queue = queue_new;
+	qdisc = dev_graft_qdisc(queue_new, qdisc);
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+
+	WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
+}
+
 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 		     struct Qdisc **old, struct netlink_ext_ack *extack)
 {
+	struct netdev_queue *dev_queue = sch->dev_queue;
 	struct htb_class *cl = (struct htb_class *)arg;
+	struct htb_sched *q = qdisc_priv(sch);
+	struct Qdisc *old_q;
 
 	if (cl->level)
 		return -EINVAL;
-	if (new == NULL &&
-	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
-				     cl->common.classid, extack)) == NULL)
-		return -ENOBUFS;
+
+	if (q->offload) {
+		dev_queue = new->dev_queue;
+		WARN_ON(dev_queue != cl->leaf.q->dev_queue);
+	}
+
+	if (!new) {
+		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
+					cl->common.classid, extack);
+		if (!new)
+			return -ENOBUFS;
+	}
+
+	if (q->offload) {
+		htb_set_lockdep_class_child(new);
+		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
+		qdisc_refcount_inc(new);
+		old_q = htb_graft_helper(dev_queue, new);
+	}
 
 	*old = qdisc_replace(sch, new, &cl->leaf.q);
+
+	if (q->offload) {
+		WARN_ON(old_q != *old);
+		qdisc_put(old_q);
+	}
+
 	return 0;
 }
 
@@ -1184,9 +1407,10 @@ static inline int htb_parent_last_child(struct htb_class *cl)
 	return 1;
 }
 
-static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
+static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
 			       struct Qdisc *new_q)
 {
+	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *parent = cl->parent;
 
 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
@@ -1204,6 +1428,71 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
 	parent->cmode = HTB_CAN_SEND;
 }
 
+static void htb_parent_to_leaf_offload(struct Qdisc *sch,
+				       struct netdev_queue *dev_queue,
+				       struct Qdisc *new_q)
+{
+	struct Qdisc *old_q;
+
+	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
+	qdisc_refcount_inc(new_q);
+	old_q = htb_graft_helper(dev_queue, new_q);
+	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
+}
+
+static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
+				     bool last_child, bool destroying,
+				     struct netlink_ext_ack *extack)
+{
+	struct tc_htb_qopt_offload offload_opt;
+	struct Qdisc *q = cl->leaf.q;
+	struct Qdisc *old = NULL;
+	int err;
+
+	if (cl->level)
+		return -EINVAL;
+
+	WARN_ON(!q);
+	if (!destroying) {
+		/* On destroy of HTB, two cases are possible:
+		 * 1. q is a normal qdisc, but q->dev_queue has noop qdisc.
+		 * 2. q is a noop qdisc (for nodes that were inner),
+		 *    q->dev_queue is noop_netdev_queue.
+		 */
+		old = htb_graft_helper(q->dev_queue, NULL);
+		WARN_ON(!old);
+		WARN_ON(old != q);
+	}
+
+	offload_opt = (struct tc_htb_qopt_offload) {
+		.command = !last_child ? TC_HTB_LEAF_DEL :
+			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
+			   TC_HTB_LEAF_DEL_LAST,
+		.classid = cl->common.classid,
+		.extack = extack,
+	};
+	err = htb_offload(qdisc_dev(sch), &offload_opt);
+
+	if (!err || destroying)
+		qdisc_put(old);
+	else
+		htb_graft_helper(q->dev_queue, old);
+
+	if (last_child)
+		return err;
+
+	if (!err && offload_opt.moved_qid != 0) {
+		if (destroying)
+			q->dev_queue = netdev_get_tx_queue(qdisc_dev(sch),
+							   offload_opt.qid);
+		else
+			htb_offload_move_qdisc(sch, offload_opt.moved_qid,
+					       offload_opt.qid);
+	}
+
+	return err;
+}
+
 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 {
 	if (!cl->level) {
@@ -1217,8 +1506,11 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 
 static void htb_destroy(struct Qdisc *sch)
 {
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_htb_qopt_offload offload_opt;
 	struct htb_sched *q = qdisc_priv(sch);
 	struct hlist_node *next;
+	bool nonempty, changed;
 	struct htb_class *cl;
 	unsigned int i;
 
@@ -1237,13 +1529,58 @@ static void htb_destroy(struct Qdisc *sch)
 			cl->block = NULL;
 		}
 	}
-	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
-					  common.hnode)
-			htb_destroy_class(sch, cl);
-	}
+
+	do {
+		nonempty = false;
+		changed = false;
+		for (i = 0; i < q->clhash.hashsize; i++) {
+			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+						  common.hnode) {
+				bool last_child;
+
+				if (!q->offload) {
+					htb_destroy_class(sch, cl);
+					continue;
+				}
+
+				nonempty = true;
+
+				if (cl->level)
+					continue;
+
+				changed = true;
+
+				last_child = htb_parent_last_child(cl);
+				htb_destroy_class_offload(sch, cl, last_child,
+							  true, NULL);
+				qdisc_class_hash_remove(&q->clhash,
+							&cl->common);
+				if (cl->parent)
+					cl->parent->children--;
+				if (last_child)
+					htb_parent_to_leaf(sch, cl, NULL);
+				htb_destroy_class(sch, cl);
+			}
+		}
+	} while (changed);
+	WARN_ON(nonempty);
+
 	qdisc_class_hash_destroy(&q->clhash);
 	__qdisc_reset_queue(&q->direct_queue);
+
+	if (!q->offload)
+		return;
+
+	offload_opt = (struct tc_htb_qopt_offload) {
+		.command = TC_HTB_DESTROY,
+	};
+	htb_offload(dev, &offload_opt);
+
+	if (!q->direct_qdiscs)
+		return;
+	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
+		qdisc_put(q->direct_qdiscs[i]);
+	kfree(q->direct_qdiscs);
 }
 
 static int htb_delete(struct Qdisc *sch, unsigned long arg,
@@ -1253,6 +1590,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg,
 	struct htb_class *cl = (struct htb_class *)arg;
 	struct Qdisc *new_q = NULL;
 	int last_child = 0;
+	int err;
 
 	/* TODO: why don't allow to delete subtree ? references ? does
 	 * tc subsys guarantee us that in htb_destroy it holds no class
@@ -1261,11 +1599,28 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg,
 	if (cl->children || cl->filter_cnt)
 		return -EBUSY;
 
-	if (!cl->level && htb_parent_last_child(cl)) {
-		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+	if (!cl->level && htb_parent_last_child(cl))
+		last_child = 1;
+
+	if (q->offload) {
+		err = htb_destroy_class_offload(sch, cl, last_child, false,
+						extack);
+		if (err)
+			return err;
+	}
+
+	if (last_child) {
+		struct netdev_queue *dev_queue;
+
+		dev_queue = q->offload ? cl->leaf.q->dev_queue : sch->dev_queue;
+		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
 					  cl->parent->common.classid,
 					  NULL);
-		last_child = 1;
+		if (q->offload) {
+			if (new_q)
+				htb_set_lockdep_class_child(new_q);
+			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
+		}
 	}
 
 	sch_tree_lock(sch);
@@ -1286,7 +1641,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg,
 				  &q->hlevel[cl->level].wait_pq);
 
 	if (last_child)
-		htb_parent_to_leaf(q, cl, new_q);
+		htb_parent_to_leaf(sch, cl, new_q);
 
 	sch_tree_unlock(sch);
 
@@ -1301,9 +1656,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	int err = -EINVAL;
 	struct htb_sched *q = qdisc_priv(sch);
 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
+	struct tc_htb_qopt_offload offload_opt;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_HTB_MAX + 1];
 	struct Qdisc *parent_qdisc = NULL;
+	struct netdev_queue *dev_queue;
 	struct tc_htb_opt *hopt;
 	u64 rate64, ceil64;
 	int warn = 0;
@@ -1336,8 +1693,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
 					      NULL));
 
+	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
+	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
+
 	if (!cl) {		/* new class */
-		struct Qdisc *new_q;
+		struct net_device *dev = qdisc_dev(sch);
+		struct Qdisc *new_q, *old_q;
 		int prio;
 		struct {
 			struct nlattr		nla;
@@ -1380,11 +1741,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 						NULL,
 						qdisc_root_sleeping_running(sch),
 						tca[TCA_RATE] ? : &est.nla);
-			if (err) {
-				tcf_block_put(cl->block);
-				kfree(cl);
-				goto failure;
-			}
+			if (err)
+				goto err_block_put;
 		}
 
 		cl->children = 0;
@@ -1393,12 +1751,74 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
 			RB_CLEAR_NODE(&cl->node[prio]);
 
+		cl->common.classid = classid;
+
+		/* Make sure nothing interrupts us in between of two
+		 * ndo_setup_tc calls.
+		 */
+		ASSERT_RTNL();
+
 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
 		 * so that can't be used inside of sch_tree_lock
 		 * -- thanks to Karlis Peisenieks
 		 */
-		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+		if (!q->offload) {
+			dev_queue = sch->dev_queue;
+		} else if (!(parent && !parent->level)) {
+			/* Assign a dev_queue to this classid. */
+			offload_opt = (struct tc_htb_qopt_offload) {
+				.command = TC_HTB_LEAF_ALLOC_QUEUE,
+				.classid = cl->common.classid,
+				.parent_classid = parent ?
+					TC_H_MIN(parent->common.classid) :
+					TC_HTB_CLASSID_ROOT,
+				.rate = max_t(u64, hopt->rate.rate, rate64),
+				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
+				.extack = extack,
+			};
+			err = htb_offload(dev, &offload_opt);
+			if (err) {
+				pr_err("htb: TC_HTB_LEAF_ALLOC_QUEUE failed with err = %d\n",
+				       err);
+				goto err_kill_estimator;
+			}
+			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
+		} else { /* First child. */
+			dev_queue = parent->leaf.q->dev_queue;
+			old_q = htb_graft_helper(dev_queue, NULL);
+			WARN_ON(old_q != parent->leaf.q);
+			offload_opt = (struct tc_htb_qopt_offload) {
+				.command = TC_HTB_LEAF_TO_INNER,
+				.classid = cl->common.classid,
+				.parent_classid =
+					TC_H_MIN(parent->common.classid),
+				.rate = max_t(u64, hopt->rate.rate, rate64),
+				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
+				.extack = extack,
+			};
+			err = htb_offload(dev, &offload_opt);
+			if (err) {
+				pr_err("htb: TC_HTB_LEAF_TO_INNER failed with err = %d\n",
+				       err);
+				htb_graft_helper(dev_queue, old_q);
+				goto err_kill_estimator;
+			}
+			qdisc_put(old_q);
+		}
+		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
 					  classid, NULL);
+		if (q->offload) {
+			if (new_q) {
+				htb_set_lockdep_class_child(new_q);
+				/* One ref for cl->leaf.q, the other for
+				 * dev_queue->qdisc.
+				 */
+				qdisc_refcount_inc(new_q);
+			}
+			old_q = htb_graft_helper(dev_queue, new_q);
+			/* No qdisc_put needed. */
+			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
+		}
 		sch_tree_lock(sch);
 		if (parent && !parent->level) {
 			/* turn parent into inner node */
@@ -1416,10 +1836,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 					 : TC_HTB_MAXDEPTH) - 1;
 			memset(&parent->inner, 0, sizeof(parent->inner));
 		}
+
 		/* leaf (we) needs elementary qdisc */
 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
 
-		cl->common.classid = classid;
 		cl->parent = parent;
 
 		/* set class to be in HTB_CAN_SEND state */
@@ -1445,12 +1865,30 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 			if (err)
 				return err;
 		}
-		sch_tree_lock(sch);
-	}
 
-	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
+		if (q->offload) {
+			struct net_device *dev = qdisc_dev(sch);
+
+			offload_opt = (struct tc_htb_qopt_offload) {
+				.command = TC_HTB_NODE_MODIFY,
+				.classid = cl->common.classid,
+				.rate = max_t(u64, hopt->rate.rate, rate64),
+				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
+				.extack = extack,
+			};
+			err = htb_offload(dev, &offload_opt);
+			if (err)
+				/* Estimator was replaced, and rollback may fail
+				 * as well, so we don't try to recover it, and
+				 * the estimator won't work property with the
+				 * offload anyway, because bstats are updated
+				 * only when the stats are queried.
+				 */
+				return err;
+		}
 
-	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
+		sch_tree_lock(sch);
+	}
 
 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
@@ -1493,6 +1931,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	*arg = (unsigned long)cl;
 	return 0;
 
+err_kill_estimator:
+	gen_kill_estimator(&cl->rate_est);
+err_block_put:
+	tcf_block_put(cl->block);
+	kfree(cl);
 failure:
 	return err;
 }
@@ -1558,6 +2001,7 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 }
 
 static const struct Qdisc_class_ops htb_class_ops = {
+	.select_queue	=	htb_select_queue,
 	.graft		=	htb_graft,
 	.leaf		=	htb_leaf,
 	.qlen_notify	=	htb_qlen_notify,
@@ -1580,6 +2024,7 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
 	.dequeue	=	htb_dequeue,
 	.peek		=	qdisc_peek_dequeued,
 	.init		=	htb_init,
+	.attach		=	htb_attach,
 	.reset		=	htb_reset,
 	.destroy	=	htb_destroy,
 	.dump		=	htb_dump,
diff --git a/tools/include/uapi/linux/pkt_sched.h b/tools/include/uapi/linux/pkt_sched.h
index 0d18b1d1fbbc..5c903abc9fa5 100644
--- a/tools/include/uapi/linux/pkt_sched.h
+++ b/tools/include/uapi/linux/pkt_sched.h
@@ -414,6 +414,7 @@ enum {
 	TCA_HTB_RATE64,
 	TCA_HTB_CEIL64,
 	TCA_HTB_PAD,
+	TCA_HTB_OFFLOAD,
 	__TCA_HTB_MAX,
 };
 
-- 
cgit v1.2.3


From 214baf22870cfa437522f3bd4fbae56338674b04 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Tue, 19 Jan 2021 14:08:15 +0200
Subject: net/mlx5e: Support HTB offload

This commit adds support for HTB offload in the mlx5e driver.

Performance:

  NIC: Mellanox ConnectX-6 Dx
  CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz (24 cores with HT)

  100 Gbit/s line rate, 500 UDP streams @ ~200 Mbit/s each
  48 traffic classes, flower used for steering
  No shaping (rate limits set to 4 Gbit/s per TC) - checking for max
  throughput.

  Baseline: 98.7 Gbps, 8.25 Mpps
  HTB: 6.7 Gbps, 0.56 Mpps
  HTB offload: 95.6 Gbps, 8.00 Mpps

Limitations:

1. 256 leaf nodes, 3 levels of depth.

2. Granularity for ceil is 1 Mbit/s. Rates are converted to weights, and
the bandwidth is split among the siblings according to these weights.
Other parameters for classes are not supported.

Ethtool statistics support for QoS SQs are also added. The counters are
called qos_txN_*, where N is the QoS queue number (starting from 0, the
numeration is separate from the normal SQs), and * is the counter name
(the counters are the same as for the normal SQs).

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  27 +-
 .../net/ethernet/mellanox/mlx5/core/en/params.h    |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/qos.c   | 984 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en/qos.h   |  44 +
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  21 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 176 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 100 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    |  47 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |  26 +
 drivers/net/ethernet/mellanox/mlx5/core/qos.c      |  85 ++
 drivers/net/ethernet/mellanox/mlx5/core/qos.h      |  30 +
 include/linux/mlx5/mlx5_ifc.h                      |  13 +-
 15 files changed, 1516 insertions(+), 49 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/qos.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/qos.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/qos.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 134bd038ae8a..fcfc0b114985 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -16,7 +16,8 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \
 		fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
 		lib/devcom.o lib/pci_vsc.o lib/dm.o diag/fs_tracepoint.o \
-		diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o fw_reset.o
+		diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o \
+		fw_reset.o qos.o
 
 #
 # Netdev basic
@@ -25,7 +26,8 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
 		en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
 		en_selftest.o en/port.o en/monitor_stats.o en/health.o \
 		en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \
-		en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o
+		en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \
+		en/qos.o
 
 #
 # Netdev extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 055baf3b6cb1..26e578a973e5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -55,6 +55,7 @@
 #include "en_stats.h"
 #include "en/dcbnl.h"
 #include "en/fs.h"
+#include "en/qos.h"
 #include "lib/hv_vhca.h"
 
 extern const struct net_device_ops mlx5e_netdev_ops;
@@ -161,6 +162,9 @@ do {                                                            \
 			    ##__VA_ARGS__);                     \
 } while (0)
 
+#define mlx5e_state_dereference(priv, p) \
+	rcu_dereference_protected((p), lockdep_is_held(&(priv)->state_lock))
+
 enum mlx5e_rq_group {
 	MLX5E_RQ_GROUP_REGULAR,
 	MLX5E_RQ_GROUP_XSK,
@@ -663,11 +667,13 @@ struct mlx5e_channel {
 	struct mlx5e_xdpsq         rq_xdpsq;
 	struct mlx5e_txqsq         sq[MLX5E_MAX_NUM_TC];
 	struct mlx5e_icosq         icosq;   /* internal control operations */
+	struct mlx5e_txqsq __rcu * __rcu *qos_sqs;
 	bool                       xdp;
 	struct napi_struct         napi;
 	struct device             *pdev;
 	struct net_device         *netdev;
 	__be32                     mkey_be;
+	u16                        qos_sqs_size;
 	u8                         num_tc;
 	u8                         lag_port;
 
@@ -756,6 +762,8 @@ struct mlx5e_modify_sq_param {
 	int next_state;
 	int rl_update;
 	int rl_index;
+	bool qos_update;
+	u16 qos_queue_group_id;
 };
 
 #if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
@@ -788,10 +796,20 @@ struct mlx5e_scratchpad {
 	cpumask_var_t cpumask;
 };
 
+struct mlx5e_htb {
+	DECLARE_HASHTABLE(qos_tc2node, order_base_2(MLX5E_QOS_MAX_LEAF_NODES));
+	DECLARE_BITMAP(qos_used_qids, MLX5E_QOS_MAX_LEAF_NODES);
+	struct mlx5e_sq_stats **qos_sq_stats;
+	u16 max_qos_sqs;
+	u16 maj_id;
+	u16 defcls;
+};
+
 struct mlx5e_priv {
 	/* priv data path fields - start */
 	/* +1 for port ptp ts */
-	struct mlx5e_txqsq *txq2sq[(MLX5E_MAX_NUM_CHANNELS + 1) * MLX5E_MAX_NUM_TC];
+	struct mlx5e_txqsq *txq2sq[(MLX5E_MAX_NUM_CHANNELS + 1) * MLX5E_MAX_NUM_TC +
+				   MLX5E_QOS_MAX_LEAF_NODES];
 	int channel_tc2realtxq[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC];
 	int port_ptp_tc2realtxq[MLX5E_MAX_NUM_TC];
 #ifdef CONFIG_MLX5_CORE_EN_DCB
@@ -859,6 +877,7 @@ struct mlx5e_priv {
 	struct mlx5e_hv_vhca_stats_agent stats_agent;
 #endif
 	struct mlx5e_scratchpad    scratchpad;
+	struct mlx5e_htb           htb;
 };
 
 struct mlx5e_rx_handlers {
@@ -986,6 +1005,7 @@ int mlx5e_safe_switch_channels(struct mlx5e_priv *priv,
 			       struct mlx5e_channels *new_chs,
 			       mlx5e_fp_preactivate preactivate,
 			       void *context);
+int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv);
 int mlx5e_num_channels_changed(struct mlx5e_priv *priv);
 int mlx5e_num_channels_changed_ctx(struct mlx5e_priv *priv, void *context);
 void mlx5e_activate_priv_channels(struct mlx5e_priv *priv);
@@ -1010,6 +1030,9 @@ void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq);
 
 int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
 		    struct mlx5e_modify_sq_param *p);
+int mlx5e_open_txqsq(struct mlx5e_channel *c, u32 tisn, int txq_ix,
+		     struct mlx5e_params *params, struct mlx5e_sq_param *param,
+		     struct mlx5e_txqsq *sq, int tc, u16 qos_queue_group_id, u16 qos_qid);
 void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq);
 void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq);
 void mlx5e_free_txqsq(struct mlx5e_txqsq *sq);
@@ -1020,8 +1043,10 @@ struct mlx5e_create_sq_param;
 int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev,
 			struct mlx5e_sq_param *param,
 			struct mlx5e_create_sq_param *csp,
+			u16 qos_queue_group_id,
 			u32 *sqn);
 void mlx5e_tx_err_cqe_work(struct work_struct *recover_work);
+void mlx5e_close_txqsq(struct mlx5e_txqsq *sq);
 
 static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
index 807147d97a0f..ea2cfb04b31a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
@@ -118,6 +118,8 @@ void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 			  struct mlx5e_rq_param *param);
 void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
 				 struct mlx5e_sq_param *param);
+void mlx5e_build_sq_param(struct mlx5e_priv *priv, struct mlx5e_params *params,
+			  struct mlx5e_sq_param *param);
 void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
 			     struct mlx5e_params *params,
 			     struct mlx5e_xsk_param *xsk,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
index 2a2bac30daaa..eeddd1137dda 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
@@ -261,7 +261,7 @@ static int mlx5e_ptp_open_txqsq(struct mlx5e_port_ptp *c, u32 tisn,
 	csp.min_inline_mode = txqsq->min_inline_mode;
 	csp.ts_cqe_to_dest_cqn = ptpsq->ts_cq.mcq.cqn;
 
-	err = mlx5e_create_sq_rdy(c->mdev, sqp, &csp, &txqsq->sqn);
+	err = mlx5e_create_sq_rdy(c->mdev, sqp, &csp, 0, &txqsq->sqn);
 	if (err)
 		goto err_free_txqsq;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
new file mode 100644
index 000000000000..12d7ad061237
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c
@@ -0,0 +1,984 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */
+
+#include "en.h"
+#include "params.h"
+#include "../qos.h"
+
+#define BYTES_IN_MBIT 125000
+
+int mlx5e_qos_max_leaf_nodes(struct mlx5_core_dev *mdev)
+{
+	return min(MLX5E_QOS_MAX_LEAF_NODES, mlx5_qos_max_leaf_nodes(mdev));
+}
+
+int mlx5e_qos_cur_leaf_nodes(struct mlx5e_priv *priv)
+{
+	int last = find_last_bit(priv->htb.qos_used_qids, mlx5e_qos_max_leaf_nodes(priv->mdev));
+
+	return last == mlx5e_qos_max_leaf_nodes(priv->mdev) ? 0 : last + 1;
+}
+
+/* Software representation of the QoS tree (internal to this file) */
+
+static int mlx5e_find_unused_qos_qid(struct mlx5e_priv *priv)
+{
+	int size = mlx5e_qos_max_leaf_nodes(priv->mdev);
+	int res;
+
+	WARN_ONCE(!mutex_is_locked(&priv->state_lock), "%s: state_lock is not held\n", __func__);
+	res = find_first_zero_bit(priv->htb.qos_used_qids, size);
+
+	return res == size ? -ENOSPC : res;
+}
+
+struct mlx5e_qos_node {
+	struct hlist_node hnode;
+	struct rcu_head rcu;
+	struct mlx5e_qos_node *parent;
+	u64 rate;
+	u32 bw_share;
+	u32 max_average_bw;
+	u32 hw_id;
+	u32 classid; /* 16-bit, except root. */
+	u16 qid;
+};
+
+#define MLX5E_QOS_QID_INNER 0xffff
+#define MLX5E_HTB_CLASSID_ROOT 0xffffffff
+
+static struct mlx5e_qos_node *
+mlx5e_sw_node_create_leaf(struct mlx5e_priv *priv, u16 classid, u16 qid,
+			  struct mlx5e_qos_node *parent)
+{
+	struct mlx5e_qos_node *node;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+
+	node->parent = parent;
+
+	node->qid = qid;
+	__set_bit(qid, priv->htb.qos_used_qids);
+
+	node->classid = classid;
+	hash_add_rcu(priv->htb.qos_tc2node, &node->hnode, classid);
+
+	mlx5e_update_tx_netdev_queues(priv);
+
+	return node;
+}
+
+static struct mlx5e_qos_node *mlx5e_sw_node_create_root(struct mlx5e_priv *priv)
+{
+	struct mlx5e_qos_node *node;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+
+	node->qid = MLX5E_QOS_QID_INNER;
+	node->classid = MLX5E_HTB_CLASSID_ROOT;
+	hash_add_rcu(priv->htb.qos_tc2node, &node->hnode, node->classid);
+
+	return node;
+}
+
+static struct mlx5e_qos_node *mlx5e_sw_node_find(struct mlx5e_priv *priv, u32 classid)
+{
+	struct mlx5e_qos_node *node = NULL;
+
+	hash_for_each_possible(priv->htb.qos_tc2node, node, hnode, classid) {
+		if (node->classid == classid)
+			break;
+	}
+
+	return node;
+}
+
+static struct mlx5e_qos_node *mlx5e_sw_node_find_rcu(struct mlx5e_priv *priv, u32 classid)
+{
+	struct mlx5e_qos_node *node = NULL;
+
+	hash_for_each_possible_rcu(priv->htb.qos_tc2node, node, hnode, classid) {
+		if (node->classid == classid)
+			break;
+	}
+
+	return node;
+}
+
+static void mlx5e_sw_node_delete(struct mlx5e_priv *priv, struct mlx5e_qos_node *node)
+{
+	hash_del_rcu(&node->hnode);
+	if (node->qid != MLX5E_QOS_QID_INNER) {
+		__clear_bit(node->qid, priv->htb.qos_used_qids);
+		mlx5e_update_tx_netdev_queues(priv);
+	}
+	kfree_rcu(node, rcu);
+}
+
+/* TX datapath API */
+
+static u16 mlx5e_qid_from_qos(struct mlx5e_channels *chs, u16 qid)
+{
+	/* These channel params are safe to access from the datapath, because:
+	 * 1. This function is called only after checking priv->htb.maj_id != 0,
+	 *    and the number of queues can't change while HTB offload is active.
+	 * 2. When priv->htb.maj_id becomes 0, synchronize_rcu waits for
+	 *    mlx5e_select_queue to finish while holding priv->state_lock,
+	 *    preventing other code from changing the number of queues.
+	 */
+	bool is_ptp = MLX5E_GET_PFLAG(&chs->params, MLX5E_PFLAG_TX_PORT_TS);
+
+	return (chs->params.num_channels + is_ptp) * chs->params.num_tc + qid;
+}
+
+int mlx5e_get_txq_by_classid(struct mlx5e_priv *priv, u16 classid)
+{
+	struct mlx5e_qos_node *node;
+	u16 qid;
+	int res;
+
+	rcu_read_lock();
+
+	node = mlx5e_sw_node_find_rcu(priv, classid);
+	if (!node) {
+		res = -ENOENT;
+		goto out;
+	}
+	qid = READ_ONCE(node->qid);
+	if (qid == MLX5E_QOS_QID_INNER) {
+		res = -EINVAL;
+		goto out;
+	}
+	res = mlx5e_qid_from_qos(&priv->channels, qid);
+
+out:
+	rcu_read_unlock();
+	return res;
+}
+
+static struct mlx5e_txqsq *mlx5e_get_qos_sq(struct mlx5e_priv *priv, int qid)
+{
+	struct mlx5e_params *params = &priv->channels.params;
+	struct mlx5e_txqsq __rcu **qos_sqs;
+	struct mlx5e_channel *c;
+	int ix;
+
+	ix = qid % params->num_channels;
+	qid /= params->num_channels;
+	c = priv->channels.c[ix];
+
+	qos_sqs = mlx5e_state_dereference(priv, c->qos_sqs);
+	return mlx5e_state_dereference(priv, qos_sqs[qid]);
+}
+
+/* SQ lifecycle */
+
+static int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs,
+			     struct mlx5e_qos_node *node)
+{
+	struct mlx5e_create_cq_param ccp = {};
+	struct mlx5e_txqsq __rcu **qos_sqs;
+	struct mlx5e_sq_param param_sq;
+	struct mlx5e_cq_param param_cq;
+	int txq_ix, ix, qid, err = 0;
+	struct mlx5e_params *params;
+	struct mlx5e_channel *c;
+	struct mlx5e_txqsq *sq;
+
+	params = &chs->params;
+
+	txq_ix = mlx5e_qid_from_qos(chs, node->qid);
+
+	WARN_ON(node->qid > priv->htb.max_qos_sqs);
+	if (node->qid == priv->htb.max_qos_sqs) {
+		struct mlx5e_sq_stats *stats, **stats_list = NULL;
+
+		if (priv->htb.max_qos_sqs == 0) {
+			stats_list = kvcalloc(mlx5e_qos_max_leaf_nodes(priv->mdev),
+					      sizeof(*stats_list),
+					      GFP_KERNEL);
+			if (!stats_list)
+				return -ENOMEM;
+		}
+		stats = kzalloc(sizeof(*stats), GFP_KERNEL);
+		if (!stats) {
+			kvfree(stats_list);
+			return -ENOMEM;
+		}
+		if (stats_list)
+			WRITE_ONCE(priv->htb.qos_sq_stats, stats_list);
+		WRITE_ONCE(priv->htb.qos_sq_stats[node->qid], stats);
+		/* Order max_qos_sqs increment after writing the array pointer.
+		 * Pairs with smp_load_acquire in en_stats.c.
+		 */
+		smp_store_release(&priv->htb.max_qos_sqs, priv->htb.max_qos_sqs + 1);
+	}
+
+	ix = node->qid % params->num_channels;
+	qid = node->qid / params->num_channels;
+	c = chs->c[ix];
+
+	qos_sqs = mlx5e_state_dereference(priv, c->qos_sqs);
+	sq = kzalloc(sizeof(*sq), GFP_KERNEL);
+
+	if (!sq)
+		return -ENOMEM;
+
+	mlx5e_build_create_cq_param(&ccp, c);
+
+	memset(&param_sq, 0, sizeof(param_sq));
+	memset(&param_cq, 0, sizeof(param_cq));
+	mlx5e_build_sq_param(priv, params, &param_sq);
+	mlx5e_build_tx_cq_param(priv, params, &param_cq);
+	err = mlx5e_open_cq(priv, params->tx_cq_moderation, &param_cq, &ccp, &sq->cq);
+	if (err)
+		goto err_free_sq;
+	err = mlx5e_open_txqsq(c, priv->tisn[c->lag_port][0], txq_ix, params,
+			       &param_sq, sq, 0, node->hw_id, node->qid);
+	if (err)
+		goto err_close_cq;
+
+	rcu_assign_pointer(qos_sqs[qid], sq);
+
+	return 0;
+
+err_close_cq:
+	mlx5e_close_cq(&sq->cq);
+err_free_sq:
+	kfree(sq);
+	return err;
+}
+
+static void mlx5e_activate_qos_sq(struct mlx5e_priv *priv, struct mlx5e_qos_node *node)
+{
+	struct mlx5e_txqsq *sq;
+
+	sq = mlx5e_get_qos_sq(priv, node->qid);
+
+	WRITE_ONCE(priv->txq2sq[mlx5e_qid_from_qos(&priv->channels, node->qid)], sq);
+
+	/* Make the change to txq2sq visible before the queue is started.
+	 * As mlx5e_xmit runs under a spinlock, there is an implicit ACQUIRE,
+	 * which pairs with this barrier.
+	 */
+	smp_wmb();
+
+	qos_dbg(priv->mdev, "Activate QoS SQ qid %u\n", node->qid);
+	mlx5e_activate_txqsq(sq);
+}
+
+static void mlx5e_deactivate_qos_sq(struct mlx5e_priv *priv, u16 qid)
+{
+	struct mlx5e_txqsq *sq;
+
+	sq = mlx5e_get_qos_sq(priv, qid);
+	if (!sq) /* Handle the case when the SQ failed to open. */
+		return;
+
+	qos_dbg(priv->mdev, "Deactivate QoS SQ qid %u\n", qid);
+	mlx5e_deactivate_txqsq(sq);
+
+	/* The queue is disabled, no synchronization with datapath is needed. */
+	priv->txq2sq[mlx5e_qid_from_qos(&priv->channels, qid)] = NULL;
+}
+
+static void mlx5e_close_qos_sq(struct mlx5e_priv *priv, u16 qid)
+{
+	struct mlx5e_txqsq __rcu **qos_sqs;
+	struct mlx5e_params *params;
+	struct mlx5e_channel *c;
+	struct mlx5e_txqsq *sq;
+	int ix;
+
+	params = &priv->channels.params;
+
+	ix = qid % params->num_channels;
+	qid /= params->num_channels;
+	c = priv->channels.c[ix];
+	qos_sqs = mlx5e_state_dereference(priv, c->qos_sqs);
+	sq = rcu_replace_pointer(qos_sqs[qid], NULL, lockdep_is_held(&priv->state_lock));
+	if (!sq) /* Handle the case when the SQ failed to open. */
+		return;
+
+	synchronize_rcu(); /* Sync with NAPI. */
+
+	mlx5e_close_txqsq(sq);
+	mlx5e_close_cq(&sq->cq);
+	kfree(sq);
+}
+
+void mlx5e_qos_close_queues(struct mlx5e_channel *c)
+{
+	struct mlx5e_txqsq __rcu **qos_sqs;
+	int i;
+
+	qos_sqs = rcu_replace_pointer(c->qos_sqs, NULL, lockdep_is_held(&c->priv->state_lock));
+	if (!qos_sqs)
+		return;
+	synchronize_rcu(); /* Sync with NAPI. */
+
+	for (i = 0; i < c->qos_sqs_size; i++) {
+		struct mlx5e_txqsq *sq;
+
+		sq = mlx5e_state_dereference(c->priv, qos_sqs[i]);
+		if (!sq) /* Handle the case when the SQ failed to open. */
+			continue;
+
+		mlx5e_close_txqsq(sq);
+		mlx5e_close_cq(&sq->cq);
+		kfree(sq);
+	}
+
+	kvfree(qos_sqs);
+}
+
+static void mlx5e_qos_close_all_queues(struct mlx5e_channels *chs)
+{
+	int i;
+
+	for (i = 0; i < chs->num; i++)
+		mlx5e_qos_close_queues(chs->c[i]);
+}
+
+static int mlx5e_qos_alloc_queues(struct mlx5e_priv *priv, struct mlx5e_channels *chs)
+{
+	u16 qos_sqs_size;
+	int i;
+
+	qos_sqs_size = DIV_ROUND_UP(mlx5e_qos_max_leaf_nodes(priv->mdev), chs->num);
+
+	for (i = 0; i < chs->num; i++) {
+		struct mlx5e_txqsq **sqs;
+
+		sqs = kvcalloc(qos_sqs_size, sizeof(struct mlx5e_txqsq *), GFP_KERNEL);
+		if (!sqs)
+			goto err_free;
+
+		WRITE_ONCE(chs->c[i]->qos_sqs_size, qos_sqs_size);
+		smp_wmb(); /* Pairs with mlx5e_napi_poll. */
+		rcu_assign_pointer(chs->c[i]->qos_sqs, sqs);
+	}
+
+	return 0;
+
+err_free:
+	while (--i >= 0) {
+		struct mlx5e_txqsq **sqs;
+
+		sqs = rcu_replace_pointer(chs->c[i]->qos_sqs, NULL,
+					  lockdep_is_held(&priv->state_lock));
+
+		synchronize_rcu(); /* Sync with NAPI. */
+		kvfree(sqs);
+	}
+	return -ENOMEM;
+}
+
+int mlx5e_qos_open_queues(struct mlx5e_priv *priv, struct mlx5e_channels *chs)
+{
+	struct mlx5e_qos_node *node = NULL;
+	int bkt, err;
+
+	if (!priv->htb.maj_id)
+		return 0;
+
+	err = mlx5e_qos_alloc_queues(priv, chs);
+	if (err)
+		return err;
+
+	hash_for_each(priv->htb.qos_tc2node, bkt, node, hnode) {
+		if (node->qid == MLX5E_QOS_QID_INNER)
+			continue;
+		err = mlx5e_open_qos_sq(priv, chs, node);
+		if (err) {
+			mlx5e_qos_close_all_queues(chs);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+void mlx5e_qos_activate_queues(struct mlx5e_priv *priv)
+{
+	struct mlx5e_qos_node *node = NULL;
+	int bkt;
+
+	hash_for_each(priv->htb.qos_tc2node, bkt, node, hnode) {
+		if (node->qid == MLX5E_QOS_QID_INNER)
+			continue;
+		mlx5e_activate_qos_sq(priv, node);
+	}
+}
+
+void mlx5e_qos_deactivate_queues(struct mlx5e_channel *c)
+{
+	struct mlx5e_params *params = &c->priv->channels.params;
+	struct mlx5e_txqsq __rcu **qos_sqs;
+	int i;
+
+	qos_sqs = mlx5e_state_dereference(c->priv, c->qos_sqs);
+	if (!qos_sqs)
+		return;
+
+	for (i = 0; i < c->qos_sqs_size; i++) {
+		u16 qid = params->num_channels * i + c->ix;
+		struct mlx5e_txqsq *sq;
+
+		sq = mlx5e_state_dereference(c->priv, qos_sqs[i]);
+		if (!sq) /* Handle the case when the SQ failed to open. */
+			continue;
+
+		qos_dbg(c->mdev, "Deactivate QoS SQ qid %u\n", qid);
+		mlx5e_deactivate_txqsq(sq);
+
+		/* The queue is disabled, no synchronization with datapath is needed. */
+		c->priv->txq2sq[mlx5e_qid_from_qos(&c->priv->channels, qid)] = NULL;
+	}
+}
+
+static void mlx5e_qos_deactivate_all_queues(struct mlx5e_channels *chs)
+{
+	int i;
+
+	for (i = 0; i < chs->num; i++)
+		mlx5e_qos_deactivate_queues(chs->c[i]);
+}
+
+/* HTB API */
+
+int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls,
+		       struct netlink_ext_ack *extack)
+{
+	struct mlx5e_qos_node *root;
+	bool opened;
+	int err;
+
+	qos_dbg(priv->mdev, "TC_HTB_CREATE handle %04x:, default :%04x\n", htb_maj_id, htb_defcls);
+
+	if (!mlx5_qos_is_supported(priv->mdev)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing QoS capabilities. Try disabling SRIOV or use a supported device.");
+		return -EOPNOTSUPP;
+	}
+
+	opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
+	if (opened) {
+		err = mlx5e_qos_alloc_queues(priv, &priv->channels);
+		if (err)
+			return err;
+	}
+
+	root = mlx5e_sw_node_create_root(priv);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto err_free_queues;
+	}
+
+	err = mlx5_qos_create_root_node(priv->mdev, &root->hw_id);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Firmware error. Try upgrading firmware.");
+		goto err_sw_node_delete;
+	}
+
+	WRITE_ONCE(priv->htb.defcls, htb_defcls);
+	/* Order maj_id after defcls - pairs with
+	 * mlx5e_select_queue/mlx5e_select_htb_queues.
+	 */
+	smp_store_release(&priv->htb.maj_id, htb_maj_id);
+
+	return 0;
+
+err_sw_node_delete:
+	mlx5e_sw_node_delete(priv, root);
+
+err_free_queues:
+	if (opened)
+		mlx5e_qos_close_all_queues(&priv->channels);
+	return err;
+}
+
+int mlx5e_htb_root_del(struct mlx5e_priv *priv)
+{
+	struct mlx5e_qos_node *root;
+	int err;
+
+	qos_dbg(priv->mdev, "TC_HTB_DESTROY\n");
+
+	WRITE_ONCE(priv->htb.maj_id, 0);
+	synchronize_rcu(); /* Sync with mlx5e_select_htb_queue and TX data path. */
+
+	root = mlx5e_sw_node_find(priv, MLX5E_HTB_CLASSID_ROOT);
+	if (!root) {
+		qos_err(priv->mdev, "Failed to find the root node in the QoS tree\n");
+		return -ENOENT;
+	}
+	err = mlx5_qos_destroy_node(priv->mdev, root->hw_id);
+	if (err)
+		qos_err(priv->mdev, "Failed to destroy root node %u, err = %d\n",
+			root->hw_id, err);
+	mlx5e_sw_node_delete(priv, root);
+
+	mlx5e_qos_deactivate_all_queues(&priv->channels);
+	mlx5e_qos_close_all_queues(&priv->channels);
+
+	return err;
+}
+
+static int mlx5e_htb_convert_rate(struct mlx5e_priv *priv, u64 rate,
+				  struct mlx5e_qos_node *parent, u32 *bw_share)
+{
+	u64 share = 0;
+
+	while (parent->classid != MLX5E_HTB_CLASSID_ROOT && !parent->max_average_bw)
+		parent = parent->parent;
+
+	if (parent->max_average_bw)
+		share = div64_u64(div_u64(rate * 100, BYTES_IN_MBIT),
+				  parent->max_average_bw);
+	else
+		share = 101;
+
+	*bw_share = share == 0 ? 1 : share > 100 ? 0 : share;
+
+	qos_dbg(priv->mdev, "Convert: rate %llu, parent ceil %llu -> bw_share %u\n",
+		rate, (u64)parent->max_average_bw * BYTES_IN_MBIT, *bw_share);
+
+	return 0;
+}
+
+static void mlx5e_htb_convert_ceil(struct mlx5e_priv *priv, u64 ceil, u32 *max_average_bw)
+{
+	*max_average_bw = div_u64(ceil, BYTES_IN_MBIT);
+
+	qos_dbg(priv->mdev, "Convert: ceil %llu -> max_average_bw %u\n",
+		ceil, *max_average_bw);
+}
+
+int mlx5e_htb_leaf_alloc_queue(struct mlx5e_priv *priv, u16 classid,
+			       u32 parent_classid, u64 rate, u64 ceil,
+			       struct netlink_ext_ack *extack)
+{
+	struct mlx5e_qos_node *node, *parent;
+	int qid;
+	int err;
+
+	qos_dbg(priv->mdev, "TC_HTB_LEAF_ALLOC_QUEUE classid %04x, parent %04x, rate %llu, ceil %llu\n",
+		classid, parent_classid, rate, ceil);
+
+	qid = mlx5e_find_unused_qos_qid(priv);
+	if (qid < 0) {
+		NL_SET_ERR_MSG_MOD(extack, "Maximum amount of leaf classes is reached.");
+		return qid;
+	}
+
+	parent = mlx5e_sw_node_find(priv, parent_classid);
+	if (!parent)
+		return -EINVAL;
+
+	node = mlx5e_sw_node_create_leaf(priv, classid, qid, parent);
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+
+	node->rate = rate;
+	mlx5e_htb_convert_rate(priv, rate, node->parent, &node->bw_share);
+	mlx5e_htb_convert_ceil(priv, ceil, &node->max_average_bw);
+
+	err = mlx5_qos_create_leaf_node(priv->mdev, node->parent->hw_id,
+					node->bw_share, node->max_average_bw,
+					&node->hw_id);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating a leaf node.");
+		qos_err(priv->mdev, "Failed to create a leaf node (class %04x), err = %d\n",
+			classid, err);
+		mlx5e_sw_node_delete(priv, node);
+		return err;
+	}
+
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		err = mlx5e_open_qos_sq(priv, &priv->channels, node);
+		if (err) {
+			NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ.");
+			qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x), err = %d\n",
+				 classid, err);
+		} else {
+			mlx5e_activate_qos_sq(priv, node);
+		}
+	}
+
+	return mlx5e_qid_from_qos(&priv->channels, node->qid);
+}
+
+int mlx5e_htb_leaf_to_inner(struct mlx5e_priv *priv, u16 classid, u16 child_classid,
+			    u64 rate, u64 ceil, struct netlink_ext_ack *extack)
+{
+	struct mlx5e_qos_node *node, *child;
+	int err, tmp_err;
+	u32 new_hw_id;
+	u16 qid;
+
+	qos_dbg(priv->mdev, "TC_HTB_LEAF_TO_INNER classid %04x, upcoming child %04x, rate %llu, ceil %llu\n",
+		classid, child_classid, rate, ceil);
+
+	node = mlx5e_sw_node_find(priv, classid);
+	if (!node)
+		return -ENOENT;
+
+	err = mlx5_qos_create_inner_node(priv->mdev, node->parent->hw_id,
+					 node->bw_share, node->max_average_bw,
+					 &new_hw_id);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating an inner node.");
+		qos_err(priv->mdev, "Failed to create an inner node (class %04x), err = %d\n",
+			classid, err);
+		return err;
+	}
+
+	/* Intentionally reuse the qid for the upcoming first child. */
+	child = mlx5e_sw_node_create_leaf(priv, child_classid, node->qid, node);
+	if (IS_ERR(child)) {
+		err = PTR_ERR(child);
+		goto err_destroy_hw_node;
+	}
+
+	child->rate = rate;
+	mlx5e_htb_convert_rate(priv, rate, node, &child->bw_share);
+	mlx5e_htb_convert_ceil(priv, ceil, &child->max_average_bw);
+
+	err = mlx5_qos_create_leaf_node(priv->mdev, new_hw_id, child->bw_share,
+					child->max_average_bw, &child->hw_id);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating a leaf node.");
+		qos_err(priv->mdev, "Failed to create a leaf node (class %04x), err = %d\n",
+			classid, err);
+		goto err_delete_sw_node;
+	}
+
+	/* No fail point. */
+
+	qid = node->qid;
+	/* Pairs with mlx5e_get_txq_by_classid. */
+	WRITE_ONCE(node->qid, MLX5E_QOS_QID_INNER);
+
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		mlx5e_deactivate_qos_sq(priv, qid);
+		mlx5e_close_qos_sq(priv, qid);
+	}
+
+	err = mlx5_qos_destroy_node(priv->mdev, node->hw_id);
+	if (err) /* Not fatal. */
+		qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n",
+			 node->hw_id, classid, err);
+
+	node->hw_id = new_hw_id;
+
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		err = mlx5e_open_qos_sq(priv, &priv->channels, child);
+		if (err) {
+			NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ.");
+			qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x), err = %d\n",
+				 classid, err);
+		} else {
+			mlx5e_activate_qos_sq(priv, child);
+		}
+	}
+
+	return 0;
+
+err_delete_sw_node:
+	child->qid = MLX5E_QOS_QID_INNER;
+	mlx5e_sw_node_delete(priv, child);
+
+err_destroy_hw_node:
+	tmp_err = mlx5_qos_destroy_node(priv->mdev, new_hw_id);
+	if (tmp_err) /* Not fatal. */
+		qos_warn(priv->mdev, "Failed to roll back creation of an inner node %u (class %04x), err = %d\n",
+			 new_hw_id, classid, tmp_err);
+	return err;
+}
+
+static struct mlx5e_qos_node *mlx5e_sw_node_find_by_qid(struct mlx5e_priv *priv, u16 qid)
+{
+	struct mlx5e_qos_node *node = NULL;
+	int bkt;
+
+	hash_for_each(priv->htb.qos_tc2node, bkt, node, hnode)
+		if (node->qid == qid)
+			break;
+
+	return node;
+}
+
+static void mlx5e_reactivate_qos_sq(struct mlx5e_priv *priv, u16 qid, struct netdev_queue *txq)
+{
+	qos_dbg(priv->mdev, "Reactivate QoS SQ qid %u\n", qid);
+	netdev_tx_reset_queue(txq);
+	netif_tx_start_queue(txq);
+}
+
+static void mlx5e_reset_qdisc(struct net_device *dev, u16 qid)
+{
+	struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, qid);
+	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+
+	if (!qdisc)
+		return;
+
+	spin_lock_bh(qdisc_lock(qdisc));
+	qdisc_reset(qdisc);
+	spin_unlock_bh(qdisc_lock(qdisc));
+}
+
+int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid,
+		       u16 *new_qid, struct netlink_ext_ack *extack)
+{
+	struct mlx5e_qos_node *node;
+	struct netdev_queue *txq;
+	u16 qid, moved_qid;
+	bool opened;
+	int err;
+
+	qos_dbg(priv->mdev, "TC_HTB_LEAF_DEL classid %04x\n", classid);
+
+	*old_qid = *new_qid = 0;
+
+	node = mlx5e_sw_node_find(priv, classid);
+	if (!node)
+		return -ENOENT;
+
+	/* Store qid for reuse. */
+	qid = node->qid;
+
+	opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
+	if (opened) {
+		txq = netdev_get_tx_queue(priv->netdev,
+					  mlx5e_qid_from_qos(&priv->channels, qid));
+		mlx5e_deactivate_qos_sq(priv, qid);
+		mlx5e_close_qos_sq(priv, qid);
+	}
+
+	err = mlx5_qos_destroy_node(priv->mdev, node->hw_id);
+	if (err) /* Not fatal. */
+		qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n",
+			 node->hw_id, classid, err);
+
+	mlx5e_sw_node_delete(priv, node);
+
+	moved_qid = mlx5e_qos_cur_leaf_nodes(priv);
+
+	if (moved_qid == 0) {
+		/* The last QoS SQ was just destroyed. */
+		if (opened)
+			mlx5e_reactivate_qos_sq(priv, qid, txq);
+		return 0;
+	}
+	moved_qid--;
+
+	if (moved_qid < qid) {
+		/* The highest QoS SQ was just destroyed. */
+		WARN(moved_qid != qid - 1, "Gaps in queue numeration: destroyed queue %u, the highest queue is %u",
+		     qid, moved_qid);
+		if (opened)
+			mlx5e_reactivate_qos_sq(priv, qid, txq);
+		return 0;
+	}
+
+	WARN(moved_qid == qid, "Can't move node with qid %u to itself", qid);
+	qos_dbg(priv->mdev, "Moving QoS SQ %u to %u\n", moved_qid, qid);
+
+	node = mlx5e_sw_node_find_by_qid(priv, moved_qid);
+	WARN(!node, "Could not find a node with qid %u to move to queue %u",
+	     moved_qid, qid);
+
+	/* Stop traffic to the old queue. */
+	WRITE_ONCE(node->qid, MLX5E_QOS_QID_INNER);
+	__clear_bit(moved_qid, priv->htb.qos_used_qids);
+
+	if (opened) {
+		txq = netdev_get_tx_queue(priv->netdev,
+					  mlx5e_qid_from_qos(&priv->channels, moved_qid));
+		mlx5e_deactivate_qos_sq(priv, moved_qid);
+		mlx5e_close_qos_sq(priv, moved_qid);
+	}
+
+	/* Prevent packets from the old class from getting into the new one. */
+	mlx5e_reset_qdisc(priv->netdev, moved_qid);
+
+	__set_bit(qid, priv->htb.qos_used_qids);
+	WRITE_ONCE(node->qid, qid);
+
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		err = mlx5e_open_qos_sq(priv, &priv->channels, node);
+		if (err) {
+			NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ.");
+			qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x) while moving qid %u to %u, err = %d\n",
+				 node->classid, moved_qid, qid, err);
+		} else {
+			mlx5e_activate_qos_sq(priv, node);
+		}
+	}
+
+	mlx5e_update_tx_netdev_queues(priv);
+	if (opened)
+		mlx5e_reactivate_qos_sq(priv, moved_qid, txq);
+
+	*old_qid = mlx5e_qid_from_qos(&priv->channels, moved_qid);
+	*new_qid = mlx5e_qid_from_qos(&priv->channels, qid);
+	return 0;
+}
+
+int mlx5e_htb_leaf_del_last(struct mlx5e_priv *priv, u16 classid, bool force,
+			    struct netlink_ext_ack *extack)
+{
+	struct mlx5e_qos_node *node, *parent;
+	u32 old_hw_id, new_hw_id;
+	int err, saved_err = 0;
+	u16 qid;
+
+	qos_dbg(priv->mdev, "TC_HTB_LEAF_DEL_LAST%s classid %04x\n",
+		force ? "_FORCE" : "", classid);
+
+	node = mlx5e_sw_node_find(priv, classid);
+	if (!node)
+		return -ENOENT;
+
+	err = mlx5_qos_create_leaf_node(priv->mdev, node->parent->parent->hw_id,
+					node->parent->bw_share,
+					node->parent->max_average_bw,
+					&new_hw_id);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating a leaf node.");
+		qos_err(priv->mdev, "Failed to create a leaf node (class %04x), err = %d\n",
+			classid, err);
+		if (!force)
+			return err;
+		saved_err = err;
+	}
+
+	/* Store qid for reuse and prevent clearing the bit. */
+	qid = node->qid;
+	/* Pairs with mlx5e_get_txq_by_classid. */
+	WRITE_ONCE(node->qid, MLX5E_QOS_QID_INNER);
+
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		mlx5e_deactivate_qos_sq(priv, qid);
+		mlx5e_close_qos_sq(priv, qid);
+	}
+
+	/* Prevent packets from the old class from getting into the new one. */
+	mlx5e_reset_qdisc(priv->netdev, qid);
+
+	err = mlx5_qos_destroy_node(priv->mdev, node->hw_id);
+	if (err) /* Not fatal. */
+		qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n",
+			 node->hw_id, classid, err);
+
+	parent = node->parent;
+	mlx5e_sw_node_delete(priv, node);
+
+	node = parent;
+	WRITE_ONCE(node->qid, qid);
+
+	/* Early return on error in force mode. Parent will still be an inner
+	 * node to be deleted by a following delete operation.
+	 */
+	if (saved_err)
+		return saved_err;
+
+	old_hw_id = node->hw_id;
+	node->hw_id = new_hw_id;
+
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		err = mlx5e_open_qos_sq(priv, &priv->channels, node);
+		if (err) {
+			NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ.");
+			qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x), err = %d\n",
+				 classid, err);
+		} else {
+			mlx5e_activate_qos_sq(priv, node);
+		}
+	}
+
+	err = mlx5_qos_destroy_node(priv->mdev, old_hw_id);
+	if (err) /* Not fatal. */
+		qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n",
+			 node->hw_id, classid, err);
+
+	return 0;
+}
+
+static int mlx5e_qos_update_children(struct mlx5e_priv *priv, struct mlx5e_qos_node *node,
+				     struct netlink_ext_ack *extack)
+{
+	struct mlx5e_qos_node *child;
+	int err = 0;
+	int bkt;
+
+	hash_for_each(priv->htb.qos_tc2node, bkt, child, hnode) {
+		u32 old_bw_share = child->bw_share;
+		int err_one;
+
+		if (child->parent != node)
+			continue;
+
+		mlx5e_htb_convert_rate(priv, child->rate, node, &child->bw_share);
+		if (child->bw_share == old_bw_share)
+			continue;
+
+		err_one = mlx5_qos_update_node(priv->mdev, child->hw_id, child->bw_share,
+					       child->max_average_bw, child->hw_id);
+		if (!err && err_one) {
+			err = err_one;
+
+			NL_SET_ERR_MSG_MOD(extack, "Firmware error when modifying a child node.");
+			qos_err(priv->mdev, "Failed to modify a child node (class %04x), err = %d\n",
+				node->classid, err);
+		}
+	}
+
+	return err;
+}
+
+int mlx5e_htb_node_modify(struct mlx5e_priv *priv, u16 classid, u64 rate, u64 ceil,
+			  struct netlink_ext_ack *extack)
+{
+	u32 bw_share, max_average_bw;
+	struct mlx5e_qos_node *node;
+	bool ceil_changed = false;
+	int err;
+
+	qos_dbg(priv->mdev, "TC_HTB_LEAF_MODIFY classid %04x, rate %llu, ceil %llu\n",
+		classid, rate, ceil);
+
+	node = mlx5e_sw_node_find(priv, classid);
+	if (!node)
+		return -ENOENT;
+
+	node->rate = rate;
+	mlx5e_htb_convert_rate(priv, rate, node->parent, &bw_share);
+	mlx5e_htb_convert_ceil(priv, ceil, &max_average_bw);
+
+	err = mlx5_qos_update_node(priv->mdev, node->parent->hw_id, bw_share,
+				   max_average_bw, node->hw_id);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Firmware error when modifying a node.");
+		qos_err(priv->mdev, "Failed to modify a node (class %04x), err = %d\n",
+			classid, err);
+		return err;
+	}
+
+	if (max_average_bw != node->max_average_bw)
+		ceil_changed = true;
+
+	node->bw_share = bw_share;
+	node->max_average_bw = max_average_bw;
+
+	if (ceil_changed)
+		err = mlx5e_qos_update_children(priv, node, extack);
+
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h
new file mode 100644
index 000000000000..5af7991fcd19
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */
+
+#ifndef __MLX5E_EN_QOS_H
+#define __MLX5E_EN_QOS_H
+
+#include <linux/mlx5/driver.h>
+
+#define MLX5E_QOS_MAX_LEAF_NODES 256
+
+struct mlx5e_priv;
+struct mlx5e_channels;
+struct mlx5e_channel;
+
+int mlx5e_qos_max_leaf_nodes(struct mlx5_core_dev *mdev);
+int mlx5e_qos_cur_leaf_nodes(struct mlx5e_priv *priv);
+
+/* TX datapath API */
+int mlx5e_get_txq_by_classid(struct mlx5e_priv *priv, u16 classid);
+struct mlx5e_txqsq *mlx5e_get_sq(struct mlx5e_priv *priv, int qid);
+
+/* SQ lifecycle */
+int mlx5e_qos_open_queues(struct mlx5e_priv *priv, struct mlx5e_channels *chs);
+void mlx5e_qos_activate_queues(struct mlx5e_priv *priv);
+void mlx5e_qos_deactivate_queues(struct mlx5e_channel *c);
+void mlx5e_qos_close_queues(struct mlx5e_channel *c);
+
+/* HTB API */
+int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls,
+		       struct netlink_ext_ack *extack);
+int mlx5e_htb_root_del(struct mlx5e_priv *priv);
+int mlx5e_htb_leaf_alloc_queue(struct mlx5e_priv *priv, u16 classid,
+			       u32 parent_classid, u64 rate, u64 ceil,
+			       struct netlink_ext_ack *extack);
+int mlx5e_htb_leaf_to_inner(struct mlx5e_priv *priv, u16 classid, u16 child_classid,
+			    u64 rate, u64 ceil, struct netlink_ext_ack *extack);
+int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid,
+		       u16 *new_qid, struct netlink_ext_ack *extack);
+int mlx5e_htb_leaf_del_last(struct mlx5e_priv *priv, u16 classid, bool force,
+			    struct netlink_ext_ack *extack);
+int mlx5e_htb_node_modify(struct mlx5e_priv *priv, u16 classid, u64 rate, u64 ceil,
+			  struct netlink_ext_ack *extack);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 2d37742a888c..2e5a0696374a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -447,6 +447,17 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv,
 		goto out;
 	}
 
+	/* Don't allow changing the number of channels if HTB offload is active,
+	 * because the numeration of the QoS SQs will change, while per-queue
+	 * qdiscs are attached.
+	 */
+	if (priv->htb.maj_id) {
+		err = -EINVAL;
+		netdev_err(priv->netdev, "%s: HTB offload is active, cannot change the number of channels\n",
+			   __func__);
+		goto out;
+	}
+
 	new_channels.params = priv->channels.params;
 	new_channels.params.num_channels = count;
 
@@ -1966,6 +1977,16 @@ static int set_pflag_tx_port_ts(struct net_device *netdev, bool enable)
 	if (!MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn))
 		return -EOPNOTSUPP;
 
+	/* Don't allow changing the PTP state if HTB offload is active, because
+	 * the numeration of the QoS SQs will change, while per-queue qdiscs are
+	 * attached.
+	 */
+	if (priv->htb.maj_id) {
+		netdev_err(priv->netdev, "%s: HTB offload is active, cannot change the PTP state\n",
+			   __func__);
+		return -EINVAL;
+	}
+
 	new_channels.params = priv->channels.params;
 	MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_TX_PORT_TS, enable);
 	/* No need to verify SQ stop room as
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index f33c38629886..b9a175982801 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -65,6 +65,7 @@
 #include "en/devlink.h"
 #include "lib/mlx5.h"
 #include "en/ptp.h"
+#include "qos.h"
 
 bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
 {
@@ -1143,7 +1144,6 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 	sq->min_inline_mode = params->tx_min_inline_mode;
 	sq->hw_mtu    = MLX5E_SW2HW_MTU(params, params->sw_mtu);
-	sq->stats     = &c->priv->channel_stats[c->ix].sq[tc];
 	INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work);
 	if (!MLX5_CAP_ETH(mdev, wqe_vlan_insert))
 		set_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state);
@@ -1233,6 +1233,7 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev,
 int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
 		    struct mlx5e_modify_sq_param *p)
 {
+	u64 bitmask = 0;
 	void *in;
 	void *sqc;
 	int inlen;
@@ -1248,9 +1249,14 @@ int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
 	MLX5_SET(modify_sq_in, in, sq_state, p->curr_state);
 	MLX5_SET(sqc, sqc, state, p->next_state);
 	if (p->rl_update && p->next_state == MLX5_SQC_STATE_RDY) {
-		MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
-		MLX5_SET(sqc,  sqc, packet_pacing_rate_limit_index, p->rl_index);
+		bitmask |= 1;
+		MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index);
 	}
+	if (p->qos_update && p->next_state == MLX5_SQC_STATE_RDY) {
+		bitmask |= 1 << 2;
+		MLX5_SET(sqc, sqc, qos_queue_group_id, p->qos_queue_group_id);
+	}
+	MLX5_SET64(modify_sq_in, in, modify_bitmask, bitmask);
 
 	err = mlx5_core_modify_sq(mdev, sqn, in);
 
@@ -1267,6 +1273,7 @@ static void mlx5e_destroy_sq(struct mlx5_core_dev *mdev, u32 sqn)
 int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev,
 			struct mlx5e_sq_param *param,
 			struct mlx5e_create_sq_param *csp,
+			u16 qos_queue_group_id,
 			u32 *sqn)
 {
 	struct mlx5e_modify_sq_param msp = {0};
@@ -1278,6 +1285,10 @@ int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev,
 
 	msp.curr_state = MLX5_SQC_STATE_RST;
 	msp.next_state = MLX5_SQC_STATE_RDY;
+	if (qos_queue_group_id) {
+		msp.qos_update = true;
+		msp.qos_queue_group_id = qos_queue_group_id;
+	}
 	err = mlx5e_modify_sq(mdev, *sqn, &msp);
 	if (err)
 		mlx5e_destroy_sq(mdev, *sqn);
@@ -1288,13 +1299,9 @@ int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev,
 static int mlx5e_set_sq_maxrate(struct net_device *dev,
 				struct mlx5e_txqsq *sq, u32 rate);
 
-static int mlx5e_open_txqsq(struct mlx5e_channel *c,
-			    u32 tisn,
-			    int txq_ix,
-			    struct mlx5e_params *params,
-			    struct mlx5e_sq_param *param,
-			    struct mlx5e_txqsq *sq,
-			    int tc)
+int mlx5e_open_txqsq(struct mlx5e_channel *c, u32 tisn, int txq_ix,
+		     struct mlx5e_params *params, struct mlx5e_sq_param *param,
+		     struct mlx5e_txqsq *sq, int tc, u16 qos_queue_group_id, u16 qos_qid)
 {
 	struct mlx5e_create_sq_param csp = {};
 	u32 tx_rate;
@@ -1304,12 +1311,17 @@ static int mlx5e_open_txqsq(struct mlx5e_channel *c,
 	if (err)
 		return err;
 
+	if (qos_queue_group_id)
+		sq->stats = c->priv->htb.qos_sq_stats[qos_qid];
+	else
+		sq->stats = &c->priv->channel_stats[c->ix].sq[tc];
+
 	csp.tisn            = tisn;
 	csp.tis_lst_sz      = 1;
 	csp.cqn             = sq->cq.mcq.cqn;
 	csp.wq_ctrl         = &sq->wq_ctrl;
 	csp.min_inline_mode = sq->min_inline_mode;
-	err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn);
+	err = mlx5e_create_sq_rdy(c->mdev, param, &csp, qos_queue_group_id, &sq->sqn);
 	if (err)
 		goto err_free_txqsq;
 
@@ -1366,7 +1378,7 @@ void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq)
 	}
 }
 
-static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq)
+void mlx5e_close_txqsq(struct mlx5e_txqsq *sq)
 {
 	struct mlx5_core_dev *mdev = sq->mdev;
 	struct mlx5_rate_limit rl = {0};
@@ -1403,7 +1415,7 @@ int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params,
 	csp.cqn             = sq->cq.mcq.cqn;
 	csp.wq_ctrl         = &sq->wq_ctrl;
 	csp.min_inline_mode = params->tx_min_inline_mode;
-	err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn);
+	err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn);
 	if (err)
 		goto err_free_icosq;
 
@@ -1452,7 +1464,7 @@ int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params,
 	csp.wq_ctrl         = &sq->wq_ctrl;
 	csp.min_inline_mode = sq->min_inline_mode;
 	set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
-	err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn);
+	err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn);
 	if (err)
 		goto err_free_xdpsq;
 
@@ -1703,7 +1715,7 @@ static int mlx5e_open_sqs(struct mlx5e_channel *c,
 		int txq_ix = c->ix + tc * params->num_channels;
 
 		err = mlx5e_open_txqsq(c, c->priv->tisn[c->lag_port][tc], txq_ix,
-				       params, &cparam->txq_sq, &c->sq[tc], tc);
+				       params, &cparam->txq_sq, &c->sq[tc], tc, 0, 0);
 		if (err)
 			goto err_close_sqs;
 	}
@@ -2044,6 +2056,8 @@ static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
 	mlx5e_deactivate_icosq(&c->icosq);
 	for (tc = 0; tc < c->num_tc; tc++)
 		mlx5e_deactivate_txqsq(&c->sq[tc]);
+
+	mlx5e_qos_deactivate_queues(c);
 }
 
 static void mlx5e_close_channel(struct mlx5e_channel *c)
@@ -2051,6 +2065,7 @@ static void mlx5e_close_channel(struct mlx5e_channel *c)
 	if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
 		mlx5e_close_xsk(c);
 	mlx5e_close_queues(c);
+	mlx5e_qos_close_queues(c);
 	netif_napi_del(&c->napi);
 
 	kvfree(c);
@@ -2198,9 +2213,8 @@ void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
 	param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(priv->mdev));
 }
 
-static void mlx5e_build_sq_param(struct mlx5e_priv *priv,
-				 struct mlx5e_params *params,
-				 struct mlx5e_sq_param *param)
+void mlx5e_build_sq_param(struct mlx5e_priv *priv, struct mlx5e_params *params,
+			  struct mlx5e_sq_param *param)
 {
 	void *sqc = param->sqc;
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
@@ -2379,10 +2393,18 @@ int mlx5e_open_channels(struct mlx5e_priv *priv,
 			goto err_close_channels;
 	}
 
+	err = mlx5e_qos_open_queues(priv, chs);
+	if (err)
+		goto err_close_ptp;
+
 	mlx5e_health_channels_update(priv);
 	kvfree(cparam);
 	return 0;
 
+err_close_ptp:
+	if (chs->port_ptp)
+		mlx5e_port_ptp_close(chs->port_ptp);
+
 err_close_channels:
 	for (i--; i >= 0; i--)
 		mlx5e_close_channel(chs->c[i]);
@@ -2915,11 +2937,31 @@ static void mlx5e_netdev_set_tcs(struct net_device *netdev, u16 nch, u8 ntc)
 		netdev_set_tc_queue(netdev, tc, nch, 0);
 }
 
+int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv)
+{
+	int qos_queues, nch, ntc, num_txqs, err;
+
+	qos_queues = mlx5e_qos_cur_leaf_nodes(priv);
+
+	nch = priv->channels.params.num_channels;
+	ntc = priv->channels.params.num_tc;
+	num_txqs = nch * ntc + qos_queues;
+	if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS))
+		num_txqs += ntc;
+
+	mlx5e_dbg(DRV, priv, "Setting num_txqs %d\n", num_txqs);
+	err = netif_set_real_num_tx_queues(priv->netdev, num_txqs);
+	if (err)
+		netdev_warn(priv->netdev, "netif_set_real_num_tx_queues failed, %d\n", err);
+
+	return err;
+}
+
 static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv)
 {
 	struct net_device *netdev = priv->netdev;
-	int num_txqs, num_rxqs, nch, ntc;
 	int old_num_txqs, old_ntc;
+	int num_rxqs, nch, ntc;
 	int err;
 
 	old_num_txqs = netdev->real_num_tx_queues;
@@ -2927,18 +2969,13 @@ static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv)
 
 	nch = priv->channels.params.num_channels;
 	ntc = priv->channels.params.num_tc;
-	num_txqs = nch * ntc;
-	if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS))
-		num_txqs += ntc;
 	num_rxqs = nch * priv->profile->rq_groups;
 
 	mlx5e_netdev_set_tcs(netdev, nch, ntc);
 
-	err = netif_set_real_num_tx_queues(netdev, num_txqs);
-	if (err) {
-		netdev_warn(netdev, "netif_set_real_num_tx_queues failed, %d\n", err);
+	err = mlx5e_update_tx_netdev_queues(priv);
+	if (err)
 		goto err_tcs;
-	}
 	err = netif_set_real_num_rx_queues(netdev, num_rxqs);
 	if (err) {
 		netdev_warn(netdev, "netif_set_real_num_rx_queues failed, %d\n", err);
@@ -3042,6 +3079,7 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
 	mlx5e_update_num_tc_x_num_ch(priv);
 	mlx5e_build_txq_maps(priv);
 	mlx5e_activate_channels(&priv->channels);
+	mlx5e_qos_activate_queues(priv);
 	mlx5e_xdp_tx_enable(priv);
 	netif_tx_start_all_queues(priv->netdev);
 
@@ -3608,6 +3646,14 @@ static int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv,
 
 	mutex_lock(&priv->state_lock);
 
+	/* MQPRIO is another toplevel qdisc that can't be attached
+	 * simultaneously with the offloaded HTB.
+	 */
+	if (WARN_ON(priv->htb.maj_id)) {
+		err = -EINVAL;
+		goto out;
+	}
+
 	new_channels.params = priv->channels.params;
 	new_channels.params.num_tc = tc ? tc : 1;
 
@@ -3628,12 +3674,55 @@ out:
 	return err;
 }
 
+static int mlx5e_setup_tc_htb(struct mlx5e_priv *priv, struct tc_htb_qopt_offload *htb)
+{
+	int res;
+
+	switch (htb->command) {
+	case TC_HTB_CREATE:
+		return mlx5e_htb_root_add(priv, htb->parent_classid, htb->classid,
+					  htb->extack);
+	case TC_HTB_DESTROY:
+		return mlx5e_htb_root_del(priv);
+	case TC_HTB_LEAF_ALLOC_QUEUE:
+		res = mlx5e_htb_leaf_alloc_queue(priv, htb->classid, htb->parent_classid,
+						 htb->rate, htb->ceil, htb->extack);
+		if (res < 0)
+			return res;
+		htb->qid = res;
+		return 0;
+	case TC_HTB_LEAF_TO_INNER:
+		return mlx5e_htb_leaf_to_inner(priv, htb->parent_classid, htb->classid,
+					       htb->rate, htb->ceil, htb->extack);
+	case TC_HTB_LEAF_DEL:
+		return mlx5e_htb_leaf_del(priv, htb->classid, &htb->moved_qid, &htb->qid,
+					  htb->extack);
+	case TC_HTB_LEAF_DEL_LAST:
+	case TC_HTB_LEAF_DEL_LAST_FORCE:
+		return mlx5e_htb_leaf_del_last(priv, htb->classid,
+					       htb->command == TC_HTB_LEAF_DEL_LAST_FORCE,
+					       htb->extack);
+	case TC_HTB_NODE_MODIFY:
+		return mlx5e_htb_node_modify(priv, htb->classid, htb->rate, htb->ceil,
+					     htb->extack);
+	case TC_HTB_LEAF_QUERY_QUEUE:
+		res = mlx5e_get_txq_by_classid(priv, htb->classid);
+		if (res < 0)
+			return res;
+		htb->qid = res;
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static LIST_HEAD(mlx5e_block_cb_list);
 
 static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			  void *type_data)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	int err;
 
 	switch (type) {
 	case TC_SETUP_BLOCK: {
@@ -3647,6 +3736,11 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
 	}
 	case TC_SETUP_QDISC_MQPRIO:
 		return mlx5e_setup_tc_mqprio(priv, type_data);
+	case TC_SETUP_QDISC_HTB:
+		mutex_lock(&priv->state_lock);
+		err = mlx5e_setup_tc_htb(priv, type_data);
+		mutex_unlock(&priv->state_lock);
+		return err;
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -3811,20 +3905,25 @@ static int set_feature_cvlan_filter(struct net_device *netdev, bool enable)
 	return 0;
 }
 
-#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
-static int set_feature_tc_num_filters(struct net_device *netdev, bool enable)
+static int set_feature_hw_tc(struct net_device *netdev, bool enable)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 
+#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
 	if (!enable && mlx5e_tc_num_filters(priv, MLX5_TC_FLAG(NIC_OFFLOAD))) {
 		netdev_err(netdev,
 			   "Active offloaded tc filters, can't turn hw_tc_offload off\n");
 		return -EINVAL;
 	}
+#endif
+
+	if (!enable && priv->htb.maj_id) {
+		netdev_err(netdev, "Active HTB offload, can't turn hw_tc_offload off\n");
+		return -EINVAL;
+	}
 
 	return 0;
 }
-#endif
 
 static int set_feature_rx_all(struct net_device *netdev, bool enable)
 {
@@ -3922,9 +4021,7 @@ int mlx5e_set_features(struct net_device *netdev, netdev_features_t features)
 	err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro);
 	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER,
 				    set_feature_cvlan_filter);
-#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
-	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_tc_num_filters);
-#endif
+	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_hw_tc);
 	err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all);
 	err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs);
 	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan);
@@ -5028,6 +5125,8 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 		netdev->hw_features	 |= NETIF_F_NTUPLE;
 #endif
 	}
+	if (mlx5_qos_is_supported(mdev))
+		netdev->features |= NETIF_F_HW_TC;
 
 	netdev->features         |= NETIF_F_HIGHDMA;
 	netdev->features         |= NETIF_F_HW_VLAN_STAG_FILTER;
@@ -5333,6 +5432,7 @@ int mlx5e_netdev_init(struct net_device *netdev,
 		return -ENOMEM;
 
 	mutex_init(&priv->state_lock);
+	hash_init(priv->htb.qos_tc2node);
 	INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work);
 	INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work);
 	INIT_WORK(&priv->tx_timeout_work, mlx5e_tx_timeout_work);
@@ -5355,8 +5455,14 @@ err_free_cpumask:
 
 void mlx5e_netdev_cleanup(struct net_device *netdev, struct mlx5e_priv *priv)
 {
+	int i;
+
 	destroy_workqueue(priv->wq);
 	free_cpumask_var(priv->scratchpad.cpumask);
+
+	for (i = 0; i < priv->htb.max_qos_sqs; i++)
+		kfree(priv->htb.qos_sq_stats[i]);
+	kvfree(priv->htb.qos_sq_stats);
 }
 
 struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
@@ -5366,13 +5472,17 @@ struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
 {
 	struct net_device *netdev;
 	unsigned int ptp_txqs = 0;
+	int qos_sqs = 0;
 	int err;
 
 	if (MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn))
 		ptp_txqs = profile->max_tc;
 
+	if (mlx5_qos_is_supported(mdev))
+		qos_sqs = mlx5e_qos_max_leaf_nodes(mdev);
+
 	netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv),
-				    nch * profile->max_tc + ptp_txqs,
+				    nch * profile->max_tc + ptp_txqs + qos_sqs,
 				    nch * profile->rq_groups);
 	if (!netdev) {
 		mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 2cf2042b37c7..92c5b81427b9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -420,6 +420,25 @@ static void mlx5e_stats_grp_sw_update_stats_ptp(struct mlx5e_priv *priv,
 	}
 }
 
+static void mlx5e_stats_grp_sw_update_stats_qos(struct mlx5e_priv *priv,
+						struct mlx5e_sw_stats *s)
+{
+	struct mlx5e_sq_stats **stats;
+	u16 max_qos_sqs;
+	int i;
+
+	/* Pairs with smp_store_release in mlx5e_open_qos_sq. */
+	max_qos_sqs = smp_load_acquire(&priv->htb.max_qos_sqs);
+	stats = READ_ONCE(priv->htb.qos_sq_stats);
+
+	for (i = 0; i < max_qos_sqs; i++) {
+		mlx5e_stats_grp_sw_update_stats_sq(s, READ_ONCE(stats[i]));
+
+		/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92657 */
+		barrier();
+	}
+}
+
 static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw)
 {
 	struct mlx5e_sw_stats *s = &priv->stats.sw;
@@ -449,6 +468,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw)
 		}
 	}
 	mlx5e_stats_grp_sw_update_stats_ptp(priv, s);
+	mlx5e_stats_grp_sw_update_stats_qos(priv, s);
 }
 
 static const struct counter_desc q_stats_desc[] = {
@@ -1740,6 +1760,41 @@ static const struct counter_desc ptp_cq_stats_desc[] = {
 	{ MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, abort_abs_diff_ns) },
 };
 
+static const struct counter_desc qos_sq_stats_desc[] = {
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, packets) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, bytes) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_packets) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_bytes) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_inner_packets) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_inner_bytes) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, csum_partial) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, csum_partial_inner) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, added_vlan_packets) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, nop) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, mpwqe_blks) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, mpwqe_pkts) },
+#ifdef CONFIG_MLX5_EN_TLS
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_encrypted_packets) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_encrypted_bytes) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_ctx) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_ooo) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_dump_packets) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_dump_bytes) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_resync_bytes) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_skip_no_sync_data) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_drop_no_sync_data) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_drop_bypass_req) },
+#endif
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, csum_none) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, stopped) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, dropped) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, xmit_more) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, recover) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, cqes) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, wake) },
+	{ MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
+};
+
 #define NUM_RQ_STATS			ARRAY_SIZE(rq_stats_desc)
 #define NUM_SQ_STATS			ARRAY_SIZE(sq_stats_desc)
 #define NUM_XDPSQ_STATS			ARRAY_SIZE(xdpsq_stats_desc)
@@ -1750,6 +1805,49 @@ static const struct counter_desc ptp_cq_stats_desc[] = {
 #define NUM_PTP_SQ_STATS		ARRAY_SIZE(ptp_sq_stats_desc)
 #define NUM_PTP_CH_STATS		ARRAY_SIZE(ptp_ch_stats_desc)
 #define NUM_PTP_CQ_STATS		ARRAY_SIZE(ptp_cq_stats_desc)
+#define NUM_QOS_SQ_STATS		ARRAY_SIZE(qos_sq_stats_desc)
+
+static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(qos)
+{
+	/* Pairs with smp_store_release in mlx5e_open_qos_sq. */
+	return NUM_QOS_SQ_STATS * smp_load_acquire(&priv->htb.max_qos_sqs);
+}
+
+static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(qos)
+{
+	/* Pairs with smp_store_release in mlx5e_open_qos_sq. */
+	u16 max_qos_sqs = smp_load_acquire(&priv->htb.max_qos_sqs);
+	int i, qid;
+
+	for (qid = 0; qid < max_qos_sqs; qid++)
+		for (i = 0; i < NUM_QOS_SQ_STATS; i++)
+			sprintf(data + (idx++) * ETH_GSTRING_LEN,
+				qos_sq_stats_desc[i].format, qid);
+
+	return idx;
+}
+
+static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(qos)
+{
+	struct mlx5e_sq_stats **stats;
+	u16 max_qos_sqs;
+	int i, qid;
+
+	/* Pairs with smp_store_release in mlx5e_open_qos_sq. */
+	max_qos_sqs = smp_load_acquire(&priv->htb.max_qos_sqs);
+	stats = READ_ONCE(priv->htb.qos_sq_stats);
+
+	for (qid = 0; qid < max_qos_sqs; qid++) {
+		struct mlx5e_sq_stats *s = READ_ONCE(stats[qid]);
+
+		for (i = 0; i < NUM_QOS_SQ_STATS; i++)
+			data[idx++] = MLX5E_READ_CTR64_CPU(s, qos_sq_stats_desc, i);
+	}
+
+	return idx;
+}
+
+static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(qos) { return; }
 
 static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ptp)
 {
@@ -1932,6 +2030,7 @@ MLX5E_DEFINE_STATS_GRP(per_port_buff_congest, 0);
 MLX5E_DEFINE_STATS_GRP(eth_ext, 0);
 static MLX5E_DEFINE_STATS_GRP(tls, 0);
 static MLX5E_DEFINE_STATS_GRP(ptp, 0);
+static MLX5E_DEFINE_STATS_GRP(qos, 0);
 
 /* The stats groups order is opposite to the update_stats() order calls */
 mlx5e_stats_grp_t mlx5e_nic_stats_grps[] = {
@@ -1955,6 +2054,7 @@ mlx5e_stats_grp_t mlx5e_nic_stats_grps[] = {
 	&MLX5E_STATS_GRP(channels),
 	&MLX5E_STATS_GRP(per_port_buff_congest),
 	&MLX5E_STATS_GRP(ptp),
+	&MLX5E_STATS_GRP(qos),
 };
 
 unsigned int mlx5e_nic_stats_grps_num(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index e41fc11f2ce7..93c41312fb03 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -55,6 +55,8 @@
 #define MLX5E_DECLARE_PTP_CH_STAT(type, fld) "ptp_ch_"#fld, offsetof(type, fld)
 #define MLX5E_DECLARE_PTP_CQ_STAT(type, fld) "ptp_cq%d_"#fld, offsetof(type, fld)
 
+#define MLX5E_DECLARE_QOS_TX_STAT(type, fld) "qos_tx%d_"#fld, offsetof(type, fld)
+
 struct counter_desc {
 	char		format[ETH_GSTRING_LEN];
 	size_t		offset; /* Byte offset */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 74f233eece54..da6a358a8a10 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -106,28 +106,53 @@ return_txq:
 	return priv->port_ptp_tc2realtxq[up];
 }
 
+static int mlx5e_select_htb_queue(struct mlx5e_priv *priv, struct sk_buff *skb,
+				  u16 htb_maj_id)
+{
+	u16 classid;
+
+	if ((TC_H_MAJ(skb->priority) >> 16) == htb_maj_id)
+		classid = TC_H_MIN(skb->priority);
+	else
+		classid = READ_ONCE(priv->htb.defcls);
+
+	if (!classid)
+		return 0;
+
+	return mlx5e_get_txq_by_classid(priv, classid);
+}
+
 u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
 		       struct net_device *sb_dev)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	int num_tc_x_num_ch;
 	int txq_ix;
 	int up = 0;
 	int ch_ix;
 
-	if (unlikely(priv->channels.port_ptp)) {
-		int num_tc_x_num_ch;
+	/* Sync with mlx5e_update_num_tc_x_num_ch - avoid refetching. */
+	num_tc_x_num_ch = READ_ONCE(priv->num_tc_x_num_ch);
+	if (unlikely(dev->real_num_tx_queues > num_tc_x_num_ch)) {
+		/* Order maj_id before defcls - pairs with mlx5e_htb_root_add. */
+		u16 htb_maj_id = smp_load_acquire(&priv->htb.maj_id);
 
-		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
-		    mlx5e_use_ptpsq(skb))
-			return mlx5e_select_ptpsq(dev, skb);
+		if (unlikely(htb_maj_id)) {
+			txq_ix = mlx5e_select_htb_queue(priv, skb, htb_maj_id);
+			if (txq_ix > 0)
+				return txq_ix;
+		}
 
-		/* Sync with mlx5e_update_num_tc_x_num_ch - avoid refetching. */
-		num_tc_x_num_ch = READ_ONCE(priv->num_tc_x_num_ch);
+		if (unlikely(priv->channels.port_ptp))
+			if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
+			    mlx5e_use_ptpsq(skb))
+				return mlx5e_select_ptpsq(dev, skb);
 
 		txq_ix = netdev_pick_tx(dev, skb, NULL);
-		/* Fix netdev_pick_tx() not to choose ptp_channel txqs.
+		/* Fix netdev_pick_tx() not to choose ptp_channel and HTB txqs.
 		 * If they are selected, switch to regular queues.
-		 * Driver to select these queues only at mlx5e_select_ptpsq().
+		 * Driver to select these queues only at mlx5e_select_ptpsq()
+		 * and mlx5e_select_htb_queue().
 		 */
 		if (unlikely(txq_ix >= num_tc_x_num_ch))
 			txq_ix %= num_tc_x_num_ch;
@@ -702,6 +727,10 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev)
 	u16 pi;
 
 	sq = priv->txq2sq[skb_get_queue_mapping(skb)];
+	if (unlikely(!sq)) {
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
 
 	/* May send SKBs and WQEs. */
 	if (unlikely(!mlx5e_accel_tx_begin(dev, sq, skb, &accel)))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index a3cfe06d5116..d54da3797c30 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -115,17 +115,21 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 					       napi);
 	struct mlx5e_ch_stats *ch_stats = c->stats;
 	struct mlx5e_xdpsq *xsksq = &c->xsksq;
+	struct mlx5e_txqsq __rcu **qos_sqs;
 	struct mlx5e_rq *xskrq = &c->xskrq;
 	struct mlx5e_rq *rq = &c->rq;
 	bool aff_change = false;
 	bool busy_xsk = false;
 	bool busy = false;
 	int work_done = 0;
+	u16 qos_sqs_size;
 	bool xsk_open;
 	int i;
 
 	rcu_read_lock();
 
+	qos_sqs = rcu_dereference(c->qos_sqs);
+
 	xsk_open = test_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
 
 	ch_stats->poll++;
@@ -133,6 +137,18 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 	for (i = 0; i < c->num_tc; i++)
 		busy |= mlx5e_poll_tx_cq(&c->sq[i].cq, budget);
 
+	if (unlikely(qos_sqs)) {
+		smp_rmb(); /* Pairs with mlx5e_qos_alloc_queues. */
+		qos_sqs_size = READ_ONCE(c->qos_sqs_size);
+
+		for (i = 0; i < qos_sqs_size; i++) {
+			struct mlx5e_txqsq *sq = rcu_dereference(qos_sqs[i]);
+
+			if (sq)
+				busy |= mlx5e_poll_tx_cq(&sq->cq, budget);
+		}
+	}
+
 	busy |= mlx5e_poll_xdpsq_cq(&c->xdpsq.cq);
 
 	if (c->xdp)
@@ -186,6 +202,16 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 		mlx5e_handle_tx_dim(&c->sq[i]);
 		mlx5e_cq_arm(&c->sq[i].cq);
 	}
+	if (unlikely(qos_sqs)) {
+		for (i = 0; i < qos_sqs_size; i++) {
+			struct mlx5e_txqsq *sq = rcu_dereference(qos_sqs[i]);
+
+			if (sq) {
+				mlx5e_handle_tx_dim(sq);
+				mlx5e_cq_arm(&sq->cq);
+			}
+		}
+	}
 
 	mlx5e_handle_rx_dim(rq);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/qos.c
new file mode 100644
index 000000000000..0777be24a307
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qos.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */
+
+#include "qos.h"
+
+#define MLX5_QOS_DEFAULT_DWRR_UID 0
+
+bool mlx5_qos_is_supported(struct mlx5_core_dev *mdev)
+{
+	if (!MLX5_CAP_GEN(mdev, qos))
+		return false;
+	if (!MLX5_CAP_QOS(mdev, nic_sq_scheduling))
+		return false;
+	if (!MLX5_CAP_QOS(mdev, nic_bw_share))
+		return false;
+	if (!MLX5_CAP_QOS(mdev, nic_rate_limit))
+		return false;
+	return true;
+}
+
+int mlx5_qos_max_leaf_nodes(struct mlx5_core_dev *mdev)
+{
+	return 1 << MLX5_CAP_QOS(mdev, log_max_qos_nic_queue_group);
+}
+
+int mlx5_qos_create_leaf_node(struct mlx5_core_dev *mdev, u32 parent_id,
+			      u32 bw_share, u32 max_avg_bw, u32 *id)
+{
+	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0};
+
+	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id);
+	MLX5_SET(scheduling_context, sched_ctx, element_type,
+		 SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP);
+	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
+	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_avg_bw);
+
+	return mlx5_create_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC,
+						  sched_ctx, id);
+}
+
+int mlx5_qos_create_inner_node(struct mlx5_core_dev *mdev, u32 parent_id,
+			       u32 bw_share, u32 max_avg_bw, u32 *id)
+{
+	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0};
+	void *attr;
+
+	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id);
+	MLX5_SET(scheduling_context, sched_ctx, element_type,
+		 SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
+	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
+	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_avg_bw);
+
+	attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes);
+	MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR);
+
+	return mlx5_create_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC,
+						  sched_ctx, id);
+}
+
+int mlx5_qos_create_root_node(struct mlx5_core_dev *mdev, u32 *id)
+{
+	return mlx5_qos_create_inner_node(mdev, MLX5_QOS_DEFAULT_DWRR_UID, 0, 0, id);
+}
+
+int mlx5_qos_update_node(struct mlx5_core_dev *mdev, u32 parent_id,
+			 u32 bw_share, u32 max_avg_bw, u32 id)
+{
+	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0};
+	u32 bitmask = 0;
+
+	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id);
+	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
+	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_avg_bw);
+
+	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
+	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
+
+	return mlx5_modify_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC,
+						  sched_ctx, id, bitmask);
+}
+
+int mlx5_qos_destroy_node(struct mlx5_core_dev *mdev, u32 id)
+{
+	return mlx5_destroy_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC, id);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qos.h b/drivers/net/ethernet/mellanox/mlx5/core/qos.h
new file mode 100644
index 000000000000..125e4e47e6f7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qos.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */
+
+#ifndef __MLX5_QOS_H
+#define __MLX5_QOS_H
+
+#include "mlx5_core.h"
+
+#define MLX5_DEBUG_QOS_MASK BIT(4)
+
+#define qos_err(mdev, fmt, ...) \
+	mlx5_core_err(mdev, "QoS: " fmt, ##__VA_ARGS__)
+#define qos_warn(mdev, fmt, ...) \
+	mlx5_core_warn(mdev, "QoS: " fmt, ##__VA_ARGS__)
+#define qos_dbg(mdev, fmt, ...) \
+	mlx5_core_dbg_mask(mdev, MLX5_DEBUG_QOS_MASK, "QoS: " fmt, ##__VA_ARGS__)
+
+bool mlx5_qos_is_supported(struct mlx5_core_dev *mdev);
+int mlx5_qos_max_leaf_nodes(struct mlx5_core_dev *mdev);
+
+int mlx5_qos_create_leaf_node(struct mlx5_core_dev *mdev, u32 parent_id,
+			      u32 bw_share, u32 max_avg_bw, u32 *id);
+int mlx5_qos_create_inner_node(struct mlx5_core_dev *mdev, u32 parent_id,
+			       u32 bw_share, u32 max_avg_bw, u32 *id);
+int mlx5_qos_create_root_node(struct mlx5_core_dev *mdev, u32 *id);
+int mlx5_qos_update_node(struct mlx5_core_dev *mdev, u32 parent_id, u32 bw_share,
+			 u32 max_avg_bw, u32 id);
+int mlx5_qos_destroy_node(struct mlx5_core_dev *mdev, u32 id);
+
+#endif
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 823411e288c0..71ae6aac3410 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -842,11 +842,16 @@ struct mlx5_ifc_qos_cap_bits {
 	u8         reserved_at_4[0x1];
 	u8         packet_pacing_burst_bound[0x1];
 	u8         packet_pacing_typical_size[0x1];
-	u8         reserved_at_7[0x4];
+	u8         reserved_at_7[0x1];
+	u8         nic_sq_scheduling[0x1];
+	u8         nic_bw_share[0x1];
+	u8         nic_rate_limit[0x1];
 	u8         packet_pacing_uid[0x1];
 	u8         reserved_at_c[0x14];
 
-	u8         reserved_at_20[0x20];
+	u8         reserved_at_20[0xb];
+	u8         log_max_qos_nic_queue_group[0x5];
+	u8         reserved_at_30[0x10];
 
 	u8         packet_pacing_max_rate[0x20];
 
@@ -3347,7 +3352,7 @@ struct mlx5_ifc_sqc_bits {
 	u8         reserved_at_e0[0x10];
 	u8         packet_pacing_rate_limit_index[0x10];
 	u8         tis_lst_sz[0x10];
-	u8         reserved_at_110[0x10];
+	u8         qos_queue_group_id[0x10];
 
 	u8         reserved_at_120[0x40];
 
@@ -3362,6 +3367,7 @@ enum {
 	SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT = 0x1,
 	SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC = 0x2,
 	SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC = 0x3,
+	SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP = 0x4,
 };
 
 enum {
@@ -4805,6 +4811,7 @@ struct mlx5_ifc_query_scheduling_element_out_bits {
 
 enum {
 	SCHEDULING_HIERARCHY_E_SWITCH = 0x2,
+	SCHEDULING_HIERARCHY_NIC = 0x3,
 };
 
 struct mlx5_ifc_query_scheduling_element_in_bits {
-- 
cgit v1.2.3


From 9d0735519f99948c5b5c22426b682ced7f7af9be Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 20 Jan 2021 16:41:56 +0100
Subject: rtc: remove sirfsoc driver

The CSR SiRF prima2/atlas platforms are getting removed, so this driver
is no longer needed.

Cc: Barry Song <baohua@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Barry Song <baohua@kernel.org>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20210120154158.1860736-2-arnd@kernel.org
---
 .../devicetree/bindings/rtc/sirf,prima2-sysrtc.txt |  13 -
 drivers/rtc/Kconfig                                |   7 -
 drivers/rtc/Makefile                               |   1 -
 drivers/rtc/rtc-sirfsoc.c                          | 446 ---------------------
 include/linux/rtc/sirfsoc_rtciobrg.h               |  21 -
 5 files changed, 488 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/rtc/sirf,prima2-sysrtc.txt
 delete mode 100644 drivers/rtc/rtc-sirfsoc.c
 delete mode 100644 include/linux/rtc/sirfsoc_rtciobrg.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/rtc/sirf,prima2-sysrtc.txt b/Documentation/devicetree/bindings/rtc/sirf,prima2-sysrtc.txt
deleted file mode 100644
index 58885b55da21..000000000000
--- a/Documentation/devicetree/bindings/rtc/sirf,prima2-sysrtc.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-SiRFSoC Real Time Clock
-
-Required properties:
-- compatible: must be "sirf,prima2-sysrtc"
-- reg: address range of rtc register set.
-- interrupts: rtc alarm interrupts.
-
-Example:
-	rtc@2000 {
-		compatible = "sirf,prima2-sysrtc";
-		reg = <0x2000 0x1000>;
-		interrupts = <52 53 54>;
-	};
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index fe4fdb75a4c5..12907352f697 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -1793,13 +1793,6 @@ config RTC_DRV_IMX_SC
 	   If you say yes here you get support for the NXP i.MX System
 	   Controller RTC module.
 
-config RTC_DRV_SIRFSOC
-	tristate "SiRFSOC RTC"
-	depends on ARCH_SIRF
-	help
-	  Say "yes" here to support the real time clock on SiRF SOC chips.
-	  This driver can also be built as a module called rtc-sirfsoc.
-
 config RTC_DRV_ST_LPC
 	tristate "STMicroelectronics LPC RTC"
 	depends on ARCH_STI
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index a020adde4bbd..821643b3b699 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -154,7 +154,6 @@ obj-$(CONFIG_RTC_DRV_SA1100)	+= rtc-sa1100.o
 obj-$(CONFIG_RTC_DRV_SC27XX)	+= rtc-sc27xx.o
 obj-$(CONFIG_RTC_DRV_SD3078)   += rtc-sd3078.o
 obj-$(CONFIG_RTC_DRV_SH)	+= rtc-sh.o
-obj-$(CONFIG_RTC_DRV_SIRFSOC)	+= rtc-sirfsoc.o
 obj-$(CONFIG_RTC_DRV_SNVS)	+= rtc-snvs.o
 obj-$(CONFIG_RTC_DRV_SPEAR)	+= rtc-spear.o
 obj-$(CONFIG_RTC_DRV_STARFIRE)	+= rtc-starfire.o
diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c
deleted file mode 100644
index 03a6cca23201..000000000000
--- a/drivers/rtc/rtc-sirfsoc.c
+++ /dev/null
@@ -1,446 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SiRFSoC Real Time Clock interface for Linux
- *
- * Copyright (c) 2013 Cambridge Silicon Radio Limited, a CSR plc group company.
- */
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/rtc.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/io.h>
-#include <linux/of.h>
-#include <linux/regmap.h>
-#include <linux/rtc/sirfsoc_rtciobrg.h>
-
-
-#define RTC_CN			0x00
-#define RTC_ALARM0		0x04
-#define RTC_ALARM1		0x18
-#define RTC_STATUS		0x08
-#define RTC_SW_VALUE            0x40
-#define SIRFSOC_RTC_AL1E	(1<<6)
-#define SIRFSOC_RTC_AL1		(1<<4)
-#define SIRFSOC_RTC_HZE		(1<<3)
-#define SIRFSOC_RTC_AL0E	(1<<2)
-#define SIRFSOC_RTC_HZ		(1<<1)
-#define SIRFSOC_RTC_AL0		(1<<0)
-#define RTC_DIV			0x0c
-#define RTC_DEEP_CTRL		0x14
-#define RTC_CLOCK_SWITCH	0x1c
-#define SIRFSOC_RTC_CLK		0x03	/* others are reserved */
-
-/* Refer to RTC DIV switch */
-#define RTC_HZ			16
-
-/* This macro is also defined in arch/arm/plat-sirfsoc/cpu.c */
-#define RTC_SHIFT		4
-
-#define INTR_SYSRTC_CN		0x48
-
-struct sirfsoc_rtc_drv {
-	struct rtc_device	*rtc;
-	u32			rtc_base;
-	u32			irq;
-	unsigned		irq_wake;
-	/* Overflow for every 8 years extra time */
-	u32			overflow_rtc;
-	spinlock_t		lock;
-	struct regmap *regmap;
-#ifdef CONFIG_PM
-	u32		saved_counter;
-	u32		saved_overflow_rtc;
-#endif
-};
-
-static u32 sirfsoc_rtc_readl(struct sirfsoc_rtc_drv *rtcdrv, u32 offset)
-{
-	u32 val;
-
-	regmap_read(rtcdrv->regmap, rtcdrv->rtc_base + offset, &val);
-	return val;
-}
-
-static void sirfsoc_rtc_writel(struct sirfsoc_rtc_drv *rtcdrv,
-			       u32 offset, u32 val)
-{
-	regmap_write(rtcdrv->regmap, rtcdrv->rtc_base + offset, val);
-}
-
-static int sirfsoc_rtc_read_alarm(struct device *dev,
-		struct rtc_wkalrm *alrm)
-{
-	unsigned long rtc_alarm, rtc_count;
-	struct sirfsoc_rtc_drv *rtcdrv;
-
-	rtcdrv = dev_get_drvdata(dev);
-
-	spin_lock_irq(&rtcdrv->lock);
-
-	rtc_count = sirfsoc_rtc_readl(rtcdrv, RTC_CN);
-
-	rtc_alarm = sirfsoc_rtc_readl(rtcdrv, RTC_ALARM0);
-	memset(alrm, 0, sizeof(struct rtc_wkalrm));
-
-	/*
-	 * assume alarm interval not beyond one round counter overflow_rtc:
-	 * 0->0xffffffff
-	 */
-	/* if alarm is in next overflow cycle */
-	if (rtc_count > rtc_alarm)
-		rtc_time64_to_tm((rtcdrv->overflow_rtc + 1)
-				 << (BITS_PER_LONG - RTC_SHIFT)
-				 | rtc_alarm >> RTC_SHIFT, &alrm->time);
-	else
-		rtc_time64_to_tm(rtcdrv->overflow_rtc
-				 << (BITS_PER_LONG - RTC_SHIFT)
-				 | rtc_alarm >> RTC_SHIFT, &alrm->time);
-	if (sirfsoc_rtc_readl(rtcdrv, RTC_STATUS) & SIRFSOC_RTC_AL0E)
-		alrm->enabled = 1;
-
-	spin_unlock_irq(&rtcdrv->lock);
-
-	return 0;
-}
-
-static int sirfsoc_rtc_set_alarm(struct device *dev,
-		struct rtc_wkalrm *alrm)
-{
-	unsigned long rtc_status_reg, rtc_alarm;
-	struct sirfsoc_rtc_drv *rtcdrv;
-	rtcdrv = dev_get_drvdata(dev);
-
-	if (alrm->enabled) {
-		rtc_alarm = rtc_tm_to_time64(&alrm->time);
-
-		spin_lock_irq(&rtcdrv->lock);
-
-		rtc_status_reg = sirfsoc_rtc_readl(rtcdrv, RTC_STATUS);
-		if (rtc_status_reg & SIRFSOC_RTC_AL0E) {
-			/*
-			 * An ongoing alarm in progress - ingore it and not
-			 * to return EBUSY
-			 */
-			dev_info(dev, "An old alarm was set, will be replaced by a new one\n");
-		}
-
-		sirfsoc_rtc_writel(rtcdrv, RTC_ALARM0, rtc_alarm << RTC_SHIFT);
-		rtc_status_reg &= ~0x07; /* mask out the lower status bits */
-		/*
-		 * This bit RTC_AL sets it as a wake-up source for Sleep Mode
-		 * Writing 1 into this bit will clear it
-		 */
-		rtc_status_reg |= SIRFSOC_RTC_AL0;
-		/* enable the RTC alarm interrupt */
-		rtc_status_reg |= SIRFSOC_RTC_AL0E;
-		sirfsoc_rtc_writel(rtcdrv, RTC_STATUS, rtc_status_reg);
-
-		spin_unlock_irq(&rtcdrv->lock);
-	} else {
-		/*
-		 * if this function was called with enabled=0
-		 * then it could mean that the application is
-		 * trying to cancel an ongoing alarm
-		 */
-		spin_lock_irq(&rtcdrv->lock);
-
-		rtc_status_reg = sirfsoc_rtc_readl(rtcdrv, RTC_STATUS);
-		if (rtc_status_reg & SIRFSOC_RTC_AL0E) {
-			/* clear the RTC status register's alarm bit */
-			rtc_status_reg &= ~0x07;
-			/* write 1 into SIRFSOC_RTC_AL0 to force a clear */
-			rtc_status_reg |= (SIRFSOC_RTC_AL0);
-			/* Clear the Alarm enable bit */
-			rtc_status_reg &= ~(SIRFSOC_RTC_AL0E);
-
-			sirfsoc_rtc_writel(rtcdrv, RTC_STATUS,
-					   rtc_status_reg);
-		}
-
-		spin_unlock_irq(&rtcdrv->lock);
-	}
-
-	return 0;
-}
-
-static int sirfsoc_rtc_read_time(struct device *dev,
-		struct rtc_time *tm)
-{
-	unsigned long tmp_rtc = 0;
-	struct sirfsoc_rtc_drv *rtcdrv;
-	rtcdrv = dev_get_drvdata(dev);
-	/*
-	 * This patch is taken from WinCE - Need to validate this for
-	 * correctness. To work around sirfsoc RTC counter double sync logic
-	 * fail, read several times to make sure get stable value.
-	 */
-	do {
-		tmp_rtc = sirfsoc_rtc_readl(rtcdrv, RTC_CN);
-		cpu_relax();
-	} while (tmp_rtc != sirfsoc_rtc_readl(rtcdrv, RTC_CN));
-
-	rtc_time64_to_tm(rtcdrv->overflow_rtc << (BITS_PER_LONG - RTC_SHIFT)
-			 | tmp_rtc >> RTC_SHIFT, tm);
-	return 0;
-}
-
-static int sirfsoc_rtc_set_time(struct device *dev,
-		struct rtc_time *tm)
-{
-	unsigned long rtc_time;
-	struct sirfsoc_rtc_drv *rtcdrv;
-	rtcdrv = dev_get_drvdata(dev);
-
-	rtc_time = rtc_tm_to_time64(tm);
-
-	rtcdrv->overflow_rtc = rtc_time >> (BITS_PER_LONG - RTC_SHIFT);
-
-	sirfsoc_rtc_writel(rtcdrv, RTC_SW_VALUE, rtcdrv->overflow_rtc);
-	sirfsoc_rtc_writel(rtcdrv, RTC_CN, rtc_time << RTC_SHIFT);
-
-	return 0;
-}
-
-static int sirfsoc_rtc_alarm_irq_enable(struct device *dev,
-		unsigned int enabled)
-{
-	unsigned long rtc_status_reg = 0x0;
-	struct sirfsoc_rtc_drv *rtcdrv;
-
-	rtcdrv = dev_get_drvdata(dev);
-
-	spin_lock_irq(&rtcdrv->lock);
-
-	rtc_status_reg = sirfsoc_rtc_readl(rtcdrv, RTC_STATUS);
-	if (enabled)
-		rtc_status_reg |= SIRFSOC_RTC_AL0E;
-	else
-		rtc_status_reg &= ~SIRFSOC_RTC_AL0E;
-
-	sirfsoc_rtc_writel(rtcdrv, RTC_STATUS, rtc_status_reg);
-
-	spin_unlock_irq(&rtcdrv->lock);
-
-	return 0;
-
-}
-
-static const struct rtc_class_ops sirfsoc_rtc_ops = {
-	.read_time = sirfsoc_rtc_read_time,
-	.set_time = sirfsoc_rtc_set_time,
-	.read_alarm = sirfsoc_rtc_read_alarm,
-	.set_alarm = sirfsoc_rtc_set_alarm,
-	.alarm_irq_enable = sirfsoc_rtc_alarm_irq_enable
-};
-
-static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata)
-{
-	struct sirfsoc_rtc_drv *rtcdrv = pdata;
-	unsigned long rtc_status_reg = 0x0;
-	unsigned long events = 0x0;
-
-	spin_lock(&rtcdrv->lock);
-
-	rtc_status_reg = sirfsoc_rtc_readl(rtcdrv, RTC_STATUS);
-	/* this bit will be set ONLY if an alarm was active
-	 * and it expired NOW
-	 * So this is being used as an ASSERT
-	 */
-	if (rtc_status_reg & SIRFSOC_RTC_AL0) {
-		/*
-		 * clear the RTC status register's alarm bit
-		 * mask out the lower status bits
-		 */
-		rtc_status_reg &= ~0x07;
-		/* write 1 into SIRFSOC_RTC_AL0 to ACK the alarm interrupt */
-		rtc_status_reg |= (SIRFSOC_RTC_AL0);
-		/* Clear the Alarm enable bit */
-		rtc_status_reg &= ~(SIRFSOC_RTC_AL0E);
-	}
-
-	sirfsoc_rtc_writel(rtcdrv, RTC_STATUS, rtc_status_reg);
-
-	spin_unlock(&rtcdrv->lock);
-
-	/* this should wake up any apps polling/waiting on the read
-	 * after setting the alarm
-	 */
-	events |= RTC_IRQF | RTC_AF;
-	rtc_update_irq(rtcdrv->rtc, 1, events);
-
-	return IRQ_HANDLED;
-}
-
-static const struct of_device_id sirfsoc_rtc_of_match[] = {
-	{ .compatible = "sirf,prima2-sysrtc"},
-	{},
-};
-
-static const struct regmap_config sysrtc_regmap_config = {
-	.reg_bits = 32,
-	.val_bits = 32,
-	.fast_io = true,
-};
-
-MODULE_DEVICE_TABLE(of, sirfsoc_rtc_of_match);
-
-static int sirfsoc_rtc_probe(struct platform_device *pdev)
-{
-	int err;
-	unsigned long rtc_div;
-	struct sirfsoc_rtc_drv *rtcdrv;
-	struct device_node *np = pdev->dev.of_node;
-
-	rtcdrv = devm_kzalloc(&pdev->dev,
-		sizeof(struct sirfsoc_rtc_drv), GFP_KERNEL);
-	if (rtcdrv == NULL)
-		return -ENOMEM;
-
-	spin_lock_init(&rtcdrv->lock);
-
-	err = of_property_read_u32(np, "reg", &rtcdrv->rtc_base);
-	if (err) {
-		dev_err(&pdev->dev, "unable to find base address of rtc node in dtb\n");
-		return err;
-	}
-
-	platform_set_drvdata(pdev, rtcdrv);
-
-	/* Register rtc alarm as a wakeup source */
-	device_init_wakeup(&pdev->dev, 1);
-
-	rtcdrv->regmap = devm_regmap_init_iobg(&pdev->dev,
-			&sysrtc_regmap_config);
-	if (IS_ERR(rtcdrv->regmap)) {
-		err = PTR_ERR(rtcdrv->regmap);
-		dev_err(&pdev->dev, "Failed to allocate register map: %d\n",
-			err);
-		return err;
-	}
-
-	/*
-	 * Set SYS_RTC counter in RTC_HZ HZ Units
-	 * We are using 32K RTC crystal (32768 / RTC_HZ / 2) -1
-	 * If 16HZ, therefore RTC_DIV = 1023;
-	 */
-	rtc_div = ((32768 / RTC_HZ) / 2) - 1;
-	sirfsoc_rtc_writel(rtcdrv, RTC_DIV, rtc_div);
-
-	/* 0x3 -> RTC_CLK */
-	sirfsoc_rtc_writel(rtcdrv, RTC_CLOCK_SWITCH, SIRFSOC_RTC_CLK);
-
-	/* reset SYS RTC ALARM0 */
-	sirfsoc_rtc_writel(rtcdrv, RTC_ALARM0, 0x0);
-
-	/* reset SYS RTC ALARM1 */
-	sirfsoc_rtc_writel(rtcdrv, RTC_ALARM1, 0x0);
-
-	/* Restore RTC Overflow From Register After Command Reboot */
-	rtcdrv->overflow_rtc =
-		sirfsoc_rtc_readl(rtcdrv, RTC_SW_VALUE);
-
-	rtcdrv->rtc = devm_rtc_allocate_device(&pdev->dev);
-	if (IS_ERR(rtcdrv->rtc))
-		return PTR_ERR(rtcdrv->rtc);
-
-	rtcdrv->rtc->ops = &sirfsoc_rtc_ops;
-	rtcdrv->rtc->range_max = (1ULL << 60) - 1;
-
-	rtcdrv->irq = platform_get_irq(pdev, 0);
-	err = devm_request_irq(&pdev->dev, rtcdrv->irq, sirfsoc_rtc_irq_handler,
-			       IRQF_SHARED, pdev->name, rtcdrv);
-	if (err) {
-		dev_err(&pdev->dev, "Unable to register for the SiRF SOC RTC IRQ\n");
-		return err;
-	}
-
-	return devm_rtc_register_device(rtcdrv->rtc);
-}
-
-#ifdef CONFIG_PM_SLEEP
-static int sirfsoc_rtc_suspend(struct device *dev)
-{
-	struct sirfsoc_rtc_drv *rtcdrv = dev_get_drvdata(dev);
-	rtcdrv->overflow_rtc =
-		sirfsoc_rtc_readl(rtcdrv, RTC_SW_VALUE);
-
-	rtcdrv->saved_counter =
-		sirfsoc_rtc_readl(rtcdrv, RTC_CN);
-	rtcdrv->saved_overflow_rtc = rtcdrv->overflow_rtc;
-	if (device_may_wakeup(dev) && !enable_irq_wake(rtcdrv->irq))
-		rtcdrv->irq_wake = 1;
-
-	return 0;
-}
-
-static int sirfsoc_rtc_resume(struct device *dev)
-{
-	u32 tmp;
-	struct sirfsoc_rtc_drv *rtcdrv = dev_get_drvdata(dev);
-
-	/*
-	 * if resume from snapshot and the rtc power is lost,
-	 * restroe the rtc settings
-	 */
-	if (SIRFSOC_RTC_CLK != sirfsoc_rtc_readl(rtcdrv, RTC_CLOCK_SWITCH)) {
-		u32 rtc_div;
-		/* 0x3 -> RTC_CLK */
-		sirfsoc_rtc_writel(rtcdrv, RTC_CLOCK_SWITCH, SIRFSOC_RTC_CLK);
-		/*
-		 * Set SYS_RTC counter in RTC_HZ HZ Units
-		 * We are using 32K RTC crystal (32768 / RTC_HZ / 2) -1
-		 * If 16HZ, therefore RTC_DIV = 1023;
-		 */
-		rtc_div = ((32768 / RTC_HZ) / 2) - 1;
-
-		sirfsoc_rtc_writel(rtcdrv, RTC_DIV, rtc_div);
-
-		/* reset SYS RTC ALARM0 */
-		sirfsoc_rtc_writel(rtcdrv, RTC_ALARM0, 0x0);
-
-		/* reset SYS RTC ALARM1 */
-		sirfsoc_rtc_writel(rtcdrv, RTC_ALARM1, 0x0);
-	}
-	rtcdrv->overflow_rtc = rtcdrv->saved_overflow_rtc;
-
-	/*
-	 * if current counter is small than previous,
-	 * it means overflow in sleep
-	 */
-	tmp = sirfsoc_rtc_readl(rtcdrv, RTC_CN);
-	if (tmp <= rtcdrv->saved_counter)
-		rtcdrv->overflow_rtc++;
-	/*
-	 *PWRC Value Be Changed When Suspend, Restore Overflow
-	 * In Memory To Register
-	 */
-	sirfsoc_rtc_writel(rtcdrv, RTC_SW_VALUE, rtcdrv->overflow_rtc);
-
-	if (device_may_wakeup(dev) && rtcdrv->irq_wake) {
-		disable_irq_wake(rtcdrv->irq);
-		rtcdrv->irq_wake = 0;
-	}
-
-	return 0;
-}
-#endif
-
-static SIMPLE_DEV_PM_OPS(sirfsoc_rtc_pm_ops,
-		sirfsoc_rtc_suspend, sirfsoc_rtc_resume);
-
-static struct platform_driver sirfsoc_rtc_driver = {
-	.driver = {
-		.name = "sirfsoc-rtc",
-		.pm = &sirfsoc_rtc_pm_ops,
-		.of_match_table = sirfsoc_rtc_of_match,
-	},
-	.probe = sirfsoc_rtc_probe,
-};
-module_platform_driver(sirfsoc_rtc_driver);
-
-MODULE_DESCRIPTION("SiRF SoC rtc driver");
-MODULE_AUTHOR("Xianglong Du <Xianglong.Du@csr.com>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("platform:sirfsoc-rtc");
diff --git a/include/linux/rtc/sirfsoc_rtciobrg.h b/include/linux/rtc/sirfsoc_rtciobrg.h
deleted file mode 100644
index b31f2856733d..000000000000
--- a/include/linux/rtc/sirfsoc_rtciobrg.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * RTC I/O Bridge interfaces for CSR SiRFprimaII
- * ARM access the registers of SYSRTC, GPSRTC and PWRC through this module
- *
- * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company.
- */
-#ifndef _SIRFSOC_RTC_IOBRG_H_
-#define _SIRFSOC_RTC_IOBRG_H_
-
-struct regmap_config;
-
-extern void sirfsoc_rtc_iobrg_besyncing(void);
-
-extern u32 sirfsoc_rtc_iobrg_readl(u32 addr);
-
-extern void sirfsoc_rtc_iobrg_writel(u32 val, u32 addr);
-struct regmap *devm_regmap_init_iobg(struct device *dev,
-				    const struct regmap_config *config);
-
-#endif
-- 
cgit v1.2.3


From 2f63296578cad1ae681152d5b2122a4595195f16 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 23 Jan 2021 10:06:09 -0800
Subject: iomap: pass a flags argument to iomap_dio_rw

Pass a set of flags to iomap_dio_rw instead of the boolean
wait_for_completion argument.  The IOMAP_DIO_FORCE_WAIT flag
replaces the wait_for_completion, but only needs to be passed
when the iocb isn't synchronous to start with to simplify the
callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
[djwong: rework xfs_file.c so that we can push iomap changes separately]
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/btrfs/file.c       |  7 +++----
 fs/ext4/file.c        |  5 ++---
 fs/gfs2/file.c        |  7 ++-----
 fs/iomap/direct-io.c  | 11 +++++------
 fs/xfs/xfs_file.c     |  5 ++---
 fs/zonefs/super.c     |  4 ++--
 include/linux/iomap.h | 10 ++++++++--
 7 files changed, 24 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0e41459b8de6..ddfd2e2adedf 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1949,8 +1949,8 @@ relock:
 		goto buffered;
 	}
 
-	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops,
-			     &btrfs_dio_ops, is_sync_kiocb(iocb));
+	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+			     0);
 
 	btrfs_inode_unlock(inode, ilock_flags);
 
@@ -3622,8 +3622,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
 		return 0;
 
 	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
-	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-			   is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 	return ret;
 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 349b27f0dda0..194f5d00fa32 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -74,8 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		return generic_file_read_iter(iocb, to);
 	}
 
-	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
-			   is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
 	inode_unlock_shared(inode);
 
 	file_accessed(iocb->ki_filp);
@@ -550,7 +549,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ilock_shared)
 		iomap_ops = &ext4_iomap_overwrite_ops;
 	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
-			   is_sync_kiocb(iocb) || unaligned_io || extend);
+			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
 	if (ret == -ENOTBLK)
 		ret = 0;
 
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b39b339feddc..89609c299717 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -797,9 +797,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
 	if (ret)
 		goto out_uninit;
 
-	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
-			   is_sync_kiocb(iocb));
-
+	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
 	gfs2_glock_dq(gh);
 out_uninit:
 	gfs2_holder_uninit(gh);
@@ -833,8 +831,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 	if (offset + len > i_size_read(&ip->i_inode))
 		goto out;
 
-	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
-			   is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
 	if (ret == -ENOTBLK)
 		ret = 0;
 out:
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 604103ab76f9..947343730e2c 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -420,13 +420,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 struct iomap_dio *
 __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion)
+		unsigned int dio_flags)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = file_inode(iocb->ki_filp);
 	size_t count = iov_iter_count(iter);
 	loff_t pos = iocb->ki_pos;
 	loff_t end = iocb->ki_pos + count - 1, ret = 0;
+	bool wait_for_completion =
+		is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
 	unsigned int iomap_flags = IOMAP_DIRECT;
 	struct blk_plug plug;
 	struct iomap_dio *dio;
@@ -434,9 +436,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (!count)
 		return NULL;
 
-	if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
-		return ERR_PTR(-EIO);
-
 	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
 	if (!dio)
 		return ERR_PTR(-ENOMEM);
@@ -598,11 +597,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
 ssize_t
 iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion)
+		unsigned int dio_flags)
 {
 	struct iomap_dio *dio;
 
-	dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion);
+	dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
 	if (IS_ERR_OR_NULL(dio))
 		return PTR_ERR_OR_ZERO(dio);
 	return iomap_dio_complete(dio);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5b0f93f73837..7f18dae84584 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -219,8 +219,7 @@ xfs_file_dio_aio_read(
 	} else {
 		xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	}
-	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
-			is_sync_kiocb(iocb));
+	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	return ret;
@@ -584,7 +583,7 @@ xfs_file_dio_aio_write(
 	 */
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
 			   &xfs_dio_write_ops,
-			   is_sync_kiocb(iocb) || unaligned_io);
+			   unaligned_io ? IOMAP_DIO_FORCE_WAIT : 0);
 out:
 	xfs_iunlock(ip, iolock);
 
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index bec47f2d074b..0e7ab0bc00ae 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -780,7 +780,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 		ret = zonefs_file_dio_append(iocb, from);
 	else
 		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
-				   &zonefs_write_dio_ops, sync);
+				   &zonefs_write_dio_ops, 0);
 	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
 	    (ret > 0 || ret == -EIOCBQUEUED)) {
 		if (ret > 0)
@@ -917,7 +917,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		}
 		file_accessed(iocb->ki_filp);
 		ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
-				   &zonefs_read_dio_ops, is_sync_kiocb(iocb));
+				   &zonefs_read_dio_ops, 0);
 	} else {
 		ret = generic_file_read_iter(iocb, to);
 		if (ret == -EIO)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 5bd3cac4df9c..be4e1e1e01e8 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -256,12 +256,18 @@ struct iomap_dio_ops {
 			struct bio *bio, loff_t file_offset);
 };
 
+/*
+ * Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not
+ * synchronous.
+ */
+#define IOMAP_DIO_FORCE_WAIT	(1 << 0)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion);
+		unsigned int dio_flags);
 struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-		bool wait_for_completion);
+		unsigned int dio_flags);
 ssize_t iomap_dio_complete(struct iomap_dio *dio);
 int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
 
-- 
cgit v1.2.3


From 213f627104daf8589aad8ee73fcaeb603ab0af15 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 23 Jan 2021 10:06:10 -0800
Subject: iomap: add a IOMAP_DIO_OVERWRITE_ONLY flag

Add a flag to signal that only pure overwrites are allowed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/iomap/direct-io.c  | 7 +++++++
 include/linux/iomap.h | 8 ++++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 947343730e2c..65d32364345d 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -485,6 +485,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		iomap_flags |= IOMAP_NOWAIT;
 	}
 
+	if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
+		ret = -EAGAIN;
+		if (pos >= dio->i_size || pos + count > dio->i_size)
+			goto out_free_dio;
+		iomap_flags |= IOMAP_OVERWRITE_ONLY;
+	}
+
 	ret = filemap_write_and_wait_range(mapping, pos, end);
 	if (ret)
 		goto out_free_dio;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index be4e1e1e01e8..1a86f520de56 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -122,6 +122,7 @@ struct iomap_page_ops {
 #define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
 #define IOMAP_DIRECT		(1 << 4) /* direct I/O */
 #define IOMAP_NOWAIT		(1 << 5) /* do not block */
+#define IOMAP_OVERWRITE_ONLY	(1 << 6) /* only pure overwrites allowed */
 
 struct iomap_ops {
 	/*
@@ -262,6 +263,13 @@ struct iomap_dio_ops {
  */
 #define IOMAP_DIO_FORCE_WAIT	(1 << 0)
 
+/*
+ * Do not allocate blocks or zero partial blocks, but instead fall back to
+ * the caller by returning -EAGAIN.  Used to optimize direct I/O writes that
+ * are not aligned to the file system block size.
+  */
+#define IOMAP_DIO_OVERWRITE_ONLY	(1 << 1)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags);
-- 
cgit v1.2.3


From 6f1c0ea133a6e4a193a7b285efe209664caeea43 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Fri, 22 Jan 2021 18:19:48 +0000
Subject: net: introduce a netdev feature for UDP GRO forwarding

Introduce a new netdev feature, NETIF_F_GRO_UDP_FWD, to allow user
to turn UDP GRO on and off for forwarding.
Defaults to off to not change current datapath.

Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdev_features.h | 4 +++-
 net/ethtool/common.c            | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 934de56644e7..c06d6aaba9df 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -84,6 +84,7 @@ enum {
 	NETIF_F_GRO_FRAGLIST_BIT,	/* Fraglist GRO */
 
 	NETIF_F_HW_MACSEC_BIT,		/* Offload MACsec operations */
+	NETIF_F_GRO_UDP_FWD_BIT,	/* Allow UDP GRO for forwarding */
 
 	/*
 	 * Add your fresh new feature above and remember to update
@@ -157,6 +158,7 @@ enum {
 #define NETIF_F_GRO_FRAGLIST	__NETIF_F(GRO_FRAGLIST)
 #define NETIF_F_GSO_FRAGLIST	__NETIF_F(GSO_FRAGLIST)
 #define NETIF_F_HW_MACSEC	__NETIF_F(HW_MACSEC)
+#define NETIF_F_GRO_UDP_FWD	__NETIF_F(GRO_UDP_FWD)
 
 /* Finds the next feature with the highest number of the range of start till 0.
  */
@@ -234,7 +236,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start)
 #define NETIF_F_SOFT_FEATURES	(NETIF_F_GSO | NETIF_F_GRO)
 
 /* Changeable features with no special hardware requirements that defaults to off. */
-#define NETIF_F_SOFT_FEATURES_OFF	NETIF_F_GRO_FRAGLIST
+#define NETIF_F_SOFT_FEATURES_OFF	(NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)
 
 #define NETIF_F_VLAN_FEATURES	(NETIF_F_HW_VLAN_CTAG_FILTER | \
 				 NETIF_F_HW_VLAN_CTAG_RX | \
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 24036e3055a1..181220101a6e 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -68,6 +68,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
 	[NETIF_F_HW_TLS_RX_BIT] =	 "tls-hw-rx-offload",
 	[NETIF_F_GRO_FRAGLIST_BIT] =	 "rx-gro-list",
 	[NETIF_F_HW_MACSEC_BIT] =	 "macsec-hw-offload",
+	[NETIF_F_GRO_UDP_FWD_BIT] =	 "rx-udp-gro-forwarding",
 };
 
 const char
-- 
cgit v1.2.3


From a6435940b62f81a1718bf2bd46a051379fc89b9d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:20 +0100
Subject: mount: attach mappings to mounts

In order to support per-mount idmappings vfsmounts are marked with user
namespaces. The idmapping of the user namespace will be used to map the
ids of vfs objects when they are accessed through that mount. By default
all vfsmounts are marked with the initial user namespace. The initial
user namespace is used to indicate that a mount is not idmapped. All
operations behave as before.

Based on prior discussions we want to attach the whole user namespace
and not just a dedicated idmapping struct. This allows us to reuse all
the helpers that already exist for dealing with idmappings instead of
introducing a whole new range of helpers. In addition, if we decide in
the future that we are confident enough to enable unprivileged users to
setup idmapped mounts the permission checking can take into account
whether the caller is privileged in the user namespace the mount is
currently marked with.
Later patches enforce that once a mount has been idmapped it can't be
remapped. This keeps permission checking and life-cycle management
simple. Users wanting to change the idmapped can always create a new
detached mount with a different idmapping.

Add a new mnt_userns member to vfsmount and two simple helpers to
retrieve the mnt_userns from vfsmounts and files.

The idea to attach user namespaces to vfsmounts has been floated around
in various forms at Linux Plumbers in ~2018 with the original idea
tracing back to a discussion in 2017 at a conference in St. Petersburg
between Christoph, Tycho, and myself.

Link: https://lore.kernel.org/r/20210121131959.646623-2-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/namespace.c        | 9 +++++++++
 include/linux/fs.h    | 6 ++++++
 include/linux/mount.h | 6 ++++++
 3 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 9d33909d0f9e..ecdc63ef881c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -210,6 +210,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_LIST_HEAD(&mnt->mnt_umounting);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
+		mnt->mnt.mnt_userns = &init_user_ns;
 	}
 	return mnt;
 
@@ -547,6 +548,11 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 
 static void free_vfsmnt(struct mount *mnt)
 {
+	struct user_namespace *mnt_userns;
+
+	mnt_userns = mnt_user_ns(&mnt->mnt);
+	if (mnt_userns != &init_user_ns)
+		put_user_ns(mnt_userns);
 	kfree_const(mnt->mnt_devname);
 #ifdef CONFIG_SMP
 	free_percpu(mnt->mnt_pcp);
@@ -1055,6 +1061,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
 
 	atomic_inc(&sb->s_active);
+	mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
+	if (mnt->mnt.mnt_userns != &init_user_ns)
+		mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
 	mnt->mnt.mnt_sb = sb;
 	mnt->mnt.mnt_root = dget(root);
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..fd0b80e6361d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -39,6 +39,7 @@
 #include <linux/fs_types.h>
 #include <linux/build_bug.h>
 #include <linux/stddef.h>
+#include <linux/mount.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -2231,6 +2232,7 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_DISALLOW_NOTIFY_PERM	16	/* Disable fanotify permission events */
+#define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
 #define FS_THP_SUPPORT		8192	/* Remove once all fs converted */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
@@ -2517,6 +2519,10 @@ struct filename {
 };
 static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
 
+static inline struct user_namespace *file_mnt_user_ns(struct file *file)
+{
+	return mnt_user_ns(file->f_path.mnt);
+}
 extern long vfs_truncate(const struct path *, loff_t);
 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
 		       struct file *filp);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index aaf343b38671..52de25e08319 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -72,8 +72,14 @@ struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
 	struct super_block *mnt_sb;	/* pointer to superblock */
 	int mnt_flags;
+	struct user_namespace *mnt_userns;
 } __randomize_layout;
 
+static inline struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
+{
+	return mnt->mnt_userns;
+}
+
 struct file; /* forward dec */
 struct path;
 
-- 
cgit v1.2.3


From e6c9a71451560edba343cbcbd500bea0a188f0d1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:21 +0100
Subject: fs: add id translation helpers

Add simple helpers to make it easy to map kuids into and from idmapped
mounts. We provide simple wrappers that filesystems can use to e.g.
initialize inodes similar to i_{uid,gid}_read() and i_{uid,gid}_write().
Accessing an inode through an idmapped mount maps the i_uid and i_gid of
the inode to the mount's user namespace. If the fsids are used to
initialize inodes they are unmapped according to the mount's user
namespace. Passing the initial user namespace to these helpers makes
them a nop and so any non-idmapped paths will not be impacted.

Link: https://lore.kernel.org/r/20210121131959.646623-3-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd0b80e6361d..3165998e2294 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -40,6 +40,7 @@
 #include <linux/build_bug.h>
 #include <linux/stddef.h>
 #include <linux/mount.h>
+#include <linux/cred.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1573,6 +1574,52 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
 	inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
 }
 
+static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns,
+				   kuid_t kuid)
+{
+	return make_kuid(mnt_userns, __kuid_val(kuid));
+}
+
+static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns,
+				   kgid_t kgid)
+{
+	return make_kgid(mnt_userns, __kgid_val(kgid));
+}
+
+static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
+				    const struct inode *inode)
+{
+	return kuid_into_mnt(mnt_userns, inode->i_uid);
+}
+
+static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
+				    const struct inode *inode)
+{
+	return kgid_into_mnt(mnt_userns, inode->i_gid);
+}
+
+static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns,
+				   kuid_t kuid)
+{
+	return KUIDT_INIT(from_kuid(mnt_userns, kuid));
+}
+
+static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
+				   kgid_t kgid)
+{
+	return KGIDT_INIT(from_kgid(mnt_userns, kgid));
+}
+
+static inline kuid_t fsuid_into_mnt(struct user_namespace *mnt_userns)
+{
+	return kuid_from_mnt(mnt_userns, current_fsuid());
+}
+
+static inline kgid_t fsgid_into_mnt(struct user_namespace *mnt_userns)
+{
+	return kgid_from_mnt(mnt_userns, current_fsgid());
+}
+
 extern struct timespec64 current_time(struct inode *inode);
 
 /*
-- 
cgit v1.2.3


From 02f92b3868a1b34ab98464e76b0e4e060474ba10 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:22 +0100
Subject: fs: add file and path permissions helpers

Add two simple helpers to check permissions on a file and path
respectively and convert over some callers. It simplifies quite a few
codepaths and also reduces the churn in later patches quite a bit.
Christoph also correctly points out that this makes codepaths (e.g.
ioctls) way easier to follow that would otherwise have to do more
complex argument passing than necessary.

Link: https://lore.kernel.org/r/20210121131959.646623-4-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/init.c                          | 6 +++---
 fs/notify/fanotify/fanotify_user.c | 2 +-
 fs/notify/inotify/inotify_user.c   | 2 +-
 fs/open.c                          | 6 +++---
 fs/udf/file.c                      | 2 +-
 fs/verity/enable.c                 | 2 +-
 include/linux/fs.h                 | 8 ++++++++
 kernel/bpf/inode.c                 | 2 +-
 kernel/sys.c                       | 2 +-
 mm/madvise.c                       | 2 +-
 mm/memcontrol.c                    | 2 +-
 mm/mincore.c                       | 2 +-
 net/unix/af_unix.c                 | 2 +-
 13 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/init.c b/fs/init.c
index e9c320a48cf1..02723bea8499 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -49,7 +49,7 @@ int __init init_chdir(const char *filename)
 	error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
 	if (error)
 		return error;
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 	if (!error)
 		set_fs_pwd(current->fs, &path);
 	path_put(&path);
@@ -64,7 +64,7 @@ int __init init_chroot(const char *filename)
 	error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
 	if (error)
 		return error;
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 	error = -EPERM;
@@ -118,7 +118,7 @@ int __init init_eaccess(const char *filename)
 	error = kern_path(filename, LOOKUP_FOLLOW, &path);
 	if (error)
 		return error;
-	error = inode_permission(d_inode(path.dentry), MAY_ACCESS);
+	error = path_permission(&path, MAY_ACCESS);
 	path_put(&path);
 	return error;
 }
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index dcab112e1f00..64cfc1a3015d 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -702,7 +702,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
 	}
 
 	/* you can only watch an inode if you have read permissions on it */
-	ret = inode_permission(path->dentry->d_inode, MAY_READ);
+	ret = path_permission(path, MAY_READ);
 	if (ret) {
 		path_put(path);
 		goto out;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 59c177011a0f..e1155d32ef6f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -352,7 +352,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path,
 	if (error)
 		return error;
 	/* you can only watch an inode if you have read permissions on it */
-	error = inode_permission(path->dentry->d_inode, MAY_READ);
+	error = path_permission(path, MAY_READ);
 	if (error) {
 		path_put(path);
 		return error;
diff --git a/fs/open.c b/fs/open.c
index 1e06e443a565..cd1efd254cad 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -492,7 +492,7 @@ retry:
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
@@ -521,7 +521,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 	if (!d_can_lookup(f.file->f_path.dentry))
 		goto out_putf;
 
-	error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
+	error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
 	if (!error)
 		set_fs_pwd(current->fs, &f.file->f_path);
 out_putf:
@@ -540,7 +540,7 @@ retry:
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+	error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index ad8eefad27d7..3671a40ed3c3 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -183,7 +183,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	long old_block, new_block;
 	int result;
 
-	if (inode_permission(inode, MAY_READ) != 0) {
+	if (file_permission(filp, MAY_READ) != 0) {
 		udf_debug("no permission to access inode %lu\n", inode->i_ino);
 		return -EPERM;
 	}
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index f7e997a01ad0..77e159a0346b 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -369,7 +369,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
 	 * has verity enabled, and to stabilize the data being hashed.
 	 */
 
-	err = inode_permission(inode, MAY_WRITE);
+	err = file_permission(filp, MAY_WRITE);
 	if (err)
 		return err;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3165998e2294..bcd17097d441 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2812,6 +2812,14 @@ static inline int bmap(struct inode *inode,  sector_t *block)
 extern int notify_change(struct dentry *, struct iattr *, struct inode **);
 extern int inode_permission(struct inode *, int);
 extern int generic_permission(struct inode *, int);
+static inline int file_permission(struct file *file, int mask)
+{
+	return inode_permission(file_inode(file), mask);
+}
+static inline int path_permission(const struct path *path, int mask)
+{
+	return inode_permission(d_inode(path->dentry), mask);
+}
 extern int __check_sticky(struct inode *dir, struct inode *inode);
 
 static inline bool execute_ok(struct inode *inode)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index dd4b7fd60ee7..8962f139521e 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -507,7 +507,7 @@ static void *bpf_obj_do_get(const char __user *pathname,
 		return ERR_PTR(ret);
 
 	inode = d_backing_inode(path.dentry);
-	ret = inode_permission(inode, ACC_MODE(flags));
+	ret = path_permission(&path, ACC_MODE(flags));
 	if (ret)
 		goto out;
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 51f00fe20e4d..138fb253b344 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1848,7 +1848,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 	if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
 		goto exit;
 
-	err = inode_permission(inode, MAY_EXEC);
+	err = file_permission(exe.file, MAY_EXEC);
 	if (err)
 		goto exit;
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 6a660858784b..175c5582d8a9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -540,7 +540,7 @@ static inline bool can_do_pageout(struct vm_area_struct *vma)
 	 * opens a side channel.
 	 */
 	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
-		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
 static long madvise_pageout(struct vm_area_struct *vma,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 605f671203ef..cf9076f58582 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4899,7 +4899,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 
 	/* the process need read permission on control file */
 	/* AV: shouldn't we check that it's been opened for read instead? */
-	ret = inode_permission(file_inode(cfile.file), MAY_READ);
+	ret = file_permission(cfile.file, MAY_READ);
 	if (ret < 0)
 		goto out_put_cfile;
 
diff --git a/mm/mincore.c b/mm/mincore.c
index 02db1a834021..7bdb4673f776 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -167,7 +167,7 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
 	 * mappings, which opens a side channel.
 	 */
 	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
-		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
 static const struct mm_walk_ops mincore_walk_ops = {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 41c3303c3357..18453d15dddf 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -936,7 +936,7 @@ static struct sock *unix_find_other(struct net *net,
 		if (err)
 			goto fail;
 		inode = d_backing_inode(path.dentry);
-		err = inode_permission(inode, MAY_WRITE);
+		err = path_permission(&path, MAY_WRITE);
 		if (err)
 			goto put_fail;
 
-- 
cgit v1.2.3


From 0558c1bf5a0811bf5e3753eed911a15b9bd08271 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:23 +0100
Subject: capability: handle idmapped mounts

In order to determine whether a caller holds privilege over a given
inode the capability framework exposes the two helpers
privileged_wrt_inode_uidgid() and capable_wrt_inode_uidgid(). The former
verifies that the inode has a mapping in the caller's user namespace and
the latter additionally verifies that the caller has the requested
capability in their current user namespace.
If the inode is accessed through an idmapped mount map it into the
mount's user namespace. Afterwards the checks are identical to
non-idmapped inodes. If the initial user namespace is passed all
operations are a nop so non-idmapped mounts will not see a change in
behavior.

Link: https://lore.kernel.org/r/20210121131959.646623-5-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/attr.c                  |  8 ++++----
 fs/exec.c                  |  3 ++-
 fs/inode.c                 |  3 ++-
 fs/namei.c                 | 13 ++++++++-----
 fs/overlayfs/super.c       |  2 +-
 fs/posix_acl.c             |  2 +-
 fs/xfs/xfs_ioctl.c         |  2 +-
 include/linux/capability.h |  7 +++++--
 kernel/capability.c        | 14 +++++++++-----
 security/commoncap.c       |  5 +++--
 10 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index b4bbdbd4c8ca..d270f640a192 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -23,7 +23,7 @@ static bool chown_ok(const struct inode *inode, kuid_t uid)
 	if (uid_eq(current_fsuid(), inode->i_uid) &&
 	    uid_eq(uid, inode->i_uid))
 		return true;
-	if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+	if (capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_CHOWN))
 		return true;
 	if (uid_eq(inode->i_uid, INVALID_UID) &&
 	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
@@ -36,7 +36,7 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid)
 	if (uid_eq(current_fsuid(), inode->i_uid) &&
 	    (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
 		return true;
-	if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
+	if (capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_CHOWN))
 		return true;
 	if (gid_eq(inode->i_gid, INVALID_GID) &&
 	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
@@ -92,7 +92,7 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 		/* Also check the setgid bit! */
 		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
 				inode->i_gid) &&
-		    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+		    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
 			attr->ia_mode &= ~S_ISGID;
 	}
 
@@ -193,7 +193,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
 		umode_t mode = attr->ia_mode;
 
 		if (!in_group_p(inode->i_gid) &&
-		    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+		    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
 			mode &= ~S_ISGID;
 		inode->i_mode = mode;
 	}
diff --git a/fs/exec.c b/fs/exec.c
index 5d4d52039105..89d4780ff48f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1411,7 +1411,8 @@ void would_dump(struct linux_binprm *bprm, struct file *file)
 		/* Ensure mm->user_ns contains the executable */
 		user_ns = old = bprm->mm->user_ns;
 		while ((user_ns != &init_user_ns) &&
-		       !privileged_wrt_inode_uidgid(user_ns, inode))
+		       !privileged_wrt_inode_uidgid(user_ns, &init_user_ns,
+						    inode))
 			user_ns = user_ns->parent;
 
 		if (old != user_ns) {
diff --git a/fs/inode.c b/fs/inode.c
index 6442d97d9a4a..cd40cbf87ce4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2146,7 +2146,8 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
 			mode |= S_ISGID;
 		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
 			 !in_group_p(inode->i_gid) &&
-			 !capable_wrt_inode_uidgid(dir, CAP_FSETID))
+			 !capable_wrt_inode_uidgid(&init_user_ns, dir,
+						   CAP_FSETID))
 			mode &= ~S_ISGID;
 	} else
 		inode->i_gid = current_fsgid();
diff --git a/fs/namei.c b/fs/namei.c
index 78443a85480a..fd4724bce4f5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -357,10 +357,11 @@ int generic_permission(struct inode *inode, int mask)
 	if (S_ISDIR(inode->i_mode)) {
 		/* DACs are overridable for directories */
 		if (!(mask & MAY_WRITE))
-			if (capable_wrt_inode_uidgid(inode,
+			if (capable_wrt_inode_uidgid(&init_user_ns, inode,
 						     CAP_DAC_READ_SEARCH))
 				return 0;
-		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+					     CAP_DAC_OVERRIDE))
 			return 0;
 		return -EACCES;
 	}
@@ -370,7 +371,8 @@ int generic_permission(struct inode *inode, int mask)
 	 */
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 	if (mask == MAY_READ)
-		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
+		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+					     CAP_DAC_READ_SEARCH))
 			return 0;
 	/*
 	 * Read/write DACs are always overridable.
@@ -378,7 +380,8 @@ int generic_permission(struct inode *inode, int mask)
 	 * at least one exec bit set.
 	 */
 	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
-		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+					     CAP_DAC_OVERRIDE))
 			return 0;
 
 	return -EACCES;
@@ -2659,7 +2662,7 @@ int __check_sticky(struct inode *dir, struct inode *inode)
 		return 0;
 	if (uid_eq(dir->i_uid, fsuid))
 		return 0;
-	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
+	return !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FOWNER);
 }
 EXPORT_SYMBOL(__check_sticky);
 
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 2bd570cbe8a4..88d877787770 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1017,7 +1017,7 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
 	if (unlikely(inode->i_mode & S_ISGID) &&
 	    handler->flags == ACL_TYPE_ACCESS &&
 	    !in_group_p(inode->i_gid) &&
-	    !capable_wrt_inode_uidgid(inode, CAP_FSETID)) {
+	    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
 		struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
 
 		err = ovl_setattr(dentry, &iattr);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 95882b3f5f62..4ca6d53c6f0a 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -656,7 +656,7 @@ int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
 	if (error == 0)
 		*acl = NULL;
 	if (!in_group_p(inode->i_gid) &&
-	    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+	    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
 		mode &= ~S_ISGID;
 	*mode_p = mode;
 	return 0;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3fbd98f61ea5..97bd29fc8c43 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1502,7 +1502,7 @@ xfs_ioctl_setattr(
 	 */
 
 	if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
-	    !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
+	    !capable_wrt_inode_uidgid(&init_user_ns, VFS_I(ip), CAP_FSETID))
 		VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
 
 	/* Change the ownerships and register project quota modifications */
diff --git a/include/linux/capability.h b/include/linux/capability.h
index b2f698915c0f..62bfa3ed84d7 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -247,8 +247,11 @@ static inline bool ns_capable_setid(struct user_namespace *ns, int cap)
 	return true;
 }
 #endif /* CONFIG_MULTIUSER */
-extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
-extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
+				 struct user_namespace *mnt_userns,
+				 const struct inode *inode);
+bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+			      const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
 extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
 static inline bool perfmon_capable(void)
diff --git a/kernel/capability.c b/kernel/capability.c
index de7eac903a2a..46a361dde042 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -484,10 +484,12 @@ EXPORT_SYMBOL(file_ns_capable);
  *
  * Return true if the inode uid and gid are within the namespace.
  */
-bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
+				 struct user_namespace *mnt_userns,
+				 const struct inode *inode)
 {
-	return kuid_has_mapping(ns, inode->i_uid) &&
-		kgid_has_mapping(ns, inode->i_gid);
+	return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
+	       kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
 }
 
 /**
@@ -499,11 +501,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *
  * its own user namespace and that the given inode's uid and gid are
  * mapped into the current user namespace.
  */
-bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
+bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
+			      const struct inode *inode, int cap)
 {
 	struct user_namespace *ns = current_user_ns();
 
-	return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
+	return ns_capable(ns, cap) &&
+	       privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
 }
 EXPORT_SYMBOL(capable_wrt_inode_uidgid);
 
diff --git a/security/commoncap.c b/security/commoncap.c
index bacc1111d871..88ee345f7bfa 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -489,7 +489,7 @@ int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size)
 		return -EINVAL;
 	if (!validheader(size, cap))
 		return -EINVAL;
-	if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+	if (!capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_SETFCAP))
 		return -EPERM;
 	if (size == XATTR_CAPS_SZ_2)
 		if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
@@ -956,7 +956,8 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
 		struct inode *inode = d_backing_inode(dentry);
 		if (!inode)
 			return -EINVAL;
-		if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+		if (!capable_wrt_inode_uidgid(&init_user_ns, inode,
+					      CAP_SETFCAP))
 			return -EPERM;
 		return 0;
 	}
-- 
cgit v1.2.3


From 47291baa8ddfdae10663624ff0a15ab165952708 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:24 +0100
Subject: namei: make permission helpers idmapped mount aware

The two helpers inode_permission() and generic_permission() are used by
the vfs to perform basic permission checking by verifying that the
caller is privileged over an inode. In order to handle idmapped mounts
we extend the two helpers with an additional user namespace argument.
On idmapped mounts the two helpers will make sure to map the inode
according to the mount's user namespace and then peform identical
permission checks to inode_permission() and generic_permission(). If the
initial user namespace is passed nothing changes so non-idmapped mounts
will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-6-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/attr.c                 |   3 +-
 fs/btrfs/inode.c          |   2 +-
 fs/btrfs/ioctl.c          |  12 +++--
 fs/ceph/inode.c           |   2 +-
 fs/cifs/cifsfs.c          |   2 +-
 fs/configfs/symlink.c     |   3 +-
 fs/ecryptfs/inode.c       |   3 +-
 fs/exec.c                 |   2 +-
 fs/fuse/dir.c             |   5 +-
 fs/gfs2/inode.c           |   2 +-
 fs/hostfs/hostfs_kern.c   |   2 +-
 fs/kernfs/inode.c         |   2 +-
 fs/libfs.c                |   7 ++-
 fs/namei.c                | 121 ++++++++++++++++++++++++++++++++--------------
 fs/nfs/dir.c              |   2 +-
 fs/nfsd/nfsfh.c           |   3 +-
 fs/nfsd/vfs.c             |   5 +-
 fs/nilfs2/inode.c         |   2 +-
 fs/ocfs2/file.c           |   2 +-
 fs/ocfs2/refcounttree.c   |   4 +-
 fs/open.c                 |   4 +-
 fs/orangefs/inode.c       |   2 +-
 fs/overlayfs/file.c       |   2 +-
 fs/overlayfs/inode.c      |   4 +-
 fs/overlayfs/util.c       |   2 +-
 fs/posix_acl.c            |  17 +++++--
 fs/proc/base.c            |   4 +-
 fs/proc/fd.c              |   2 +-
 fs/reiserfs/xattr.c       |   2 +-
 fs/remap_range.c          |   2 +-
 fs/xattr.c                |   2 +-
 include/linux/fs.h        |  10 ++--
 include/linux/posix_acl.h |   7 ++-
 ipc/mqueue.c              |   2 +-
 kernel/bpf/inode.c        |   2 +-
 kernel/cgroup/cgroup.c    |   2 +-
 36 files changed, 164 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index d270f640a192..c9e29e589cec 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -244,7 +244,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 			return -EPERM;
 
 		if (!inode_owner_or_capable(inode)) {
-			error = inode_permission(inode, MAY_WRITE);
+			error = inode_permission(&init_user_ns, inode,
+						 MAY_WRITE);
 			if (error)
 				return error;
 		}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a8e0a6b038d3..512ee2650bbb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9889,7 +9889,7 @@ static int btrfs_permission(struct inode *inode, int mask)
 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
 			return -EACCES;
 	}
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dde49a791f3e..8ced6dfefee4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -922,7 +922,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 	BUG_ON(d_inode(victim->d_parent) != dir);
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
-	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 	if (IS_APPEND(dir))
@@ -951,7 +951,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
-	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 }
 
 /*
@@ -2538,7 +2538,8 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
 				ret = PTR_ERR(temp_inode);
 				goto out_put;
 			}
-			ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
+			ret = inode_permission(&init_user_ns, temp_inode,
+					       MAY_READ | MAY_EXEC);
 			iput(temp_inode);
 			if (ret) {
 				ret = -EACCES;
@@ -3068,7 +3069,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		if (root == dest)
 			goto out_dput;
 
-		err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+		err = inode_permission(&init_user_ns, inode,
+				       MAY_WRITE | MAY_EXEC);
 		if (err)
 			goto out_dput;
 	}
@@ -3139,7 +3141,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 		 * running and allows defrag on files open in read-only mode.
 		 */
 		if (!capable(CAP_SYS_ADMIN) &&
-		    inode_permission(inode, MAY_WRITE)) {
+		    inode_permission(&init_user_ns, inode, MAY_WRITE)) {
 			ret = -EPERM;
 			goto out;
 		}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index adc8fc3c5d85..e8a15ee09bc1 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2331,7 +2331,7 @@ int ceph_permission(struct inode *inode, int mask)
 	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
 
 	if (!err)
-		err = generic_permission(inode, mask);
+		err = generic_permission(&init_user_ns, inode, mask);
 	return err;
 }
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ce0d0037fd0a..ce14e6f8adb6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -320,7 +320,7 @@ static int cifs_permission(struct inode *inode, int mask)
 		on the client (above and beyond ACL on servers) for
 		servers which do not support setting and viewing mode bits,
 		so allowing client to check permissions is useful */
-		return generic_permission(inode, mask);
+		return generic_permission(&init_user_ns, inode, mask);
 }
 
 static struct kmem_cache *cifs_inode_cachep;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index cb61467478ca..8ca36394fa30 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -197,7 +197,8 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
 	if (dentry->d_inode || d_unhashed(dentry))
 		ret = -EEXIST;
 	else
-		ret = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+		ret = inode_permission(&init_user_ns, dir,
+				       MAY_WRITE | MAY_EXEC);
 	if (!ret)
 		ret = type->ct_item_ops->allow_link(parent_item, target_item);
 	if (!ret) {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e23752d9a79f..0b346baf110d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -864,7 +864,8 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 static int
 ecryptfs_permission(struct inode *inode, int mask)
 {
-	return inode_permission(ecryptfs_inode_to_lower(inode), mask);
+	return inode_permission(&init_user_ns,
+				ecryptfs_inode_to_lower(inode), mask);
 }
 
 /**
diff --git a/fs/exec.c b/fs/exec.c
index 89d4780ff48f..a8ec371cd3cd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1404,7 +1404,7 @@ EXPORT_SYMBOL(begin_new_exec);
 void would_dump(struct linux_binprm *bprm, struct file *file)
 {
 	struct inode *inode = file_inode(file);
-	if (inode_permission(inode, MAY_READ) < 0) {
+	if (inode_permission(&init_user_ns, inode, MAY_READ) < 0) {
 		struct user_namespace *old, *user_ns;
 		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 78f9f209078c..7497009a5a45 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1280,7 +1280,7 @@ static int fuse_permission(struct inode *inode, int mask)
 	}
 
 	if (fc->default_permissions) {
-		err = generic_permission(inode, mask);
+		err = generic_permission(&init_user_ns, inode, mask);
 
 		/* If permission is denied, try to refresh file
 		   attributes.  This is also needed, because the root
@@ -1288,7 +1288,8 @@ static int fuse_permission(struct inode *inode, int mask)
 		if (err == -EACCES && !refreshed) {
 			err = fuse_perm_getattr(inode, mask);
 			if (!err)
-				err = generic_permission(inode, mask);
+				err = generic_permission(&init_user_ns,
+							 inode, mask);
 		}
 
 		/* Note: the opposite of the above test does not
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c1b77e8d6b1c..5b2ff0c74b67 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1852,7 +1852,7 @@ int gfs2_permission(struct inode *inode, int mask)
 	if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
 		error = -EPERM;
 	else
-		error = generic_permission(inode, mask);
+		error = generic_permission(&init_user_ns, inode, mask);
 	if (gfs2_holder_initialized(&i_gh))
 		gfs2_glock_dq_uninit(&i_gh);
 
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index aea35459d390..b841a05a2b8c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -779,7 +779,7 @@ static int hostfs_permission(struct inode *ino, int desired)
 		err = access_file(name, r, w, x);
 	__putname(name);
 	if (!err)
-		err = generic_permission(ino, desired);
+		err = generic_permission(&init_user_ns, ino, desired);
 	return err;
 }
 
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index fc2469a20fed..ff5598cc1de0 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -285,7 +285,7 @@ int kernfs_iop_permission(struct inode *inode, int mask)
 	kernfs_refresh_inode(kn, inode);
 	mutex_unlock(&kernfs_mutex);
 
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
diff --git a/fs/libfs.c b/fs/libfs.c
index d1c3bade9f30..f8b3c02b4f0f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1318,9 +1318,14 @@ static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t siz
 	return -EOPNOTSUPP;
 }
 
+static int empty_dir_permission(struct inode *inode, int mask)
+{
+	return generic_permission(&init_user_ns, inode, mask);
+}
+
 static const struct inode_operations empty_dir_inode_operations = {
 	.lookup		= empty_dir_lookup,
-	.permission	= generic_permission,
+	.permission	= empty_dir_permission,
 	.setattr	= empty_dir_setattr,
 	.getattr	= empty_dir_getattr,
 	.listxattr	= empty_dir_listxattr,
diff --git a/fs/namei.c b/fs/namei.c
index fd4724bce4f5..d78d74f5f5af 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -259,7 +259,24 @@ void putname(struct filename *name)
 		__putname(name);
 }
 
-static int check_acl(struct inode *inode, int mask)
+/**
+ * check_acl - perform ACL permission checking
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @inode:	inode to check permissions on
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+ *
+ * This function performs the ACL permission checking. Since this function
+ * retrieve POSIX acls it needs to know whether it is called from a blocking or
+ * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+static int check_acl(struct user_namespace *mnt_userns,
+		     struct inode *inode, int mask)
 {
 #ifdef CONFIG_FS_POSIX_ACL
 	struct posix_acl *acl;
@@ -271,14 +288,14 @@ static int check_acl(struct inode *inode, int mask)
 		/* no ->get_acl() calls in RCU mode... */
 		if (is_uncached_acl(acl))
 			return -ECHILD;
-	        return posix_acl_permission(inode, acl, mask);
+	        return posix_acl_permission(mnt_userns, inode, acl, mask);
 	}
 
 	acl = get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl) {
-	        int error = posix_acl_permission(inode, acl, mask);
+	        int error = posix_acl_permission(mnt_userns, inode, acl, mask);
 	        posix_acl_release(acl);
 	        return error;
 	}
@@ -287,18 +304,31 @@ static int check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-/*
- * This does the basic UNIX permission checking.
+/**
+ * acl_permission_check - perform basic UNIX permission checking
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @inode:	inode to check permissions on
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+ *
+ * This function performs the basic UNIX permission checking. Since this
+ * function may retrieve POSIX acls it needs to know whether it is called from a
+ * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
  *
- * Note that the POSIX ACL check cares about the MAY_NOT_BLOCK bit,
- * for RCU walking.
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
  */
-static int acl_permission_check(struct inode *inode, int mask)
+static int acl_permission_check(struct user_namespace *mnt_userns,
+				struct inode *inode, int mask)
 {
 	unsigned int mode = inode->i_mode;
+	kuid_t i_uid;
 
 	/* Are we the owner? If so, ACL's don't matter */
-	if (likely(uid_eq(current_fsuid(), inode->i_uid))) {
+	i_uid = i_uid_into_mnt(mnt_userns, inode);
+	if (likely(uid_eq(current_fsuid(), i_uid))) {
 		mask &= 7;
 		mode >>= 6;
 		return (mask & ~mode) ? -EACCES : 0;
@@ -306,7 +336,7 @@ static int acl_permission_check(struct inode *inode, int mask)
 
 	/* Do we have ACL's? */
 	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
-		int error = check_acl(inode, mask);
+		int error = check_acl(mnt_userns, inode, mask);
 		if (error != -EAGAIN)
 			return error;
 	}
@@ -320,7 +350,8 @@ static int acl_permission_check(struct inode *inode, int mask)
 	 * about? Need to check group ownership if so.
 	 */
 	if (mask & (mode ^ (mode >> 3))) {
-		if (in_group_p(inode->i_gid))
+		kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
+		if (in_group_p(kgid))
 			mode >>= 3;
 	}
 
@@ -330,6 +361,7 @@ static int acl_permission_check(struct inode *inode, int mask)
 
 /**
  * generic_permission -  check for access rights on a Posix-like filesystem
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode:	inode to check access rights for
  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
  *		%MAY_NOT_BLOCK ...)
@@ -342,25 +374,32 @@ static int acl_permission_check(struct inode *inode, int mask)
  * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
  * request cannot be satisfied (eg. requires blocking or too much complexity).
  * It would then be called again in ref-walk mode.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
  */
-int generic_permission(struct inode *inode, int mask)
+int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		       int mask)
 {
 	int ret;
 
 	/*
 	 * Do the basic permission checks.
 	 */
-	ret = acl_permission_check(inode, mask);
+	ret = acl_permission_check(mnt_userns, inode, mask);
 	if (ret != -EACCES)
 		return ret;
 
 	if (S_ISDIR(inode->i_mode)) {
 		/* DACs are overridable for directories */
 		if (!(mask & MAY_WRITE))
-			if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+			if (capable_wrt_inode_uidgid(mnt_userns, inode,
 						     CAP_DAC_READ_SEARCH))
 				return 0;
-		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+		if (capable_wrt_inode_uidgid(mnt_userns, inode,
 					     CAP_DAC_OVERRIDE))
 			return 0;
 		return -EACCES;
@@ -371,7 +410,7 @@ int generic_permission(struct inode *inode, int mask)
 	 */
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 	if (mask == MAY_READ)
-		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+		if (capable_wrt_inode_uidgid(mnt_userns, inode,
 					     CAP_DAC_READ_SEARCH))
 			return 0;
 	/*
@@ -380,7 +419,7 @@ int generic_permission(struct inode *inode, int mask)
 	 * at least one exec bit set.
 	 */
 	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
-		if (capable_wrt_inode_uidgid(&init_user_ns, inode,
+		if (capable_wrt_inode_uidgid(mnt_userns, inode,
 					     CAP_DAC_OVERRIDE))
 			return 0;
 
@@ -388,13 +427,19 @@ int generic_permission(struct inode *inode, int mask)
 }
 EXPORT_SYMBOL(generic_permission);
 
-/*
+/**
+ * do_inode_permission - UNIX permission checking
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @inode:	inode to check permissions on
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+ *
  * We _really_ want to just do "generic_permission()" without
  * even looking at the inode->i_op values. So we keep a cache
  * flag in inode->i_opflags, that says "this has not special
  * permission function, use the fast case".
  */
-static inline int do_inode_permission(struct inode *inode, int mask)
+static inline int do_inode_permission(struct user_namespace *mnt_userns,
+				      struct inode *inode, int mask)
 {
 	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
 		if (likely(inode->i_op->permission))
@@ -405,7 +450,7 @@ static inline int do_inode_permission(struct inode *inode, int mask)
 		inode->i_opflags |= IOP_FASTPERM;
 		spin_unlock(&inode->i_lock);
 	}
-	return generic_permission(inode, mask);
+	return generic_permission(mnt_userns, inode, mask);
 }
 
 /**
@@ -430,8 +475,9 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 
 /**
  * inode_permission - Check for access rights to a given inode
- * @inode: Inode to check permission on
- * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mnt_userns:	User namespace of the mount the inode was found from
+ * @inode:	Inode to check permission on
+ * @mask:	Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  *
  * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
  * this, letting us set arbitrary permissions for filesystem access without
@@ -439,7 +485,8 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
  *
  * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
  */
-int inode_permission(struct inode *inode, int mask)
+int inode_permission(struct user_namespace *mnt_userns,
+		     struct inode *inode, int mask)
 {
 	int retval;
 
@@ -463,7 +510,7 @@ int inode_permission(struct inode *inode, int mask)
 			return -EACCES;
 	}
 
-	retval = do_inode_permission(inode, mask);
+	retval = do_inode_permission(mnt_userns, inode, mask);
 	if (retval)
 		return retval;
 
@@ -1009,7 +1056,7 @@ static bool safe_hardlink_source(struct inode *inode)
 		return false;
 
 	/* Hardlinking to unreadable or unwritable sources is dangerous. */
-	if (inode_permission(inode, MAY_READ | MAY_WRITE))
+	if (inode_permission(&init_user_ns, inode, MAY_READ | MAY_WRITE))
 		return false;
 
 	return true;
@@ -1569,13 +1616,14 @@ static struct dentry *lookup_slow(const struct qstr *name,
 static inline int may_lookup(struct nameidata *nd)
 {
 	if (nd->flags & LOOKUP_RCU) {
-		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+		int err = inode_permission(&init_user_ns, nd->inode,
+					   MAY_EXEC | MAY_NOT_BLOCK);
 		if (err != -ECHILD)
 			return err;
 		if (unlazy_walk(nd))
 			return -ECHILD;
 	}
-	return inode_permission(nd->inode, MAY_EXEC);
+	return inode_permission(&init_user_ns, nd->inode, MAY_EXEC);
 }
 
 static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
@@ -2509,7 +2557,7 @@ static int lookup_one_len_common(const char *name, struct dentry *base,
 			return err;
 	}
 
-	return inode_permission(base->d_inode, MAY_EXEC);
+	return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
 }
 
 /**
@@ -2703,7 +2751,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
-	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 	if (IS_APPEND(dir))
@@ -2747,7 +2795,7 @@ static inline int may_create(struct inode *dir, struct dentry *child)
 	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
 	    !kgid_has_mapping(s_user_ns, current_fsgid()))
 		return -EOVERFLOW;
-	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 }
 
 /*
@@ -2877,7 +2925,7 @@ static int may_open(const struct path *path, int acc_mode, int flag)
 		break;
 	}
 
-	error = inode_permission(inode, MAY_OPEN | acc_mode);
+	error = inode_permission(&init_user_ns, inode, MAY_OPEN | acc_mode);
 	if (error)
 		return error;
 
@@ -2939,7 +2987,8 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m
 	    !kgid_has_mapping(s_user_ns, current_fsgid()))
 		return -EOVERFLOW;
 
-	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(&init_user_ns, dir->dentry->d_inode,
+				 MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 
@@ -3276,7 +3325,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
 	int error;
 
 	/* we want directory to be writable */
-	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		goto out_err;
 	error = -EOPNOTSUPP;
@@ -4267,12 +4316,14 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 */
 	if (new_dir != old_dir) {
 		if (is_dir) {
-			error = inode_permission(source, MAY_WRITE);
+			error = inode_permission(&init_user_ns, source,
+						 MAY_WRITE);
 			if (error)
 				return error;
 		}
 		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
-			error = inode_permission(target, MAY_WRITE);
+			error = inode_permission(&init_user_ns, target,
+						 MAY_WRITE);
 			if (error)
 				return error;
 		}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ef827ae193d2..727e01a84503 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2987,7 +2987,7 @@ out_notsup:
 
 	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	if (res == 0)
-		res = generic_permission(inode, mask);
+		res = generic_permission(&init_user_ns, inode, mask);
 	goto out;
 }
 EXPORT_SYMBOL_GPL(nfs_permission);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 66f2ef67792a..8d90796e236a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -40,7 +40,8 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
 		/* make sure parents give x permission to user */
 		int err;
 		parent = dget_parent(tdentry);
-		err = inode_permission(d_inode(parent), MAY_EXEC);
+		err = inode_permission(&init_user_ns,
+				       d_inode(parent), MAY_EXEC);
 		if (err < 0) {
 			dput(parent);
 			break;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 04937e51de56..0edf11258aaa 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2391,13 +2391,14 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 		return 0;
 
 	/* This assumes  NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
-	err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC));
+	err = inode_permission(&init_user_ns, inode,
+			       acc & (MAY_READ | MAY_WRITE | MAY_EXEC));
 
 	/* Allow read access to binaries even when mode 111 */
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
 	     (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
 	      acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
-		err = inode_permission(inode, MAY_EXEC);
+		err = inode_permission(&init_user_ns, inode, MAY_EXEC);
 
 	return err? nfserrno(err) : 0;
 }
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 745d371d6fea..b6517220cad5 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -851,7 +851,7 @@ int nilfs_permission(struct inode *inode, int mask)
 	    root->cno != NILFS_CPTREE_CURRENT_CNO)
 		return -EROFS; /* snapshot is not writable */
 
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 85979e2214b3..0c75619adf54 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1355,7 +1355,7 @@ int ocfs2_permission(struct inode *inode, int mask)
 		dump_stack();
 	}
 
-	ret = generic_permission(inode, mask);
+	ret = generic_permission(&init_user_ns, inode, mask);
 
 	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
 out:
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3b397fa9c9e8..c26937824be1 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4346,7 +4346,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
-	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 }
 
 /**
@@ -4400,7 +4400,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
 	 * file.
 	 */
 	if (!preserve) {
-		error = inode_permission(inode, MAY_READ);
+		error = inode_permission(&init_user_ns, inode, MAY_READ);
 		if (error)
 			return error;
 	}
diff --git a/fs/open.c b/fs/open.c
index cd1efd254cad..a6dac6d97988 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -83,7 +83,7 @@ long vfs_truncate(const struct path *path, loff_t length)
 	if (error)
 		goto out;
 
-	error = inode_permission(inode, MAY_WRITE);
+	error = inode_permission(&init_user_ns, inode, MAY_WRITE);
 	if (error)
 		goto mnt_drop_write_and_out;
 
@@ -436,7 +436,7 @@ retry:
 			goto out_path_release;
 	}
 
-	res = inode_permission(inode, mode | MAY_ACCESS);
+	res = inode_permission(&init_user_ns, inode, mode | MAY_ACCESS);
 	/* SuS v2 requires we report a read only fs too */
 	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
 		goto out_path_release;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 48f0547d4850..4c790cc8042d 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -933,7 +933,7 @@ int orangefs_permission(struct inode *inode, int mask)
 	if (ret < 0)
 		return ret;
 
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index bd9dd38347ae..b2948e7b3210 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -50,7 +50,7 @@ static struct file *ovl_open_realfile(const struct file *file,
 		acc_mode |= MAY_APPEND;
 
 	old_cred = ovl_override_creds(inode->i_sb);
-	err = inode_permission(realinode, MAY_OPEN | acc_mode);
+	err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode);
 	if (err) {
 		realfile = ERR_PTR(err);
 	} else {
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index d739e14c6814..c101ebbb7a77 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -294,7 +294,7 @@ int ovl_permission(struct inode *inode, int mask)
 	 * Check overlay inode with the creds of task and underlying inode
 	 * with creds of mounter
 	 */
-	err = generic_permission(inode, mask);
+	err = generic_permission(&init_user_ns, inode, mask);
 	if (err)
 		return err;
 
@@ -305,7 +305,7 @@ int ovl_permission(struct inode *inode, int mask)
 		/* Make sure mounter can read file for copy up later */
 		mask |= MAY_READ;
 	}
-	err = inode_permission(realinode, mask);
+	err = inode_permission(&init_user_ns, realinode, mask);
 	revert_creds(old_cred);
 
 	return err;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 6569031af3cd..de5c2047a0e9 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -479,7 +479,7 @@ struct file *ovl_path_open(struct path *path, int flags)
 		BUG();
 	}
 
-	err = inode_permission(inode, acc_mode | MAY_OPEN);
+	err = inode_permission(&init_user_ns, inode, acc_mode | MAY_OPEN);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 4ca6d53c6f0a..5d9fe2fb2953 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -345,10 +345,13 @@ EXPORT_SYMBOL(posix_acl_from_mode);
  * by the acl. Returns -E... otherwise.
  */
 int
-posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
+posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		     const struct posix_acl *acl, int want)
 {
 	const struct posix_acl_entry *pa, *pe, *mask_obj;
 	int found = 0;
+	kuid_t uid;
+	kgid_t gid;
 
 	want &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
@@ -356,22 +359,26 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
                 switch(pa->e_tag) {
                         case ACL_USER_OBJ:
 				/* (May have been checked already) */
-				if (uid_eq(inode->i_uid, current_fsuid()))
+				uid = i_uid_into_mnt(mnt_userns, inode);
+				if (uid_eq(uid, current_fsuid()))
                                         goto check_perm;
                                 break;
                         case ACL_USER:
-				if (uid_eq(pa->e_uid, current_fsuid()))
+				uid = kuid_into_mnt(mnt_userns, pa->e_uid);
+				if (uid_eq(uid, current_fsuid()))
                                         goto mask;
 				break;
                         case ACL_GROUP_OBJ:
-                                if (in_group_p(inode->i_gid)) {
+				gid = i_gid_into_mnt(mnt_userns, inode);
+				if (in_group_p(gid)) {
 					found = 1;
 					if ((pa->e_perm & want) == want)
 						goto mask;
                                 }
 				break;
                         case ACL_GROUP:
-				if (in_group_p(pa->e_gid)) {
+				gid = kgid_into_mnt(mnt_userns, pa->e_gid);
+				if (in_group_p(gid)) {
 					found = 1;
 					if ((pa->e_perm & want) == want)
 						goto mask;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b3422cda2a91..b4ec9293625e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -751,7 +751,7 @@ static int proc_pid_permission(struct inode *inode, int mask)
 
 		return -EPERM;
 	}
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 
@@ -3492,7 +3492,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
 		return 0;
 	}
 
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 static const struct inode_operations proc_tid_comm_inode_operations = {
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index cb51763ed554..d6e76461e135 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -281,7 +281,7 @@ int proc_fd_permission(struct inode *inode, int mask)
 	struct task_struct *p;
 	int rv;
 
-	rv = generic_permission(inode, mask);
+	rv = generic_permission(&init_user_ns, inode, mask);
 	if (rv == 0)
 		return rv;
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index fe63a7c3e0da..ec440d1957a1 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -957,7 +957,7 @@ int reiserfs_permission(struct inode *inode, int mask)
 	if (IS_PRIVATE(inode))
 		return 0;
 
-	return generic_permission(inode, mask);
+	return generic_permission(&init_user_ns, inode, mask);
 }
 
 static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags)
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 77dba3a49e65..29a4a4dbfe12 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -438,7 +438,7 @@ static bool allow_file_dedupe(struct file *file)
 		return true;
 	if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
 		return true;
-	if (!inode_permission(file_inode(file), MAY_WRITE))
+	if (!inode_permission(&init_user_ns, file_inode(file), MAY_WRITE))
 		return true;
 	return false;
 }
diff --git a/fs/xattr.c b/fs/xattr.c
index fd57153b1f61..56151bd9e642 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -131,7 +131,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 			return -EPERM;
 	}
 
-	return inode_permission(inode, mask);
+	return inode_permission(&init_user_ns, inode, mask);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bcd17097d441..a85dfe6962df 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2810,15 +2810,17 @@ static inline int bmap(struct inode *inode,  sector_t *block)
 #endif
 
 extern int notify_change(struct dentry *, struct iattr *, struct inode **);
-extern int inode_permission(struct inode *, int);
-extern int generic_permission(struct inode *, int);
+int inode_permission(struct user_namespace *, struct inode *, int);
+int generic_permission(struct user_namespace *, struct inode *, int);
 static inline int file_permission(struct file *file, int mask)
 {
-	return inode_permission(file_inode(file), mask);
+	return inode_permission(file_mnt_user_ns(file),
+				file_inode(file), mask);
 }
 static inline int path_permission(const struct path *path, int mask)
 {
-	return inode_permission(d_inode(path->dentry), mask);
+	return inode_permission(mnt_user_ns(path->mnt),
+				d_inode(path->dentry), mask);
 }
 extern int __check_sticky(struct inode *dir, struct inode *inode);
 
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 90797f1b421d..85fb4c0c650a 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -15,6 +15,8 @@
 #include <linux/refcount.h>
 #include <uapi/linux/posix_acl.h>
 
+struct user_namespace;
+
 struct posix_acl_entry {
 	short			e_tag;
 	unsigned short		e_perm;
@@ -61,8 +63,6 @@ posix_acl_release(struct posix_acl *acl)
 
 extern void posix_acl_init(struct posix_acl *, int);
 extern struct posix_acl *posix_acl_alloc(int, gfp_t);
-extern int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
-extern int posix_acl_permission(struct inode *, const struct posix_acl *, int);
 extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
 extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
 extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
@@ -85,6 +85,9 @@ struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
 void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
 void forget_cached_acl(struct inode *inode, int type);
 void forget_all_cached_acls(struct inode *inode);
+int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
+int posix_acl_permission(struct user_namespace *, struct inode *,
+			 const struct posix_acl *, int);
 
 static inline void cache_no_acl(struct inode *inode)
 {
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index beff0cfcd1e8..693f01fe1216 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -873,7 +873,7 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro,
 	if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
 		return -EINVAL;
 	acc = oflag2acc[oflag & O_ACCMODE];
-	return inode_permission(d_inode(dentry), acc);
+	return inode_permission(&init_user_ns, d_inode(dentry), acc);
 }
 
 static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 8962f139521e..e3226b65f5dc 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -558,7 +558,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
 static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
 {
 	struct bpf_prog *prog;
-	int ret = inode_permission(inode, MAY_READ);
+	int ret = inode_permission(&init_user_ns, inode, MAY_READ);
 	if (ret)
 		return ERR_PTR(ret);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 613845769103..091ffb5d2939 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4670,7 +4670,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
 	if (!inode)
 		return -ENOMEM;
 
-	ret = inode_permission(inode, MAY_WRITE);
+	ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
 	iput(inode);
 	return ret;
 }
-- 
cgit v1.2.3


From 21cb47be6fb9ece7e6ee63f6780986faa384a77c Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:25 +0100
Subject: inode: make init and permission helpers idmapped mount aware

The inode_owner_or_capable() helper determines whether the caller is the
owner of the inode or is capable with respect to that inode. Allow it to
handle idmapped mounts. If the inode is accessed through an idmapped
mount it according to the mount's user namespace. Afterwards the checks
are identical to non-idmapped mounts. If the initial user namespace is
passed nothing changes so non-idmapped mounts will see identical
behavior as before.

Similarly, allow the inode_init_owner() helper to handle idmapped
mounts. It initializes a new inode on idmapped mounts by mapping the
fsuid and fsgid of the caller from the mount's user namespace. If the
initial user namespace is passed nothing changes so non-idmapped mounts
will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-7-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/9p/acl.c                  |  2 +-
 fs/9p/vfs_inode.c            |  2 +-
 fs/attr.c                    |  6 +++---
 fs/bfs/dir.c                 |  2 +-
 fs/btrfs/inode.c             |  2 +-
 fs/btrfs/ioctl.c             | 10 +++++-----
 fs/btrfs/tests/btrfs-tests.c |  2 +-
 fs/crypto/policy.c           |  2 +-
 fs/efivarfs/file.c           |  2 +-
 fs/ext2/ialloc.c             |  2 +-
 fs/ext2/ioctl.c              |  6 +++---
 fs/ext4/ialloc.c             |  2 +-
 fs/ext4/ioctl.c              | 15 ++++++++-------
 fs/f2fs/file.c               | 14 +++++++-------
 fs/f2fs/namei.c              |  2 +-
 fs/f2fs/xattr.c              |  2 +-
 fs/fcntl.c                   |  2 +-
 fs/gfs2/file.c               |  2 +-
 fs/hfsplus/inode.c           |  2 +-
 fs/hfsplus/ioctl.c           |  2 +-
 fs/hugetlbfs/inode.c         |  2 +-
 fs/inode.c                   | 36 ++++++++++++++++++++++++++----------
 fs/jfs/ioctl.c               |  2 +-
 fs/jfs/jfs_inode.c           |  2 +-
 fs/minix/bitmap.c            |  2 +-
 fs/namei.c                   |  5 +++--
 fs/nilfs2/inode.c            |  2 +-
 fs/nilfs2/ioctl.c            |  2 +-
 fs/ocfs2/dlmfs/dlmfs.c       |  4 ++--
 fs/ocfs2/ioctl.c             |  2 +-
 fs/ocfs2/namei.c             |  2 +-
 fs/omfs/inode.c              |  2 +-
 fs/overlayfs/dir.c           |  2 +-
 fs/overlayfs/file.c          |  4 ++--
 fs/overlayfs/super.c         |  2 +-
 fs/overlayfs/util.c          |  2 +-
 fs/posix_acl.c               |  2 +-
 fs/ramfs/inode.c             |  2 +-
 fs/reiserfs/ioctl.c          |  4 ++--
 fs/reiserfs/namei.c          |  2 +-
 fs/sysv/ialloc.c             |  2 +-
 fs/ubifs/dir.c               |  2 +-
 fs/ubifs/ioctl.c             |  2 +-
 fs/udf/ialloc.c              |  2 +-
 fs/ufs/ialloc.c              |  2 +-
 fs/xattr.c                   |  3 ++-
 fs/xfs/xfs_ioctl.c           |  2 +-
 fs/zonefs/super.c            |  2 +-
 include/linux/fs.h           |  8 ++++----
 kernel/bpf/inode.c           |  2 +-
 mm/madvise.c                 |  3 ++-
 mm/mincore.c                 |  3 ++-
 mm/shmem.c                   |  2 +-
 security/selinux/hooks.c     |  4 ++--
 54 files changed, 112 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 6261719f6f2a..d77b28e8d57a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -258,7 +258,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 	if (value) {
 		/* update the cached acl value */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4a937fac1acb..f66eb3c12c8a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -251,7 +251,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 {
 	int err = 0;
 
-	inode_init_owner(inode, NULL, mode);
+	inode_init_owner(&init_user_ns,inode,  NULL, mode);
 	inode->i_blocks = 0;
 	inode->i_rdev = rdev;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
diff --git a/fs/attr.c b/fs/attr.c
index c9e29e589cec..00ae0b000146 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -87,7 +87,7 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 		/* Also check the setgid bit! */
 		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
@@ -98,7 +98,7 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 
 	/* Check for setting the inode time. */
 	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 	}
 
@@ -243,7 +243,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 		if (IS_IMMUTABLE(inode))
 			return -EPERM;
 
-		if (!inode_owner_or_capable(inode)) {
+		if (!inode_owner_or_capable(&init_user_ns, inode)) {
 			error = inode_permission(&init_user_ns, inode,
 						 MAY_WRITE);
 			if (error)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d8dfe3a0cb39..be1335a8d25b 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -96,7 +96,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	}
 	set_bit(ino, info->si_imap);
 	info->si_freei--;
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_blocks = 0;
 	inode->i_op = &bfs_file_inops;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 512ee2650bbb..07fe8b2f3bab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6190,7 +6190,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail_unlock;
 
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode_set_bytes(inode, 0);
 
 	inode->i_mtime = current_time(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8ced6dfefee4..1f763c60415b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -213,7 +213,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	const char *comp = NULL;
 	u32 binode_flags;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	if (btrfs_root_readonly(root))
@@ -429,7 +429,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
 	unsigned old_i_flags;
 	int ret = 0;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	if (btrfs_root_readonly(root))
@@ -1862,7 +1862,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
 			btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
 				   "Snapshot src from another FS");
 			ret = -EXDEV;
-		} else if (!inode_owner_or_capable(src_inode)) {
+		} else if (!inode_owner_or_capable(&init_user_ns, src_inode)) {
 			/*
 			 * Subvolume creation is not restricted, but snapshots
 			 * are limited to own subvolumes only
@@ -1982,7 +1982,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 	u64 flags;
 	int ret = 0;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	ret = mnt_want_write_file(file);
@@ -4453,7 +4453,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
 	int ret = 0;
 	int received_uuid_changed;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	ret = mnt_want_write_file(file);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 6bd97bd4cb37..3a4099a2bf05 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -62,7 +62,7 @@ struct inode *btrfs_new_test_inode(void)
 	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
 	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	BTRFS_I(inode)->location.offset = 0;
-	inode_init_owner(inode, NULL, S_IFREG);
+	inode_init_owner(&init_user_ns, inode, NULL, S_IFREG);
 
 	return inode;
 }
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index a51cef6bd27f..ed3d623724cd 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -465,7 +465,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
 		return -EFAULT;
 	policy.version = version;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	ret = mnt_want_write_file(filp);
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index feaa5e182b7b..e6bc0302643b 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -137,7 +137,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg)
 	unsigned int oldflags = efivarfs_getflags(inode);
 	int error;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (copy_from_user(&flags, arg, sizeof(flags)))
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 432c3febea6d..df14e750e9fe 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -551,7 +551,7 @@ got:
 		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
 	} else
-		inode_init_owner(inode, dir, mode);
+		inode_init_owner(&init_user_ns, inode, dir, mode);
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 32a8d10b579d..b399cbb7022d 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -39,7 +39,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (ret)
 			return ret;
 
-		if (!inode_owner_or_capable(inode)) {
+		if (!inode_owner_or_capable(&init_user_ns, inode)) {
 			ret = -EACCES;
 			goto setflags_out;
 		}
@@ -84,7 +84,7 @@ setflags_out:
 	case EXT2_IOC_SETVERSION: {
 		__u32 generation;
 
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 		ret = mnt_want_write_file(filp);
 		if (ret)
@@ -117,7 +117,7 @@ setversion_out:
 		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
 			return -ENOTTY;
 
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		if (get_user(rsv_window_size, (int __user *)arg))
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b215c564bc31..00c1ec6eee16 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -972,7 +972,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
 	} else
-		inode_init_owner(inode, dir, mode);
+		inode_init_owner(&init_user_ns, inode, dir, mode);
 
 	if (ext4_has_feature_project(sb) &&
 	    ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d9665d2f82db..ab80e2493fdc 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -139,7 +139,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	}
 
 	if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
-	    !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+	    !inode_owner_or_capable(&init_user_ns, inode) ||
+	    !capable(CAP_SYS_ADMIN)) {
 		err = -EPERM;
 		goto journal_err_out;
 	}
@@ -829,7 +830,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case FS_IOC_SETFLAGS: {
 		int err;
 
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
@@ -871,7 +872,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		__u32 generation;
 		int err;
 
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 
 		if (ext4_has_metadata_csum(inode->i_sb)) {
@@ -1010,7 +1011,7 @@ mext_out:
 	case EXT4_IOC_MIGRATE:
 	{
 		int err;
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		err = mnt_want_write_file(filp);
@@ -1032,7 +1033,7 @@ mext_out:
 	case EXT4_IOC_ALLOC_DA_BLKS:
 	{
 		int err;
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		err = mnt_want_write_file(filp);
@@ -1217,7 +1218,7 @@ resizefs_out:
 
 	case EXT4_IOC_CLEAR_ES_CACHE:
 	{
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 		ext4_clear_inode_es(inode);
 		return 0;
@@ -1263,7 +1264,7 @@ resizefs_out:
 			return -EFAULT;
 
 		/* Make sure caller has proper permission */
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f585545277d7..5fc0ff28b5dd 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1961,7 +1961,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 	u32 iflags;
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (get_user(fsflags, (int __user *)arg))
@@ -2008,7 +2008,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (!S_ISREG(inode->i_mode))
@@ -2075,7 +2075,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	ret = mnt_want_write_file(filp);
@@ -2117,7 +2117,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (!S_ISREG(inode->i_mode))
@@ -2152,7 +2152,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	ret = mnt_want_write_file(filp);
@@ -2181,7 +2181,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	ret = mnt_want_write_file(filp);
@@ -3158,7 +3158,7 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg)
 		return -EFAULT;
 
 	/* Make sure caller has proper permission */
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (fa.fsx_xflags & ~F2FS_SUPPORTED_XFLAGS)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 6edb1ab579a1..ad98926bacac 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 
 	nid_free = true;
 
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 65afcc3cc68a..d772bf13a814 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -114,7 +114,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
 	unsigned char old_advise = F2FS_I(inode)->i_advise;
 	unsigned char new_advise;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 	if (value == NULL)
 		return -EINVAL;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 05b36b28f2e8..74d99731fd43 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -46,7 +46,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 
 	/* O_NOATIME can only be set by the owner or superuser */
 	if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 
 	/* required for strict SunOS emulation */
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b39b339feddc..1d994bdfffaa 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -238,7 +238,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask,
 		goto out;
 
 	error = -EACCES;
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		goto out;
 
 	error = 0;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index e3da9e96b835..21357046b039 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -376,7 +376,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
 		return NULL;
 
 	inode->i_ino = sbi->next_cnid++;
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	set_nlink(inode, 1);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ce15b9496b77..3edb1926d127 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -91,7 +91,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 	if (err)
 		goto out;
 
-	if (!inode_owner_or_capable(inode)) {
+	if (!inode_owner_or_capable(&init_user_ns, inode)) {
 		err = -EACCES;
 		goto out_drop_write;
 	}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b5c109703daa..6737929e443c 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -836,7 +836,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 
 		inode->i_ino = get_next_ino();
-		inode_init_owner(inode, dir, mode);
+		inode_init_owner(&init_user_ns, inode, dir, mode);
 		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
 				&hugetlbfs_i_mmap_rwsem_key);
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
diff --git a/fs/inode.c b/fs/inode.c
index cd40cbf87ce4..a9ac97a27784 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2130,14 +2130,21 @@ EXPORT_SYMBOL(init_special_inode);
 
 /**
  * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
+ * @mnt_userns:	User namespace of the mount the inode was created from
  * @inode: New inode
  * @dir: Directory inode
  * @mode: mode of the new inode
+ *
+ * If the inode has been created through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions
+ * and initializing i_uid and i_gid. On non-idmapped mounts or if permission
+ * checking is to be performed on the raw inode simply passs init_user_ns.
  */
-void inode_init_owner(struct inode *inode, const struct inode *dir,
-			umode_t mode)
+void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
+		      const struct inode *dir, umode_t mode)
 {
-	inode->i_uid = current_fsuid();
+	inode->i_uid = fsuid_into_mnt(mnt_userns);
 	if (dir && dir->i_mode & S_ISGID) {
 		inode->i_gid = dir->i_gid;
 
@@ -2145,32 +2152,41 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
 		if (S_ISDIR(mode))
 			mode |= S_ISGID;
 		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
-			 !in_group_p(inode->i_gid) &&
-			 !capable_wrt_inode_uidgid(&init_user_ns, dir,
-						   CAP_FSETID))
+			 !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
+			 !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
 			mode &= ~S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
+		inode->i_gid = fsgid_into_mnt(mnt_userns);
 	inode->i_mode = mode;
 }
 EXPORT_SYMBOL(inode_init_owner);
 
 /**
  * inode_owner_or_capable - check current task permissions to inode
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: inode being checked
  *
  * Return true if current either has CAP_FOWNER in a namespace with the
  * inode owner uid mapped, or owns the file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
  */
-bool inode_owner_or_capable(const struct inode *inode)
+bool inode_owner_or_capable(struct user_namespace *mnt_userns,
+			    const struct inode *inode)
 {
+	kuid_t i_uid;
 	struct user_namespace *ns;
 
-	if (uid_eq(current_fsuid(), inode->i_uid))
+	i_uid = i_uid_into_mnt(mnt_userns, inode);
+	if (uid_eq(current_fsuid(), i_uid))
 		return true;
 
 	ns = current_user_ns();
-	if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
+	if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
 		return true;
 	return false;
 }
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 10ee0ecca1a8..2581d4db58ff 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -76,7 +76,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (err)
 			return err;
 
-		if (!inode_owner_or_capable(inode)) {
+		if (!inode_owner_or_capable(&init_user_ns, inode)) {
 			err = -EACCES;
 			goto setflags_out;
 		}
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 4cef170630db..59379089e939 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -64,7 +64,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		goto fail_put;
 	}
 
-	inode_init_owner(inode, parent, mode);
+	inode_init_owner(&init_user_ns, inode, parent, mode);
 	/*
 	 * New inodes need to save sane values on disk when
 	 * uid & gid mount options are used
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index f4e5e5181a14..9115948c624e 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -252,7 +252,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error)
 		iput(inode);
 		return NULL;
 	}
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_ino = j;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_blocks = 0;
diff --git a/fs/namei.c b/fs/namei.c
index d78d74f5f5af..04b001ddade3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1088,7 +1088,8 @@ int may_linkat(struct path *link)
 	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 	 * otherwise, it must be a safe source.
 	 */
-	if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
+	if (safe_hardlink_source(inode) ||
+	    inode_owner_or_capable(&init_user_ns, inode))
 		return 0;
 
 	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
@@ -2940,7 +2941,7 @@ static int may_open(const struct path *path, int acc_mode, int flag)
 	}
 
 	/* O_NOATIME can only be set by the owner or superuser */
-	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
+	if (flag & O_NOATIME && !inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	return 0;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index b6517220cad5..11225a659736 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -348,7 +348,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
 	atomic64_inc(&root->inodes_count);
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 07d26f61f22a..b053b40315bf 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -132,7 +132,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
 	unsigned int flags, oldflags;
 	int ret;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	if (get_user(flags, (int __user *)argp))
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 583820ec63e2..37c7d03a6284 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -329,7 +329,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode_init_owner(inode, NULL, mode);
+		inode_init_owner(&init_user_ns, inode, NULL, mode);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 		inc_nlink(inode);
 
@@ -352,7 +352,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 		return NULL;
 
 	inode->i_ino = get_next_ino();
-	inode_init_owner(inode, parent, mode);
+	inode_init_owner(&init_user_ns, inode, parent, mode);
 	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 
 	ip = DLMFS_I(inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 89984172fc4a..50c9b30ee9f6 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -96,7 +96,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 	}
 
 	status = -EACCES;
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		goto bail_unlock;
 
 	if (!S_ISDIR(inode->i_mode))
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2a237ab00453..908b79e1082b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -198,7 +198,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
 	 * callers. */
 	if (S_ISDIR(mode))
 		set_nlink(inode, 2);
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	status = dquot_initialize(inode);
 	if (status)
 		return ERR_PTR(status);
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index ce93ccca8639..2a0e83236c01 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -48,7 +48,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
 		goto fail;
 
 	inode->i_ino = new_block;
-	inode_init_owner(inode, NULL, mode);
+	inode_init_owner(&init_user_ns, inode, NULL, mode);
 	inode->i_mapping->a_ops = &omfs_aops;
 
 	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 28a075b5f5b2..98a23353b19a 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -636,7 +636,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
 	inode->i_state |= I_CREATING;
 	spin_unlock(&inode->i_lock);
 
-	inode_init_owner(inode, dentry->d_parent->d_inode, mode);
+	inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode);
 	attr.mode = inode->i_mode;
 
 	err = ovl_create_or_link(dentry, inode, &attr, false);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index b2948e7b3210..7d8b84c715b3 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -54,7 +54,7 @@ static struct file *ovl_open_realfile(const struct file *file,
 	if (err) {
 		realfile = ERR_PTR(err);
 	} else {
-		if (!inode_owner_or_capable(realinode))
+		if (!inode_owner_or_capable(&init_user_ns, realinode))
 			flags &= ~O_NOATIME;
 
 		realfile = open_with_fake_path(&file->f_path, flags, realinode,
@@ -520,7 +520,7 @@ static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd,
 	long ret;
 	struct inode *inode = file_inode(file);
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EACCES;
 
 	ret = mnt_want_write_file(file);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 88d877787770..3e925deaa19a 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1005,7 +1005,7 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
 		goto out_acl_release;
 	}
 	err = -EPERM;
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		goto out_acl_release;
 
 	posix_acl_release(acl);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index de5c2047a0e9..06013b7b1e87 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -484,7 +484,7 @@ struct file *ovl_path_open(struct path *path, int flags)
 		return ERR_PTR(err);
 
 	/* O_NOATIME is an optimization, don't fail if not permitted */
-	if (inode_owner_or_capable(inode))
+	if (inode_owner_or_capable(&init_user_ns, inode))
 		flags |= O_NOATIME;
 
 	return dentry_open(path, flags, current_cred());
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 5d9fe2fb2953..9ce8214bfdac 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -874,7 +874,7 @@ set_posix_acl(struct inode *inode, int type, struct posix_acl *acl)
 
 	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
 		return acl ? -EACCES : 0;
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	if (acl) {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index ee179a81b3da..3fd4326f36b5 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -67,7 +67,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode_init_owner(inode, dir, mode);
+		inode_init_owner(&init_user_ns, inode, dir, mode);
 		inode->i_mapping->a_ops = &ramfs_aops;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 		mapping_set_unevictable(inode->i_mapping);
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index adb21bea3d60..4f1cbd930179 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -59,7 +59,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			if (err)
 				break;
 
-			if (!inode_owner_or_capable(inode)) {
+			if (!inode_owner_or_capable(&init_user_ns, inode)) {
 				err = -EPERM;
 				goto setflags_out;
 			}
@@ -101,7 +101,7 @@ setflags_out:
 		err = put_user(inode->i_generation, (int __user *)arg);
 		break;
 	case REISERFS_IOC_SETVERSION:
-		if (!inode_owner_or_capable(inode)) {
+		if (!inode_owner_or_capable(&init_user_ns, inode)) {
 			err = -EPERM;
 			break;
 		}
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 1594687582f0..a67a7d371725 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -615,7 +615,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
 	 * the quota init calls have to know who to charge the quota to, so
 	 * we have to set uid and gid here
 	 */
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	return dquot_initialize(inode);
 }
 
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 6c9801986af6..50df794a3c1f 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -163,7 +163,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
 	*sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
 	fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
 	dirty_sb(sb);
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_ino = fs16_to_cpu(sbi, ino);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_blocks = 0;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 9a6b8660425a..694e7714545b 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -94,7 +94,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
 	 */
 	inode->i_flags |= S_NOCMTIME;
 
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 			 current_time(inode);
 	inode->i_mapping->nrpages = 0;
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 4363d85a3fd4..2326d5122beb 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -155,7 +155,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if (!inode_owner_or_capable(inode))
+		if (!inode_owner_or_capable(&init_user_ns, inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 84ed23edebfd..2ecf0e87660e 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -103,7 +103,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
 		mutex_unlock(&sbi->s_alloc_mutex);
 	}
 
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET))
 		inode->i_uid = sbi->s_uid;
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET))
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 969fd60436d3..7e3e08c0166f 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -289,7 +289,7 @@ cg_found:
 	ufs_mark_sb_dirty(sb);
 
 	inode->i_ino = cg * uspi->s_ipg + bit;
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 	inode->i_blocks = 0;
 	inode->i_generation = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
diff --git a/fs/xattr.c b/fs/xattr.c
index 56151bd9e642..c669922e1bde 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -127,7 +127,8 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
 			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
 		if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
-		    (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
+		    (mask & MAY_WRITE) &&
+		    !inode_owner_or_capable(&init_user_ns, inode))
 			return -EPERM;
 	}
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 97bd29fc8c43..218e80afc859 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1300,7 +1300,7 @@ xfs_ioctl_setattr_get_trans(
 	 * The user ID of the calling process must be equal to the file owner
 	 * ID, except in cases where the CAP_FSETID capability is applicable.
 	 */
-	if (!inode_owner_or_capable(VFS_I(ip))) {
+	if (!inode_owner_or_capable(&init_user_ns, VFS_I(ip))) {
 		error = -EPERM;
 		goto out_cancel;
 	}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index bec47f2d074b..569525ee8f69 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1223,7 +1223,7 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
 	struct super_block *sb = parent->i_sb;
 
 	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1;
-	inode_init_owner(inode, parent, S_IFDIR | 0555);
+	inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
 	inode->i_op = &zonefs_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	set_nlink(inode, 2);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a85dfe6962df..2a9d4af6a64d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1762,8 +1762,8 @@ static inline bool sb_start_intwrite_trylock(struct super_block *sb)
 	return __sb_start_write_trylock(sb, SB_FREEZE_FS);
 }
 
-
-extern bool inode_owner_or_capable(const struct inode *inode);
+bool inode_owner_or_capable(struct user_namespace *mnt_userns,
+			    const struct inode *inode);
 
 /*
  * VFS helper functions..
@@ -1805,8 +1805,8 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
 /*
  * VFS file helper functions.
  */
-extern void inode_init_owner(struct inode *inode, const struct inode *dir,
-			umode_t mode);
+void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
+		      const struct inode *dir, umode_t mode);
 extern bool may_open_dev(const struct path *path);
 
 /*
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e3226b65f5dc..05b1f51d15e0 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
 	inode->i_mtime = inode->i_atime;
 	inode->i_ctime = inode->i_atime;
 
-	inode_init_owner(inode, dir, mode);
+	inode_init_owner(&init_user_ns, inode, dir, mode);
 
 	return inode;
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 175c5582d8a9..d4f5eece9d56 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -539,7 +539,8 @@ static inline bool can_do_pageout(struct vm_area_struct *vma)
 	 * otherwise we'd be including shared non-exclusive mappings, which
 	 * opens a side channel.
 	 */
-	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+	return inode_owner_or_capable(&init_user_ns,
+				      file_inode(vma->vm_file)) ||
 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
diff --git a/mm/mincore.c b/mm/mincore.c
index 7bdb4673f776..9122676b54d6 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -166,7 +166,8 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
 	 * for writing; otherwise we'd be including shared non-exclusive
 	 * mappings, which opens a side channel.
 	 */
-	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+	return inode_owner_or_capable(&init_user_ns,
+				      file_inode(vma->vm_file)) ||
 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c6b6d8f6c39..1c68c9edba5e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2303,7 +2303,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 	inode = new_inode(sb);
 	if (inode) {
 		inode->i_ino = ino;
-		inode_init_owner(inode, dir, mode);
+		inode_init_owner(&init_user_ns, inode, dir, mode);
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 		inode->i_generation = prandom_u32();
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 644b17ec9e63..9d6d3da2caf2 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3140,13 +3140,13 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
 	}
 
 	if (!selinux_initialized(&selinux_state))
-		return (inode_owner_or_capable(inode) ? 0 : -EPERM);
+		return (inode_owner_or_capable(&init_user_ns, inode) ? 0 : -EPERM);
 
 	sbsec = inode->i_sb->s_security;
 	if (!(sbsec->flags & SBLABEL_MNT))
 		return -EOPNOTSUPP;
 
-	if (!inode_owner_or_capable(inode))
+	if (!inode_owner_or_capable(&init_user_ns, inode))
 		return -EPERM;
 
 	ad.type = LSM_AUDIT_DATA_DENTRY;
-- 
cgit v1.2.3


From 2f221d6f7b881d95de1f356a3097d755ab1e47d4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:26 +0100
Subject: attr: handle idmapped mounts

When file attributes are changed most filesystems rely on the
setattr_prepare(), setattr_copy(), and notify_change() helpers for
initialization and permission checking. Let them handle idmapped mounts.
If the inode is accessed through an idmapped mount map it into the
mount's user namespace. Afterwards the checks are identical to
non-idmapped mounts. If the initial user namespace is passed nothing
changes so non-idmapped mounts will see identical behavior as before.

Helpers that perform checks on the ia_uid and ia_gid fields in struct
iattr assume that ia_uid and ia_gid are intended values and have already
been mapped correctly at the userspace-kernelspace boundary as we
already do today. If the initial user namespace is passed nothing
changes so non-idmapped mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-8-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 arch/powerpc/platforms/cell/spufs/inode.c |   2 +-
 drivers/base/devtmpfs.c                   |   4 +-
 fs/9p/vfs_inode.c                         |   4 +-
 fs/9p/vfs_inode_dotl.c                    |   4 +-
 fs/adfs/inode.c                           |   2 +-
 fs/affs/inode.c                           |   4 +-
 fs/attr.c                                 | 119 ++++++++++++++++++++++--------
 fs/btrfs/inode.c                          |   4 +-
 fs/cachefiles/interface.c                 |   4 +-
 fs/ceph/inode.c                           |   2 +-
 fs/cifs/inode.c                           |   8 +-
 fs/ecryptfs/inode.c                       |   7 +-
 fs/exfat/file.c                           |   4 +-
 fs/ext2/inode.c                           |   4 +-
 fs/ext4/inode.c                           |   4 +-
 fs/f2fs/file.c                            |  10 ++-
 fs/fat/file.c                             |   4 +-
 fs/fuse/dir.c                             |   2 +-
 fs/gfs2/inode.c                           |   4 +-
 fs/hfs/inode.c                            |   4 +-
 fs/hfsplus/inode.c                        |   4 +-
 fs/hostfs/hostfs_kern.c                   |   4 +-
 fs/hpfs/inode.c                           |   4 +-
 fs/hugetlbfs/inode.c                      |   4 +-
 fs/inode.c                                |   2 +-
 fs/jffs2/fs.c                             |   2 +-
 fs/jfs/file.c                             |   4 +-
 fs/kernfs/inode.c                         |   4 +-
 fs/libfs.c                                |   4 +-
 fs/minix/file.c                           |   4 +-
 fs/nfsd/nfsproc.c                         |   2 +-
 fs/nfsd/vfs.c                             |   4 +-
 fs/nilfs2/inode.c                         |   4 +-
 fs/ntfs/inode.c                           |   2 +-
 fs/ocfs2/dlmfs/dlmfs.c                    |   4 +-
 fs/ocfs2/file.c                           |   4 +-
 fs/omfs/file.c                            |   4 +-
 fs/open.c                                 |   8 +-
 fs/orangefs/inode.c                       |   4 +-
 fs/overlayfs/copy_up.c                    |   8 +-
 fs/overlayfs/dir.c                        |   2 +-
 fs/overlayfs/inode.c                      |   4 +-
 fs/overlayfs/super.c                      |   2 +-
 fs/proc/base.c                            |   4 +-
 fs/proc/generic.c                         |   4 +-
 fs/proc/proc_sysctl.c                     |   4 +-
 fs/ramfs/file-nommu.c                     |   4 +-
 fs/reiserfs/inode.c                       |   4 +-
 fs/sysv/file.c                            |   4 +-
 fs/ubifs/file.c                           |   2 +-
 fs/udf/file.c                             |   4 +-
 fs/ufs/inode.c                            |   4 +-
 fs/utimes.c                               |   3 +-
 fs/xfs/xfs_iops.c                         |   2 +-
 fs/zonefs/super.c                         |   4 +-
 include/linux/fs.h                        |   8 +-
 mm/shmem.c                                |   4 +-
 57 files changed, 206 insertions(+), 137 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 25390569e24c..3de526eb2275 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -98,7 +98,7 @@ spufs_setattr(struct dentry *dentry, struct iattr *attr)
 	if ((attr->ia_valid & ATTR_SIZE) &&
 	    (attr->ia_size != inode->i_size))
 		return -EINVAL;
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index eac184e6d657..2e0c3cdb4184 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -221,7 +221,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
 		newattrs.ia_gid = gid;
 		newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID;
 		inode_lock(d_inode(dentry));
-		notify_change(dentry, &newattrs, NULL);
+		notify_change(&init_user_ns, dentry, &newattrs, NULL);
 		inode_unlock(d_inode(dentry));
 
 		/* mark as kernel-created inode */
@@ -328,7 +328,7 @@ static int handle_remove(const char *nodename, struct device *dev)
 			newattrs.ia_valid =
 				ATTR_UID|ATTR_GID|ATTR_MODE;
 			inode_lock(d_inode(dentry));
-			notify_change(dentry, &newattrs, NULL);
+			notify_change(&init_user_ns, dentry, &newattrs, NULL);
 			inode_unlock(d_inode(dentry));
 			err = vfs_unlink(d_inode(parent.dentry), dentry, NULL);
 			if (!err || err == -ENOENT)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f66eb3c12c8a..9c3ff6e9ab82 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1062,7 +1062,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct p9_wstat wstat;
 
 	p9_debug(P9_DEBUG_VFS, "\n");
-	retval = setattr_prepare(dentry, iattr);
+	retval = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (retval)
 		return retval;
 
@@ -1118,7 +1118,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 
 	v9fs_invalidate_inode_attr(d_inode(dentry));
 
-	setattr_copy(d_inode(dentry), iattr);
+	setattr_copy(&init_user_ns, d_inode(dentry), iattr);
 	mark_inode_dirty(d_inode(dentry));
 	return 0;
 }
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 823c2eb5f1bf..302553101fcb 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -549,7 +549,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 
 	p9_debug(P9_DEBUG_VFS, "\n");
 
-	retval = setattr_prepare(dentry, iattr);
+	retval = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (retval)
 		return retval;
 
@@ -590,7 +590,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 		truncate_setsize(inode, iattr->ia_size);
 
 	v9fs_invalidate_inode_attr(inode);
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	mark_inode_dirty(inode);
 	if (iattr->ia_valid & ATTR_MODE) {
 		/* We also want to update ACL when we update mode bits */
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 32620f4a7623..278dcee6ae22 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -299,7 +299,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 	unsigned int ia_valid = attr->ia_valid;
 	int error;
 	
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 
 	/*
 	 * we can't change the UID or GID of any file -
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 044412110b52..767e5bdfb703 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -223,7 +223,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
 
 	pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid);
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		goto out;
 
@@ -249,7 +249,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
 		affs_truncate(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 
 	if (attr->ia_valid & ATTR_MODE)
diff --git a/fs/attr.c b/fs/attr.c
index 00ae0b000146..f4543d2abdfb 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,27 +18,55 @@
 #include <linux/evm.h>
 #include <linux/ima.h>
 
-static bool chown_ok(const struct inode *inode, kuid_t uid)
+/**
+ * chown_ok - verify permissions to chown inode
+ * @mnt_userns:	user namespace of the mount @inode was found from
+ * @inode:	inode to check permissions on
+ * @uid:	uid to chown @inode to
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ */
+static bool chown_ok(struct user_namespace *mnt_userns,
+		     const struct inode *inode,
+		     kuid_t uid)
 {
-	if (uid_eq(current_fsuid(), inode->i_uid) &&
-	    uid_eq(uid, inode->i_uid))
+	kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);
+	if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, kuid))
 		return true;
-	if (capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_CHOWN))
+	if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
 		return true;
-	if (uid_eq(inode->i_uid, INVALID_UID) &&
+	if (uid_eq(kuid, INVALID_UID) &&
 	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
 		return true;
 	return false;
 }
 
-static bool chgrp_ok(const struct inode *inode, kgid_t gid)
+/**
+ * chgrp_ok - verify permissions to chgrp inode
+ * @mnt_userns:	user namespace of the mount @inode was found from
+ * @inode:	inode to check permissions on
+ * @gid:	gid to chown @inode to
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ */
+static bool chgrp_ok(struct user_namespace *mnt_userns,
+		     const struct inode *inode, kgid_t gid)
 {
-	if (uid_eq(current_fsuid(), inode->i_uid) &&
-	    (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
+	kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
+	if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) &&
+	    (in_group_p(gid) || gid_eq(gid, kgid)))
 		return true;
-	if (capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_CHOWN))
+	if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
 		return true;
-	if (gid_eq(inode->i_gid, INVALID_GID) &&
+	if (gid_eq(kgid, INVALID_GID) &&
 	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
 		return true;
 	return false;
@@ -46,6 +74,7 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid)
 
 /**
  * setattr_prepare - check if attribute changes to a dentry are allowed
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dentry:	dentry to check
  * @attr:	attributes to change
  *
@@ -55,10 +84,17 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid)
  * SGID bit from mode if user is not allowed to set it. Also file capabilities
  * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * Should be called as the first thing in ->setattr implementations,
  * possibly after taking additional locks.
  */
-int setattr_prepare(struct dentry *dentry, struct iattr *attr)
+int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
+		    struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	unsigned int ia_valid = attr->ia_valid;
@@ -78,27 +114,27 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
 		goto kill_priv;
 
 	/* Make sure a caller can chown. */
-	if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid))
+	if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid))
 		return -EPERM;
 
 	/* Make sure caller can chgrp. */
-	if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid))
+	if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid))
 		return -EPERM;
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
-		if (!inode_owner_or_capable(&init_user_ns, inode))
+		if (!inode_owner_or_capable(mnt_userns, inode))
 			return -EPERM;
 		/* Also check the setgid bit! */
-		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
-				inode->i_gid) &&
-		    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+               if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
+                                i_gid_into_mnt(mnt_userns, inode)) &&
+                    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
 			attr->ia_mode &= ~S_ISGID;
 	}
 
 	/* Check for setting the inode time. */
 	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
-		if (!inode_owner_or_capable(&init_user_ns, inode))
+		if (!inode_owner_or_capable(mnt_userns, inode))
 			return -EPERM;
 	}
 
@@ -162,20 +198,33 @@ EXPORT_SYMBOL(inode_newsize_ok);
 
 /**
  * setattr_copy - copy simple metadata updates into the generic inode
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode:	the inode to be updated
  * @attr:	the new attributes
  *
  * setattr_copy must be called with i_mutex held.
  *
  * setattr_copy updates the inode's metadata with that specified
- * in attr. Noticeably missing is inode size update, which is more complex
+ * in attr on idmapped mounts. If file ownership is changed setattr_copy
+ * doesn't map ia_uid and ia_gid. It will asssume the caller has already
+ * provided the intended values. Necessary permission checks to determine
+ * whether or not the S_ISGID property needs to be removed are performed with
+ * the correct idmapped mount permission helpers.
+ * Noticeably missing is inode size update, which is more complex
  * as it requires pagecache updates.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * The inode is not marked as dirty after this operation. The rationale is
  * that for "simple" filesystems, the struct inode is the inode storage.
  * The caller is free to mark the inode dirty afterwards if needed.
  */
-void setattr_copy(struct inode *inode, const struct iattr *attr)
+void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
+		  const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -191,9 +240,9 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
 		inode->i_ctime = attr->ia_ctime;
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
-
-		if (!in_group_p(inode->i_gid) &&
-		    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+		kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
+		if (!in_group_p(kgid) &&
+		    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
 			mode &= ~S_ISGID;
 		inode->i_mode = mode;
 	}
@@ -202,6 +251,7 @@ EXPORT_SYMBOL(setattr_copy);
 
 /**
  * notify_change - modify attributes of a filesytem object
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dentry:	object affected
  * @attr:	new attributes
  * @delegated_inode: returns inode, if the inode is delegated
@@ -214,13 +264,23 @@ EXPORT_SYMBOL(setattr_copy);
  * retry.  Because breaking a delegation may take a long time, the
  * caller should drop the i_mutex before doing so.
  *
+ * If file ownership is changed notify_change() doesn't map ia_uid and
+ * ia_gid. It will asssume the caller has already provided the intended values.
+ *
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.  Also, passing NULL is fine for callers holding
  * the file open for write, as there can be no conflicting delegation in
  * that case.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
  */
-int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr, struct inode **delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	umode_t mode = inode->i_mode;
@@ -243,9 +303,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 		if (IS_IMMUTABLE(inode))
 			return -EPERM;
 
-		if (!inode_owner_or_capable(&init_user_ns, inode)) {
-			error = inode_permission(&init_user_ns, inode,
-						 MAY_WRITE);
+		if (!inode_owner_or_capable(mnt_userns, inode)) {
+			error = inode_permission(mnt_userns, inode, MAY_WRITE);
 			if (error)
 				return error;
 		}
@@ -321,9 +380,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 	/* Don't allow modifications of files with invalid uids or
 	 * gids unless those uids & gids are being made valid.
 	 */
-	if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid))
+	if (!(ia_valid & ATTR_UID) &&
+	    !uid_valid(i_uid_into_mnt(mnt_userns, inode)))
 		return -EOVERFLOW;
-	if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid))
+	if (!(ia_valid & ATTR_GID) &&
+	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
 		return -EOVERFLOW;
 
 	error = security_inode_setattr(dentry, attr);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 07fe8b2f3bab..792191a8705b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5054,7 +5054,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (btrfs_root_readonly(root))
 		return -EROFS;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
@@ -5065,7 +5065,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (attr->ia_valid) {
-		setattr_copy(inode, attr);
+		setattr_copy(&init_user_ns, inode, attr);
 		inode_inc_iversion(inode);
 		err = btrfs_dirty_inode(inode);
 
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 4cea5fbf695e..5efa6a3702c0 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -470,14 +470,14 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
 		_debug("discard tail %llx", oi_size);
 		newattrs.ia_valid = ATTR_SIZE;
 		newattrs.ia_size = oi_size & PAGE_MASK;
-		ret = notify_change(object->backer, &newattrs, NULL);
+		ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL);
 		if (ret < 0)
 			goto truncate_failed;
 	}
 
 	newattrs.ia_valid = ATTR_SIZE;
 	newattrs.ia_size = ni_size;
-	ret = notify_change(object->backer, &newattrs, NULL);
+	ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL);
 
 truncate_failed:
 	inode_unlock(d_inode(object->backer));
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e8a15ee09bc1..285d3baca27e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2247,7 +2247,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err != 0)
 		return err;
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a83b3a8ffaac..27554f71f744 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2610,7 +2610,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
 		attrs->ia_valid |= ATTR_FORCE;
 
-	rc = setattr_prepare(direntry, attrs);
+	rc = setattr_prepare(&init_user_ns, direntry, attrs);
 	if (rc < 0)
 		goto out;
 
@@ -2715,7 +2715,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	    attrs->ia_size != i_size_read(inode))
 		truncate_setsize(inode, attrs->ia_size);
 
-	setattr_copy(inode, attrs);
+	setattr_copy(&init_user_ns, inode, attrs);
 	mark_inode_dirty(inode);
 
 	/* force revalidate when any of these times are set since some
@@ -2757,7 +2757,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
 		attrs->ia_valid |= ATTR_FORCE;
 
-	rc = setattr_prepare(direntry, attrs);
+	rc = setattr_prepare(&init_user_ns, direntry, attrs);
 	if (rc < 0) {
 		free_xid(xid);
 		return rc;
@@ -2913,7 +2913,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 	    attrs->ia_size != i_size_read(inode))
 		truncate_setsize(inode, attrs->ia_size);
 
-	setattr_copy(inode, attrs);
+	setattr_copy(&init_user_ns, inode, attrs);
 	mark_inode_dirty(inode);
 
 cifs_setattr_exit:
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 0b346baf110d..d3ea0c57b075 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -855,7 +855,8 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 		struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 
 		inode_lock(d_inode(lower_dentry));
-		rc = notify_change(lower_dentry, &lower_ia, NULL);
+		rc = notify_change(&init_user_ns, lower_dentry,
+				   &lower_ia, NULL);
 		inode_unlock(d_inode(lower_dentry));
 	}
 	return rc;
@@ -934,7 +935,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
 
-	rc = setattr_prepare(dentry, ia);
+	rc = setattr_prepare(&init_user_ns, dentry, ia);
 	if (rc)
 		goto out;
 	if (ia->ia_valid & ATTR_SIZE) {
@@ -960,7 +961,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 		lower_ia.ia_valid &= ~ATTR_MODE;
 
 	inode_lock(d_inode(lower_dentry));
-	rc = notify_change(lower_dentry, &lower_ia, NULL);
+	rc = notify_change(&init_user_ns, lower_dentry, &lower_ia, NULL);
 	inode_unlock(d_inode(lower_dentry));
 out:
 	fsstack_copy_attr_all(inode, lower_inode);
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index a92478eabfa4..ace35aa8e64b 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -305,7 +305,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr)
 				ATTR_TIMES_SET);
 	}
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	attr->ia_valid = ia_valid;
 	if (error)
 		goto out;
@@ -340,7 +340,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr)
 		up_write(&EXFAT_I(inode)->truncate_lock);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	exfat_truncate_atime(&inode->i_atime);
 	mark_inode_dirty(inode);
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 78c417d3c898..06c0cf28c1a0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1669,7 +1669,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, iattr);
+	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		return error;
 
@@ -1689,7 +1689,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 		if (error)
 			return error;
 	}
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	if (iattr->ia_valid & ATTR_MODE)
 		error = posix_acl_chmod(inode, inode->i_mode);
 	mark_inode_dirty(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c173c8405856..8edfa3e226e6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5337,7 +5337,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				  ATTR_GID | ATTR_TIMES_SET))))
 		return -EPERM;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -5512,7 +5512,7 @@ out_mmap_sem:
 	}
 
 	if (!error) {
-		setattr_copy(inode, attr);
+		setattr_copy(&init_user_ns, inode, attr);
 		mark_inode_dirty(inode);
 	}
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5fc0ff28b5dd..90d7b89176de 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -831,7 +831,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 }
 
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
-static void __setattr_copy(struct inode *inode, const struct iattr *attr)
+static void __setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
+			   const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -847,8 +848,9 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
 		inode->i_ctime = attr->ia_ctime;
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
+		kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
 
-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+		if (!in_group_p(kgid) && !capable(CAP_FSETID))
 			mode &= ~S_ISGID;
 		set_acl_inode(inode, mode);
 	}
@@ -869,7 +871,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 		!f2fs_is_compress_backend_ready(inode))
 		return -EOPNOTSUPP;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
@@ -945,7 +947,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 		spin_unlock(&F2FS_I(inode)->i_size_lock);
 	}
 
-	__setattr_copy(inode, attr);
+	__setattr_copy(&init_user_ns, inode, attr);
 
 	if (attr->ia_valid & ATTR_MODE) {
 		err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode));
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f9ee27cf4d7c..805b501467e9 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -480,7 +480,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 			attr->ia_valid &= ~TIMES_SET_FLAGS;
 	}
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	attr->ia_valid = ia_valid;
 	if (error) {
 		if (sbi->options.quiet)
@@ -550,7 +550,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 		fat_truncate_time(inode, &attr->ia_mtime, S_MTIME);
 	attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME);
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 out:
 	return error;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 7497009a5a45..74fdb6a7ebb3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1611,7 +1611,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	if (!fc->default_permissions)
 		attr->ia_valid |= ATTR_FORCE;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5b2ff0c74b67..59c25181d108 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1861,7 +1861,7 @@ int gfs2_permission(struct inode *inode, int mask)
 
 static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr)
 {
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
@@ -1982,7 +1982,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 		goto error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		goto error;
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index f35a37c65e5f..c646218b72bf 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -608,7 +608,7 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 	struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
 	int error;
 
-	error = setattr_prepare(dentry, attr); /* basic permission checks */
+	error = setattr_prepare(&init_user_ns, dentry, attr); /* basic permission checks */
 	if (error)
 		return error;
 
@@ -647,7 +647,7 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 						  current_time(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 21357046b039..ffa137f8234e 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -246,7 +246,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -264,7 +264,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 		inode->i_mtime = inode->i_ctime = current_time(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 
 	return 0;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index b841a05a2b8c..6970e29a5287 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -792,7 +792,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	int fd = HOSTFS_I(inode)->fd;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
@@ -849,7 +849,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 	    attr->ia_size != i_size_read(inode))
 		truncate_setsize(inode, attr->ia_size);
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index eb8b4baf0f2e..8ba2152a78ba 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -274,7 +274,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
 		goto out_unlock;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		goto out_unlock;
 
@@ -288,7 +288,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 		hpfs_truncate(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 
 	hpfs_write_inode(inode);
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6737929e443c..327e572b4e00 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -761,7 +761,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	BUG_ON(!inode);
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -780,7 +780,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 			return error;
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/inode.c b/fs/inode.c
index a9ac97a27784..49b512592dcd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1912,7 +1912,7 @@ static int __remove_privs(struct dentry *dentry, int kill)
 	 * Note we call this on write, so notify_change will not
 	 * encounter any conflicting delegations:
 	 */
-	return notify_change(dentry, &newattrs, NULL);
+	return notify_change(&init_user_ns, dentry, &newattrs, NULL);
 }
 
 /*
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 78858f6e9583..67993808f4da 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -195,7 +195,7 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct inode *inode = d_inode(dentry);
 	int rc;
 
-	rc = setattr_prepare(dentry, iattr);
+	rc = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (rc)
 		return rc;
 
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 930d2701f206..ff49876e9c9b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -90,7 +90,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct inode *inode = d_inode(dentry);
 	int rc;
 
-	rc = setattr_prepare(dentry, iattr);
+	rc = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (rc)
 		return rc;
 
@@ -118,7 +118,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		jfs_truncate(inode);
 	}
 
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	mark_inode_dirty(inode);
 
 	if (iattr->ia_valid & ATTR_MODE)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index ff5598cc1de0..86bd4c593b78 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -122,7 +122,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
 		return -EINVAL;
 
 	mutex_lock(&kernfs_mutex);
-	error = setattr_prepare(dentry, iattr);
+	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		goto out;
 
@@ -131,7 +131,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
 		goto out;
 
 	/* this ignores size changes */
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 
 out:
 	mutex_unlock(&kernfs_mutex);
diff --git a/fs/libfs.c b/fs/libfs.c
index f8b3c02b4f0f..a73fe109403c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -497,13 +497,13 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, iattr);
+	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		return error;
 
 	if (iattr->ia_valid & ATTR_SIZE)
 		truncate_setsize(inode, iattr->ia_size);
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/minix/file.c b/fs/minix/file.c
index c50b0a20fcd9..f07acd268577 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -27,7 +27,7 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -41,7 +41,7 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
 		minix_truncate(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 9473d048efec..0ea0554d20d1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -90,7 +90,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
 		if (delta < 0)
 			delta = -delta;
 		if (delta < MAX_TOUCH_TIME_ERROR &&
-		    setattr_prepare(fhp->fh_dentry, iap) != 0) {
+		    setattr_prepare(&init_user_ns, fhp->fh_dentry, iap) != 0) {
 			/*
 			 * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME.
 			 * This will cause notify_change to set these times
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 0edf11258aaa..1905b39be1c2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -448,7 +448,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 			.ia_size	= iap->ia_size,
 		};
 
-		host_err = notify_change(dentry, &size_attr, NULL);
+		host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
 		if (host_err)
 			goto out_unlock;
 		iap->ia_valid &= ~ATTR_SIZE;
@@ -463,7 +463,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	}
 
 	iap->ia_valid |= ATTR_CTIME;
-	host_err = notify_change(dentry, iap, NULL);
+	host_err = notify_change(&init_user_ns, dentry, iap, NULL);
 
 out_unlock:
 	fh_unlock(fhp);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 11225a659736..8aad3c48092a 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -812,7 +812,7 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct super_block *sb = inode->i_sb;
 	int err;
 
-	err = setattr_prepare(dentry, iattr);
+	err = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (err)
 		return err;
 
@@ -827,7 +827,7 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		nilfs_truncate(inode);
 	}
 
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	mark_inode_dirty(inode);
 
 	if (iattr->ia_valid & ATTR_MODE) {
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index f7e4cbc26eaf..38f4cf1d4497 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2866,7 +2866,7 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 	int err;
 	unsigned int ia_valid = attr->ia_valid;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		goto out;
 	/* We do not support NTFS ACLs yet. */
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 37c7d03a6284..9fa66cd1f622 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -196,11 +196,11 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_inode(dentry);
 
 	attr->ia_valid &= ~ATTR_SIZE;
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0c75619adf54..cabf355b148f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1142,7 +1142,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
 		return 0;
 
-	status = setattr_prepare(dentry, attr);
+	status = setattr_prepare(&init_user_ns, dentry, attr);
 	if (status)
 		return status;
 
@@ -1263,7 +1263,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 
 	status = ocfs2_mark_inode_dirty(handle, inode, bh);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 2c7b70ee1388..729339cd7902 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -348,7 +348,7 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -361,7 +361,7 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
 		omfs_truncate(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/open.c b/fs/open.c
index a6dac6d97988..c3e4dc43dd8d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -61,7 +61,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 
 	inode_lock(dentry->d_inode);
 	/* Note any delegations or leases have already been broken: */
-	ret = notify_change(dentry, &newattrs, NULL);
+	ret = notify_change(&init_user_ns, dentry, &newattrs, NULL);
 	inode_unlock(dentry->d_inode);
 	return ret;
 }
@@ -580,7 +580,8 @@ retry_deleg:
 		goto out_unlock;
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-	error = notify_change(path->dentry, &newattrs, &delegated_inode);
+	error = notify_change(&init_user_ns, path->dentry, &newattrs,
+			      &delegated_inode);
 out_unlock:
 	inode_unlock(inode);
 	if (delegated_inode) {
@@ -671,7 +672,8 @@ retry_deleg:
 	inode_lock(inode);
 	error = security_path_chown(path, uid, gid);
 	if (!error)
-		error = notify_change(path->dentry, &newattrs, &delegated_inode);
+		error = notify_change(&init_user_ns, path->dentry, &newattrs,
+				      &delegated_inode);
 	inode_unlock(inode);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 4c790cc8042d..8ac9491ceb9a 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -855,7 +855,7 @@ again:
 		ORANGEFS_I(inode)->attr_uid = current_fsuid();
 		ORANGEFS_I(inode)->attr_gid = current_fsgid();
 	}
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	spin_unlock(&inode->i_lock);
 	mark_inode_dirty(inode);
 
@@ -876,7 +876,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
 	int ret;
 	gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n",
 	    dentry);
-	ret = setattr_prepare(dentry, iattr);
+	ret = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (ret)
 	        goto out;
 	ret = __orangefs_setattr(d_inode(dentry), iattr);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index e5b616c93e11..3e9957ae19fa 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -235,7 +235,7 @@ static int ovl_set_size(struct dentry *upperdentry, struct kstat *stat)
 		.ia_size = stat->size,
 	};
 
-	return notify_change(upperdentry, &attr, NULL);
+	return notify_change(&init_user_ns, upperdentry, &attr, NULL);
 }
 
 static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
@@ -247,7 +247,7 @@ static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
 		.ia_mtime = stat->mtime,
 	};
 
-	return notify_change(upperdentry, &attr, NULL);
+	return notify_change(&init_user_ns, upperdentry, &attr, NULL);
 }
 
 int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
@@ -259,7 +259,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
 			.ia_valid = ATTR_MODE,
 			.ia_mode = stat->mode,
 		};
-		err = notify_change(upperdentry, &attr, NULL);
+		err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
 	}
 	if (!err) {
 		struct iattr attr = {
@@ -267,7 +267,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
 			.ia_uid = stat->uid,
 			.ia_gid = stat->gid,
 		};
-		err = notify_change(upperdentry, &attr, NULL);
+		err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
 	}
 	if (!err)
 		ovl_set_timestamps(upperdentry, stat);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 98a23353b19a..29840820a46c 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -508,7 +508,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
 			.ia_mode = cattr->mode,
 		};
 		inode_lock(newdentry->d_inode);
-		err = notify_change(newdentry, &attr, NULL);
+		err = notify_change(&init_user_ns, newdentry, &attr, NULL);
 		inode_unlock(newdentry->d_inode);
 		if (err)
 			goto out_cleanup;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c101ebbb7a77..5aa66881dbd7 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -21,7 +21,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
 	struct dentry *upperdentry;
 	const struct cred *old_cred;
 
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
@@ -79,7 +79,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
 
 		inode_lock(upperdentry->d_inode);
 		old_cred = ovl_override_creds(dentry->d_sb);
-		err = notify_change(upperdentry, attr, NULL);
+		err = notify_change(&init_user_ns, upperdentry, attr, NULL);
 		revert_creds(old_cred);
 		if (!err)
 			ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 3e925deaa19a..39b2e9aa0e5b 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -804,7 +804,7 @@ retry:
 
 		/* Clear any inherited mode bits */
 		inode_lock(work->d_inode);
-		err = notify_change(work, &attr, NULL);
+		err = notify_change(&init_user_ns, work, &attr, NULL);
 		inode_unlock(work->d_inode);
 		if (err)
 			goto out_dput;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b4ec9293625e..bb4e63a3684f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -693,11 +693,11 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
 	if (attr->ia_valid & ATTR_MODE)
 		return -EPERM;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 6c0a05f55d6b..6d4fabab8aa7 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -121,11 +121,11 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	struct proc_dir_entry *de = PDE(inode);
 	int error;
 
-	error = setattr_prepare(dentry, iattr);
+	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		return error;
 
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 	mark_inode_dirty(inode);
 
 	proc_set_user(de, inode->i_uid, inode->i_gid);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 317899222d7f..ec67dbc1f705 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -821,11 +821,11 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 	if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
 		return -EPERM;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 355523f4a4bf..f0358fe410d3 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -165,7 +165,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 	int ret = 0;
 
 	/* POSIX UID/GID verification for setting inode attributes */
-	ret = setattr_prepare(dentry, ia);
+	ret = setattr_prepare(&init_user_ns, dentry, ia);
 	if (ret)
 		return ret;
 
@@ -185,7 +185,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 		}
 	}
 
-	setattr_copy(inode, ia);
+	setattr_copy(&init_user_ns, inode, ia);
  out:
 	ia->ia_valid = old_ia_valid;
 	return ret;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index c76d563dec0e..944f2b487cf8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3288,7 +3288,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	unsigned int ia_valid;
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -3413,7 +3413,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (!error) {
-		setattr_copy(inode, attr);
+		setattr_copy(&init_user_ns, inode, attr);
 		mark_inode_dirty(inode);
 	}
 
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 45fc79a18594..ca7e216b7b9e 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -34,7 +34,7 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -47,7 +47,7 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
 		sysv_truncate(inode);
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2bc7780d2963..76ef392b1e41 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1265,7 +1265,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	dbg_gen("ino %lu, mode %#x, ia_valid %#x",
 		inode->i_ino, inode->i_mode, attr->ia_valid);
-	err = setattr_prepare(dentry, attr);
+	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err)
 		return err;
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 3671a40ed3c3..7c7d161315c2 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -259,7 +259,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
 	struct super_block *sb = inode->i_sb;
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -282,7 +282,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
 	if (attr->ia_valid & ATTR_MODE)
 		udf_update_extra_perms(inode, attr->ia_mode);
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c843ec858cf7..6b51f3b20143 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -1217,7 +1217,7 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 	unsigned int ia_valid = attr->ia_valid;
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -1227,7 +1227,7 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 			return error;
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
diff --git a/fs/utimes.c b/fs/utimes.c
index fd3cc4226224..4572b91ddb91 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -62,7 +62,8 @@ int vfs_utimes(const struct path *path, struct timespec64 *times)
 	}
 retry_deleg:
 	inode_lock(inode);
-	error = notify_change(path->dentry, &newattrs, &delegated_inode);
+	error = notify_change(&init_user_ns, path->dentry, &newattrs,
+			      &delegated_inode);
 	inode_unlock(inode);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 67c8dc9de8aa..08a478d25122 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -637,7 +637,7 @@ xfs_vn_change_ok(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	return setattr_prepare(dentry, iattr);
+	return setattr_prepare(&init_user_ns, dentry, iattr);
 }
 
 /*
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 569525ee8f69..8a1f69677784 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -488,7 +488,7 @@ static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (unlikely(IS_IMMUTABLE(inode)))
 		return -EPERM;
 
-	ret = setattr_prepare(dentry, iattr);
+	ret = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (ret)
 		return ret;
 
@@ -516,7 +516,7 @@ static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr)
 			return ret;
 	}
 
-	setattr_copy(inode, iattr);
+	setattr_copy(&init_user_ns, inode, iattr);
 
 	return 0;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2a9d4af6a64d..e3ea1d7c3367 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2809,7 +2809,8 @@ static inline int bmap(struct inode *inode,  sector_t *block)
 }
 #endif
 
-extern int notify_change(struct dentry *, struct iattr *, struct inode **);
+int notify_change(struct user_namespace *, struct dentry *,
+		  struct iattr *, struct inode **);
 int inode_permission(struct user_namespace *, struct inode *, int);
 int generic_permission(struct user_namespace *, struct inode *, int);
 static inline int file_permission(struct file *file, int mask)
@@ -3274,9 +3275,10 @@ extern int buffer_migrate_page_norefs(struct address_space *,
 #define buffer_migrate_page_norefs NULL
 #endif
 
-extern int setattr_prepare(struct dentry *, struct iattr *);
+int setattr_prepare(struct user_namespace *, struct dentry *, struct iattr *);
 extern int inode_newsize_ok(const struct inode *, loff_t offset);
-extern void setattr_copy(struct inode *inode, const struct iattr *attr);
+void setattr_copy(struct user_namespace *, struct inode *inode,
+		  const struct iattr *attr);
 
 extern int file_update_time(struct file *file);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 1c68c9edba5e..1cb451e131ec 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1087,7 +1087,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	int error;
 
-	error = setattr_prepare(dentry, attr);
+	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
 		return error;
 
@@ -1141,7 +1141,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 
-	setattr_copy(inode, attr);
+	setattr_copy(&init_user_ns, inode, attr);
 	if (attr->ia_valid & ATTR_MODE)
 		error = posix_acl_chmod(inode, inode->i_mode);
 	return error;
-- 
cgit v1.2.3


From e65ce2a50cf6af216bea6fd80d771fcbb4c0aaa1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:27 +0100
Subject: acl: handle idmapped mounts

The posix acl permission checking helpers determine whether a caller is
privileged over an inode according to the acls associated with the
inode. Add helpers that make it possible to handle acls on idmapped
mounts.

The vfs and the filesystems targeted by this first iteration make use of
posix_acl_fix_xattr_from_user() and posix_acl_fix_xattr_to_user() to
translate basic posix access and default permissions such as the
ACL_USER and ACL_GROUP type according to the initial user namespace (or
the superblock's user namespace) to and from the caller's current user
namespace. Adapt these two helpers to handle idmapped mounts whereby we
either map from or into the mount's user namespace depending on in which
direction we're translating.
Similarly, cap_convert_nscap() is used by the vfs to translate user
namespace and non-user namespace aware filesystem capabilities from the
superblock's user namespace to the caller's user namespace. Enable it to
handle idmapped mounts by accounting for the mount's user namespace.

In addition the fileystems targeted in the first iteration of this patch
series make use of the posix_acl_chmod() and, posix_acl_update_mode()
helpers. Both helpers perform permission checks on the target inode. Let
them handle idmapped mounts. These two helpers are called when posix
acls are set by the respective filesystems to handle this case we extend
the ->set() method to take an additional user namespace argument to pass
the mount's user namespace down.

Link: https://lore.kernel.org/r/20210121131959.646623-9-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 Documentation/filesystems/locking.rst |  7 ++--
 Documentation/filesystems/porting.rst |  2 +
 fs/9p/acl.c                           |  4 +-
 fs/9p/xattr.c                         |  1 +
 fs/afs/xattr.c                        |  2 +
 fs/btrfs/acl.c                        |  3 +-
 fs/btrfs/inode.c                      |  3 +-
 fs/btrfs/xattr.c                      |  2 +
 fs/ceph/acl.c                         |  3 +-
 fs/ceph/inode.c                       |  2 +-
 fs/ceph/xattr.c                       |  1 +
 fs/cifs/xattr.c                       |  1 +
 fs/ecryptfs/inode.c                   |  1 +
 fs/ext2/acl.c                         |  3 +-
 fs/ext2/inode.c                       |  2 +-
 fs/ext2/xattr_security.c              |  1 +
 fs/ext2/xattr_trusted.c               |  1 +
 fs/ext2/xattr_user.c                  |  1 +
 fs/ext4/acl.c                         |  3 +-
 fs/ext4/inode.c                       |  2 +-
 fs/ext4/xattr_hurd.c                  |  1 +
 fs/ext4/xattr_security.c              |  1 +
 fs/ext4/xattr_trusted.c               |  1 +
 fs/ext4/xattr_user.c                  |  1 +
 fs/f2fs/acl.c                         |  3 +-
 fs/f2fs/file.c                        |  7 ++--
 fs/f2fs/xattr.c                       |  2 +
 fs/fuse/xattr.c                       |  2 +
 fs/gfs2/acl.c                         |  2 +-
 fs/gfs2/inode.c                       |  3 +-
 fs/gfs2/xattr.c                       |  1 +
 fs/hfs/attr.c                         |  1 +
 fs/hfsplus/xattr.c                    |  1 +
 fs/hfsplus/xattr_security.c           |  1 +
 fs/hfsplus/xattr_trusted.c            |  1 +
 fs/hfsplus/xattr_user.c               |  1 +
 fs/jffs2/acl.c                        |  3 +-
 fs/jffs2/fs.c                         |  2 +-
 fs/jffs2/security.c                   |  1 +
 fs/jffs2/xattr_trusted.c              |  1 +
 fs/jffs2/xattr_user.c                 |  1 +
 fs/jfs/acl.c                          |  2 +-
 fs/jfs/file.c                         |  2 +-
 fs/jfs/xattr.c                        |  2 +
 fs/kernfs/inode.c                     |  2 +
 fs/nfs/nfs4proc.c                     |  3 ++
 fs/nfsd/nfs2acl.c                     |  6 ++-
 fs/nfsd/nfs3acl.c                     |  6 ++-
 fs/nfsd/nfs4acl.c                     |  5 ++-
 fs/ocfs2/acl.c                        |  3 +-
 fs/ocfs2/xattr.c                      |  3 ++
 fs/orangefs/acl.c                     |  3 +-
 fs/orangefs/inode.c                   |  2 +-
 fs/orangefs/xattr.c                   |  1 +
 fs/overlayfs/super.c                  |  3 ++
 fs/posix_acl.c                        | 79 +++++++++++++++++++++++++----------
 fs/reiserfs/xattr_acl.c               |  5 ++-
 fs/reiserfs/xattr_security.c          |  3 +-
 fs/reiserfs/xattr_trusted.c           |  3 +-
 fs/reiserfs/xattr_user.c              |  3 +-
 fs/ubifs/xattr.c                      |  1 +
 fs/xattr.c                            | 14 ++++---
 fs/xfs/xfs_acl.c                      |  3 +-
 fs/xfs/xfs_iops.c                     |  2 +-
 fs/xfs/xfs_xattr.c                    |  7 ++--
 include/linux/capability.h            |  3 +-
 include/linux/posix_acl.h             | 11 +++--
 include/linux/posix_acl_xattr.h       | 12 ++++--
 include/linux/xattr.h                 |  3 +-
 mm/shmem.c                            |  3 +-
 net/socket.c                          |  1 +
 security/commoncap.c                  | 45 ++++++++++++++++----
 72 files changed, 238 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index c0f2c7586531..b7dcc86c92a4 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -126,9 +126,10 @@ prototypes::
 	int (*get)(const struct xattr_handler *handler, struct dentry *dentry,
 		   struct inode *inode, const char *name, void *buffer,
 		   size_t size);
-	int (*set)(const struct xattr_handler *handler, struct dentry *dentry,
-		   struct inode *inode, const char *name, const void *buffer,
-		   size_t size, int flags);
+	int (*set)(const struct xattr_handler *handler,
+                   struct user_namespace *mnt_userns,
+                   struct dentry *dentry, struct inode *inode, const char *name,
+                   const void *buffer, size_t size, int flags);
 
 locking rules:
 	all may block
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 867036aa90b8..de1dcec3b5b8 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -717,6 +717,8 @@ be removed.  Switch while you still can; the old one won't stay.
 **mandatory**
 
 ->setxattr() and xattr_handler.set() get dentry and inode passed separately.
+The xattr_handler.set() gets passed the user namespace of the mount the inode
+is seen from so filesystems can idmap the i_uid and i_gid accordingly.
 dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
 in the instances.  Rationale: !@#!@# security_d_instantiate() needs to be
 called before we attach dentry to inode and !@#!@##!@$!$#!@#$!@$!@$ smack
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index d77b28e8d57a..1c14f18a6ec9 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -239,6 +239,7 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
+			      struct user_namespace *mnt_userns,
 			      struct dentry *dentry, struct inode *inode,
 			      const char *name, const void *value,
 			      size_t size, int flags)
@@ -279,7 +280,8 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 			struct iattr iattr = { 0 };
 			struct posix_acl *old_acl = acl;
 
-			retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
+			retval = posix_acl_update_mode(mnt_userns, inode,
+						       &iattr.ia_mode, &acl);
 			if (retval)
 				goto err_out;
 			if (!acl) {
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 87217dd0433e..ee331845e2c7 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -157,6 +157,7 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
+				  struct user_namespace *mnt_userns,
 				  struct dentry *dentry, struct inode *inode,
 				  const char *name, const void *value,
 				  size_t size, int flags)
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 95c573dcda11..c629caae5002 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -120,6 +120,7 @@ static const struct afs_operation_ops afs_store_acl_operation = {
  * Set a file's AFS3 ACL.
  */
 static int afs_xattr_set_acl(const struct xattr_handler *handler,
+			     struct user_namespace *mnt_userns,
                              struct dentry *dentry,
                              struct inode *inode, const char *name,
                              const void *buffer, size_t size, int flags)
@@ -248,6 +249,7 @@ static const struct afs_operation_ops yfs_store_opaque_acl2_operation = {
  * Set a file's YFS ACL.
  */
 static int afs_xattr_set_yfs(const struct xattr_handler *handler,
+			     struct user_namespace *mnt_userns,
                              struct dentry *dentry,
                              struct inode *inode, const char *name,
                              const void *buffer, size_t size, int flags)
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index a0af1b952c4d..d12a5a8730a8 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -113,7 +113,8 @@ int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	umode_t old_mode = inode->i_mode;
 
 	if (type == ACL_TYPE_ACCESS && acl) {
-		ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+		ret = posix_acl_update_mode(&init_user_ns, inode,
+					    &inode->i_mode, &acl);
 		if (ret)
 			return ret;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 792191a8705b..6c18fb1a25af 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5070,7 +5070,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		err = btrfs_dirty_inode(inode);
 
 		if (!err && attr->ia_valid & ATTR_MODE)
-			err = posix_acl_chmod(inode, inode->i_mode);
+			err = posix_acl_chmod(&init_user_ns, inode,
+					      inode->i_mode);
 	}
 
 	return err;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index af6246f36a9e..b025102e435f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -362,6 +362,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
+				   struct user_namespace *mnt_userns,
 				   struct dentry *unused, struct inode *inode,
 				   const char *name, const void *buffer,
 				   size_t size, int flags)
@@ -371,6 +372,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
 }
 
 static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
+					struct user_namespace *mnt_userns,
 					struct dentry *unused, struct inode *inode,
 					const char *name, const void *value,
 					size_t size, int flags)
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index e0465741c591..52a01ddbc4ac 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -100,7 +100,8 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	case ACL_TYPE_ACCESS:
 		name = XATTR_NAME_POSIX_ACL_ACCESS;
 		if (acl) {
-			ret = posix_acl_update_mode(inode, &new_mode, &acl);
+			ret = posix_acl_update_mode(&init_user_ns, inode,
+						    &new_mode, &acl);
 			if (ret)
 				goto out;
 		}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 285d3baca27e..145e26a4ddbb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2262,7 +2262,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	err = __ceph_setattr(inode, attr);
 
 	if (err >= 0 && (attr->ia_valid & ATTR_MODE))
-		err = posix_acl_chmod(inode, attr->ia_mode);
+		err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode);
 
 	return err;
 }
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 24997982de01..02f59bcb4f27 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1238,6 +1238,7 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler,
 }
 
 static int ceph_set_xattr_handler(const struct xattr_handler *handler,
+				  struct user_namespace *mnt_userns,
 				  struct dentry *unused, struct inode *inode,
 				  const char *name, const void *value,
 				  size_t size, int flags)
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 6b658a1172ef..41a611e76bb7 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -101,6 +101,7 @@ static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon,
 }
 
 static int cifs_xattr_set(const struct xattr_handler *handler,
+			  struct user_namespace *mnt_userns,
 			  struct dentry *dentry, struct inode *inode,
 			  const char *name, const void *value,
 			  size_t size, int flags)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d3ea0c57b075..ac6472a82567 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1133,6 +1133,7 @@ static int ecryptfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int ecryptfs_xattr_set(const struct xattr_handler *handler,
+			      struct user_namespace *mnt_userns,
 			      struct dentry *dentry, struct inode *inode,
 			      const char *name, const void *value, size_t size,
 			      int flags)
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index cf4c77f8dd08..9031f7df2d48 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -223,7 +223,8 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	umode_t mode = inode->i_mode;
 
 	if (type == ACL_TYPE_ACCESS && acl) {
-		error = posix_acl_update_mode(inode, &mode, &acl);
+		error = posix_acl_update_mode(&init_user_ns, inode, &mode,
+					      &acl);
 		if (error)
 			return error;
 		update_mode = 1;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 06c0cf28c1a0..9de813635d8d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1691,7 +1691,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	}
 	setattr_copy(&init_user_ns, inode, iattr);
 	if (iattr->ia_valid & ATTR_MODE)
-		error = posix_acl_chmod(inode, inode->i_mode);
+		error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 	mark_inode_dirty(inode);
 
 	return error;
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 9a682e440acb..ebade1f52451 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -19,6 +19,7 @@ ext2_xattr_security_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_security_set(const struct xattr_handler *handler,
+			struct user_namespace *mnt_userns,
 			struct dentry *unused, struct inode *inode,
 			const char *name, const void *value,
 			size_t size, int flags)
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 49add1107850..18a87d5dd1ab 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -26,6 +26,7 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_trusted_set(const struct xattr_handler *handler,
+		       struct user_namespace *mnt_userns,
 		       struct dentry *unused, struct inode *inode,
 		       const char *name, const void *value,
 		       size_t size, int flags)
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index c243a3b4d69d..58092449f8ff 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -30,6 +30,7 @@ ext2_xattr_user_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_user_set(const struct xattr_handler *handler,
+		    struct user_namespace *mnt_userns,
 		    struct dentry *unused, struct inode *inode,
 		    const char *name, const void *value,
 		    size_t size, int flags)
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 68aaed48315f..7b0fb66bc04d 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -245,7 +245,8 @@ retry:
 	ext4_fc_start_update(inode);
 
 	if ((type == ACL_TYPE_ACCESS) && acl) {
-		error = posix_acl_update_mode(inode, &mode, &acl);
+		error = posix_acl_update_mode(&init_user_ns, inode, &mode,
+					      &acl);
 		if (error)
 			goto out_stop;
 		if (mode != inode->i_mode)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8edfa3e226e6..24ea5851e90a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5524,7 +5524,7 @@ out_mmap_sem:
 		ext4_orphan_del(NULL, inode);
 
 	if (!error && (ia_valid & ATTR_MODE))
-		rc = posix_acl_chmod(inode, inode->i_mode);
+		rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 
 err_out:
 	if  (error)
diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c
index 8cfa74a56361..c78df5790377 100644
--- a/fs/ext4/xattr_hurd.c
+++ b/fs/ext4/xattr_hurd.c
@@ -32,6 +32,7 @@ ext4_xattr_hurd_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_hurd_set(const struct xattr_handler *handler,
+		    struct user_namespace *mnt_userns,
 		    struct dentry *unused, struct inode *inode,
 		    const char *name, const void *value,
 		    size_t size, int flags)
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 197a9d8a15ef..8213f66f7b2d 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -23,6 +23,7 @@ ext4_xattr_security_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_security_set(const struct xattr_handler *handler,
+			struct user_namespace *mnt_userns,
 			struct dentry *unused, struct inode *inode,
 			const char *name, const void *value,
 			size_t size, int flags)
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index e9389e5d75c3..7c21ffb26d25 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -30,6 +30,7 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_trusted_set(const struct xattr_handler *handler,
+		       struct user_namespace *mnt_userns,
 		       struct dentry *unused, struct inode *inode,
 		       const char *name, const void *value,
 		       size_t size, int flags)
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d4546184b34b..2fe7ff0a479c 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -31,6 +31,7 @@ ext4_xattr_user_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_user_set(const struct xattr_handler *handler,
+		    struct user_namespace *mnt_userns,
 		    struct dentry *unused, struct inode *inode,
 		    const char *name, const void *value,
 		    size_t size, int flags)
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 1e5e9b1136ee..6a95bf28f602 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -213,7 +213,8 @@ static int __f2fs_set_acl(struct inode *inode, int type,
 	case ACL_TYPE_ACCESS:
 		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
 		if (acl && !ipage) {
-			error = posix_acl_update_mode(inode, &mode, &acl);
+			error = posix_acl_update_mode(&init_user_ns, inode,
+						      &mode, &acl);
 			if (error)
 				return error;
 			set_acl_inode(inode, mode);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 90d7b89176de..6ccdfe0606d9 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -831,8 +831,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 }
 
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
-static void __setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
-			   const struct iattr *attr)
+static void __setattr_copy(struct user_namespace *mnt_userns,
+			   struct inode *inode, const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -950,7 +950,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 	__setattr_copy(&init_user_ns, inode, attr);
 
 	if (attr->ia_valid & ATTR_MODE) {
-		err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode));
+		err = posix_acl_chmod(&init_user_ns, inode,
+				      f2fs_get_inode_mode(inode));
 		if (err || is_inode_flag_set(inode, FI_ACL_MODE)) {
 			inode->i_mode = F2FS_I(inode)->i_acl_mode;
 			clear_inode_flag(inode, FI_ACL_MODE);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index d772bf13a814..10081bf74324 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -64,6 +64,7 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 }
 
 static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
+		struct user_namespace *mnt_userns,
 		struct dentry *unused, struct inode *inode,
 		const char *name, const void *value,
 		size_t size, int flags)
@@ -107,6 +108,7 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
 }
 
 static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
+		struct user_namespace *mnt_userns,
 		struct dentry *unused, struct inode *inode,
 		const char *name, const void *value,
 		size_t size, int flags)
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index cdea18de94f7..1a7d7ace54e1 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -188,6 +188,7 @@ static int fuse_xattr_get(const struct xattr_handler *handler,
 }
 
 static int fuse_xattr_set(const struct xattr_handler *handler,
+			  struct user_namespace *mnt_userns,
 			  struct dentry *dentry, struct inode *inode,
 			  const char *name, const void *value, size_t size,
 			  int flags)
@@ -214,6 +215,7 @@ static int no_xattr_get(const struct xattr_handler *handler,
 }
 
 static int no_xattr_set(const struct xattr_handler *handler,
+			struct user_namespace *mnt_userns,
 			struct dentry *dentry, struct inode *nodee,
 			const char *name, const void *value,
 			size_t size, int flags)
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 2e939f5fe751..ce88ef29eef0 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -130,7 +130,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 
 	mode = inode->i_mode;
 	if (type == ACL_TYPE_ACCESS && acl) {
-		ret = posix_acl_update_mode(inode, &mode, &acl);
+		ret = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl);
 		if (ret)
 			goto unlock;
 	}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 59c25181d108..728405d15a05 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1993,7 +1993,8 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	else {
 		error = gfs2_setattr_simple(inode, attr);
 		if (!error && attr->ia_valid & ATTR_MODE)
-			error = posix_acl_chmod(inode, inode->i_mode);
+			error = posix_acl_chmod(&init_user_ns, inode,
+						inode->i_mode);
 	}
 
 error:
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 9d7667bc4292..13969a813410 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1214,6 +1214,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
 }
 
 static int gfs2_xattr_set(const struct xattr_handler *handler,
+			  struct user_namespace *mnt_userns,
 			  struct dentry *unused, struct inode *inode,
 			  const char *name, const void *value,
 			  size_t size, int flags)
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 74fa62643136..2bd54efaf416 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -121,6 +121,7 @@ static int hfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int hfs_xattr_set(const struct xattr_handler *handler,
+			 struct user_namespace *mnt_userns,
 			 struct dentry *unused, struct inode *inode,
 			 const char *name, const void *value, size_t size,
 			 int flags)
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index bb0b27d88e50..4d169c5a2673 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -858,6 +858,7 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
+				struct user_namespace *mnt_userns,
 				struct dentry *unused, struct inode *inode,
 				const char *name, const void *buffer,
 				size_t size, int flags)
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index cfbe6a3bfb1e..c1c7a16cbf21 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -23,6 +23,7 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_security_setxattr(const struct xattr_handler *handler,
+				     struct user_namespace *mnt_userns,
 				     struct dentry *unused, struct inode *inode,
 				     const char *name, const void *buffer,
 				     size_t size, int flags)
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index fbad91e1dada..e150372ec564 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -22,6 +22,7 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
+				    struct user_namespace *mnt_userns,
 				    struct dentry *unused, struct inode *inode,
 				    const char *name, const void *buffer,
 				    size_t size, int flags)
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 74d19faf255e..a6b60b153916 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -22,6 +22,7 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_user_setxattr(const struct xattr_handler *handler,
+				 struct user_namespace *mnt_userns,
 				 struct dentry *unused, struct inode *inode,
 				 const char *name, const void *buffer,
 				 size_t size, int flags)
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 093ffbd82395..5f27ac593479 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -236,7 +236,8 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		if (acl) {
 			umode_t mode;
 
-			rc = posix_acl_update_mode(inode, &mode, &acl);
+			rc = posix_acl_update_mode(&init_user_ns, inode, &mode,
+						   &acl);
 			if (rc)
 				return rc;
 			if (inode->i_mode != mode) {
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 67993808f4da..ee9f51bab4c6 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -201,7 +201,7 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 
 	rc = jffs2_do_setattr(inode, iattr);
 	if (!rc && (iattr->ia_valid & ATTR_MODE))
-		rc = posix_acl_chmod(inode, inode->i_mode);
+		rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 
 	return rc;
 }
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index c2332e30f218..aef5522551db 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -57,6 +57,7 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_security_setxattr(const struct xattr_handler *handler,
+				   struct user_namespace *mnt_userns,
 				   struct dentry *unused, struct inode *inode,
 				   const char *name, const void *buffer,
 				   size_t size, int flags)
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 5d6030826c52..cc3f24883e7d 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -25,6 +25,7 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
+				  struct user_namespace *mnt_userns,
 				  struct dentry *unused, struct inode *inode,
 				  const char *name, const void *buffer,
 				  size_t size, int flags)
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 9d027b4abcf9..fb945977c013 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -25,6 +25,7 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_user_setxattr(const struct xattr_handler *handler,
+			       struct user_namespace *mnt_userns,
 			       struct dentry *unused, struct inode *inode,
 			       const char *name, const void *buffer,
 			       size_t size, int flags)
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 92cc0ac2d1fc..cf79a34bfada 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -101,7 +101,7 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	tid = txBegin(inode->i_sb, 0);
 	mutex_lock(&JFS_IP(inode)->commit_mutex);
 	if (type == ACL_TYPE_ACCESS && acl) {
-		rc = posix_acl_update_mode(inode, &mode, &acl);
+		rc = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl);
 		if (rc)
 			goto end_tx;
 		if (mode != inode->i_mode)
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index ff49876e9c9b..61c3b0c1fbf6 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -122,7 +122,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	mark_inode_dirty(inode);
 
 	if (iattr->ia_valid & ATTR_MODE)
-		rc = posix_acl_chmod(inode, inode->i_mode);
+		rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 	return rc;
 }
 
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index db41e7803163..f9273f6901c8 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -932,6 +932,7 @@ static int jfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int jfs_xattr_set(const struct xattr_handler *handler,
+			 struct user_namespace *mnt_userns,
 			 struct dentry *unused, struct inode *inode,
 			 const char *name, const void *value,
 			 size_t size, int flags)
@@ -950,6 +951,7 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler,
 }
 
 static int jfs_xattr_set_os2(const struct xattr_handler *handler,
+			     struct user_namespace *mnt_userns,
 			     struct dentry *unused, struct inode *inode,
 			     const char *name, const void *value,
 			     size_t size, int flags)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 86bd4c593b78..7e44052b42e1 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -319,6 +319,7 @@ static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
+				struct user_namespace *mnt_userns,
 				struct dentry *unused, struct inode *inode,
 				const char *suffix, const void *value,
 				size_t size, int flags)
@@ -385,6 +386,7 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
 }
 
 static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
+				     struct user_namespace *mnt_userns,
 				     struct dentry *unused, struct inode *inode,
 				     const char *suffix, const void *value,
 				     size_t size, int flags)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2f4679a62712..a07530cf673d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7491,6 +7491,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 
 static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
+				   struct user_namespace *mnt_userns,
 				   struct dentry *unused, struct inode *inode,
 				   const char *key, const void *buf,
 				   size_t buflen, int flags)
@@ -7513,6 +7514,7 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 
 static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
+				     struct user_namespace *mnt_userns,
 				     struct dentry *unused, struct inode *inode,
 				     const char *key, const void *buf,
 				     size_t buflen, int flags)
@@ -7563,6 +7565,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
 
 #ifdef CONFIG_NFS_V4_2
 static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler,
+				    struct user_namespace *mnt_userns,
 				    struct dentry *unused, struct inode *inode,
 				    const char *key, const void *buf,
 				    size_t buflen, int flags)
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b0f66604532a..b83f222558e3 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -113,10 +113,12 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
 
 	fh_lock(fh);
 
-	error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+	error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+			      argp->acl_access);
 	if (error)
 		goto out_drop_lock;
-	error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+	error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+			      argp->acl_default);
 	if (error)
 		goto out_drop_lock;
 
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 7c30876a31a1..f18ec7e8094d 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -103,10 +103,12 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
 
 	fh_lock(fh);
 
-	error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+	error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+			      argp->acl_access);
 	if (error)
 		goto out_drop_lock;
-	error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+	error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+			      argp->acl_default);
 
 out_drop_lock:
 	fh_unlock(fh);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 71292a0d6f09..eaa3a0cf38f1 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -781,12 +781,13 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
 
 	fh_lock(fhp);
 
-	host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl);
+	host_error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, pacl);
 	if (host_error < 0)
 		goto out_drop_lock;
 
 	if (S_ISDIR(inode->i_mode)) {
-		host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl);
+		host_error = set_posix_acl(&init_user_ns, inode,
+					   ACL_TYPE_DEFAULT, dpacl);
 	}
 
 out_drop_lock:
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 7b07f5df3a29..990756cee4bd 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -274,7 +274,8 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	if (type == ACL_TYPE_ACCESS && acl) {
 		umode_t mode;
 
-		status = posix_acl_update_mode(inode, &mode, &acl);
+		status = posix_acl_update_mode(&init_user_ns, inode, &mode,
+					       &acl);
 		if (status)
 			goto unlock;
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 9ccd19d8f7b1..36ae47a4aef6 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7249,6 +7249,7 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
+				    struct user_namespace *mnt_userns,
 				    struct dentry *unused, struct inode *inode,
 				    const char *name, const void *value,
 				    size_t size, int flags)
@@ -7321,6 +7322,7 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
+				   struct user_namespace *mnt_userns,
 				   struct dentry *unused, struct inode *inode,
 				   const char *name, const void *value,
 				   size_t size, int flags)
@@ -7351,6 +7353,7 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
+				struct user_namespace *mnt_userns,
 				struct dentry *unused, struct inode *inode,
 				const char *name, const void *value,
 				size_t size, int flags)
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index a25e6c890975..628921952d16 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -132,7 +132,8 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		 * and "mode" to the new desired value. It is up to
 		 * us to propagate the new mode back to the server...
 		 */
-		error = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
+		error = posix_acl_update_mode(&init_user_ns, inode,
+					      &iattr.ia_mode, &acl);
 		if (error) {
 			gossip_err("%s: posix_acl_update_mode err: %d\n",
 				   __func__,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 8ac9491ceb9a..563fe9ab8eb2 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -861,7 +861,7 @@ again:
 
 	if (iattr->ia_valid & ATTR_MODE)
 		/* change mod on a file that has ACLs */
-		ret = posix_acl_chmod(inode, inode->i_mode);
+		ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 
 	ret = 0;
 out:
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index bdc285aea360..9a5b757fbd2f 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -526,6 +526,7 @@ out_unlock:
 }
 
 static int orangefs_xattr_set_default(const struct xattr_handler *handler,
+				      struct user_namespace *mnt_userns,
 				      struct dentry *unused,
 				      struct inode *inode,
 				      const char *name,
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 39b2e9aa0e5b..e24c995c5fd4 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -980,6 +980,7 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
 
 static int __maybe_unused
 ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
+			struct user_namespace *mnt_userns,
 			struct dentry *dentry, struct inode *inode,
 			const char *name, const void *value,
 			size_t size, int flags)
@@ -1044,6 +1045,7 @@ static int ovl_own_xattr_get(const struct xattr_handler *handler,
 }
 
 static int ovl_own_xattr_set(const struct xattr_handler *handler,
+			     struct user_namespace *mnt_userns,
 			     struct dentry *dentry, struct inode *inode,
 			     const char *name, const void *value,
 			     size_t size, int flags)
@@ -1059,6 +1061,7 @@ static int ovl_other_xattr_get(const struct xattr_handler *handler,
 }
 
 static int ovl_other_xattr_set(const struct xattr_handler *handler,
+			       struct user_namespace *mnt_userns,
 			       struct dentry *dentry, struct inode *inode,
 			       const char *name, const void *value,
 			       size_t size, int flags)
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 9ce8214bfdac..d31b60f5d40d 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -558,8 +558,22 @@ __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
 }
 EXPORT_SYMBOL(__posix_acl_chmod);
 
+/**
+ * posix_acl_chmod - chmod a posix acl
+ *
+ * @mnt_userns:	user namespace of the mount @inode was found from
+ * @inode:	inode to check permissions on
+ * @mode:	the new mode of @inode
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ */
 int
-posix_acl_chmod(struct inode *inode, umode_t mode)
+ posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode,
+		    umode_t mode)
 {
 	struct posix_acl *acl;
 	int ret = 0;
@@ -638,9 +652,10 @@ EXPORT_SYMBOL_GPL(posix_acl_create);
 
 /**
  * posix_acl_update_mode  -  update mode in set_acl
- * @inode: target inode
- * @mode_p: mode (pointer) for update
- * @acl: acl pointer
+ * @mnt_userns:	user namespace of the mount @inode was found from
+ * @inode:	target inode
+ * @mode_p:	mode (pointer) for update
+ * @acl:	acl pointer
  *
  * Update the file mode when setting an ACL: compute the new file permission
  * bits based on the ACL.  In addition, if the ACL is equivalent to the new
@@ -649,9 +664,16 @@ EXPORT_SYMBOL_GPL(posix_acl_create);
  * As with chmod, clear the setgid bit if the caller is not in the owning group
  * or capable of CAP_FSETID (see inode_change_ok).
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * Called from set_acl inode operations.
  */
-int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
+int posix_acl_update_mode(struct user_namespace *mnt_userns,
+			  struct inode *inode, umode_t *mode_p,
 			  struct posix_acl **acl)
 {
 	umode_t mode = inode->i_mode;
@@ -662,8 +684,8 @@ int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
 		return error;
 	if (error == 0)
 		*acl = NULL;
-	if (!in_group_p(inode->i_gid) &&
-	    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+	if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
+	    !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
 		mode &= ~S_ISGID;
 	*mode_p = mode;
 	return 0;
@@ -675,7 +697,8 @@ EXPORT_SYMBOL(posix_acl_update_mode);
  */
 static void posix_acl_fix_xattr_userns(
 	struct user_namespace *to, struct user_namespace *from,
-	void *value, size_t size)
+	struct user_namespace *mnt_userns,
+	void *value, size_t size, bool from_user)
 {
 	struct posix_acl_xattr_header *header = value;
 	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
@@ -700,10 +723,18 @@ static void posix_acl_fix_xattr_userns(
 		switch(le16_to_cpu(entry->e_tag)) {
 		case ACL_USER:
 			uid = make_kuid(from, le32_to_cpu(entry->e_id));
+			if (from_user)
+				uid = kuid_from_mnt(mnt_userns, uid);
+			else
+				uid = kuid_into_mnt(mnt_userns, uid);
 			entry->e_id = cpu_to_le32(from_kuid(to, uid));
 			break;
 		case ACL_GROUP:
 			gid = make_kgid(from, le32_to_cpu(entry->e_id));
+			if (from_user)
+				gid = kgid_from_mnt(mnt_userns, gid);
+			else
+				gid = kgid_into_mnt(mnt_userns, gid);
 			entry->e_id = cpu_to_le32(from_kgid(to, gid));
 			break;
 		default:
@@ -712,20 +743,24 @@ static void posix_acl_fix_xattr_userns(
 	}
 }
 
-void posix_acl_fix_xattr_from_user(void *value, size_t size)
+void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
+				   void *value, size_t size)
 {
 	struct user_namespace *user_ns = current_user_ns();
-	if (user_ns == &init_user_ns)
+	if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
 		return;
-	posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
+	posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value,
+				   size, true);
 }
 
-void posix_acl_fix_xattr_to_user(void *value, size_t size)
+void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
+				 void *value, size_t size)
 {
 	struct user_namespace *user_ns = current_user_ns();
-	if (user_ns == &init_user_ns)
+	if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
 		return;
-	posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
+	posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value,
+				   size, false);
 }
 
 /*
@@ -865,7 +900,8 @@ posix_acl_xattr_get(const struct xattr_handler *handler,
 }
 
 int
-set_posix_acl(struct inode *inode, int type, struct posix_acl *acl)
+set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
+	      int type, struct posix_acl *acl)
 {
 	if (!IS_POSIXACL(inode))
 		return -EOPNOTSUPP;
@@ -874,7 +910,7 @@ set_posix_acl(struct inode *inode, int type, struct posix_acl *acl)
 
 	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
 		return acl ? -EACCES : 0;
-	if (!inode_owner_or_capable(&init_user_ns, inode))
+	if (!inode_owner_or_capable(mnt_userns, inode))
 		return -EPERM;
 
 	if (acl) {
@@ -888,9 +924,10 @@ EXPORT_SYMBOL(set_posix_acl);
 
 static int
 posix_acl_xattr_set(const struct xattr_handler *handler,
-		    struct dentry *unused, struct inode *inode,
-		    const char *name, const void *value,
-		    size_t size, int flags)
+			   struct user_namespace *mnt_userns,
+			   struct dentry *unused, struct inode *inode,
+			   const char *name, const void *value, size_t size,
+			   int flags)
 {
 	struct posix_acl *acl = NULL;
 	int ret;
@@ -900,7 +937,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler,
 		if (IS_ERR(acl))
 			return PTR_ERR(acl);
 	}
-	ret = set_posix_acl(inode, handler->flags, acl);
+	ret = set_posix_acl(mnt_userns, inode, handler->flags, acl);
 	posix_acl_release(acl);
 	return ret;
 }
@@ -934,7 +971,7 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	int error;
 
 	if (type == ACL_TYPE_ACCESS) {
-		error = posix_acl_update_mode(inode,
+		error = posix_acl_update_mode(&init_user_ns, inode,
 				&inode->i_mode, &acl);
 		if (error)
 			return error;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index ccd40df6eb45..4bf976bc7bad 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -40,7 +40,8 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	reiserfs_write_unlock(inode->i_sb);
 	if (error == 0) {
 		if (type == ACL_TYPE_ACCESS && acl) {
-			error = posix_acl_update_mode(inode, &mode, &acl);
+			error = posix_acl_update_mode(&init_user_ns, inode,
+						      &mode, &acl);
 			if (error)
 				goto unlock;
 			update_mode = 1;
@@ -399,5 +400,5 @@ int reiserfs_acl_chmod(struct inode *inode)
 	    !reiserfs_posixacl(inode->i_sb))
 		return 0;
 
-	return posix_acl_chmod(inode, inode->i_mode);
+	return posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 }
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 20be9a0e5870..8965c8e5e172 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -21,7 +21,8 @@ security_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-security_set(const struct xattr_handler *handler, struct dentry *unused,
+security_set(const struct xattr_handler *handler,
+	     struct user_namespace *mnt_userns, struct dentry *unused,
 	     struct inode *inode, const char *name, const void *buffer,
 	     size_t size, int flags)
 {
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5ed48da3d02b..d853cea2afcd 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -20,7 +20,8 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-trusted_set(const struct xattr_handler *handler, struct dentry *unused,
+trusted_set(const struct xattr_handler *handler,
+	    struct user_namespace *mnt_userns, struct dentry *unused,
 	    struct inode *inode, const char *name, const void *buffer,
 	    size_t size, int flags)
 {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index a573ca45bacc..65d9cd10a5ea 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -18,7 +18,8 @@ user_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-user_set(const struct xattr_handler *handler, struct dentry *unused,
+user_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns,
+	 struct dentry *unused,
 	 struct inode *inode, const char *name, const void *buffer,
 	 size_t size, int flags)
 {
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index a0b9b349efe6..8f4135c22ab6 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -681,6 +681,7 @@ static int xattr_get(const struct xattr_handler *handler,
 }
 
 static int xattr_set(const struct xattr_handler *handler,
+			   struct user_namespace *mnt_userns,
 			   struct dentry *dentry, struct inode *inode,
 			   const char *name, const void *value,
 			   size_t size, int flags)
diff --git a/fs/xattr.c b/fs/xattr.c
index c669922e1bde..d777025121e0 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,8 @@ __vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
 		return -EOPNOTSUPP;
 	if (size == 0)
 		value = "";  /* empty EA, do not remove */
-	return handler->set(handler, dentry, inode, name, value, size, flags);
+	return handler->set(handler, &init_user_ns, dentry, inode, name, value,
+			    size, flags);
 }
 EXPORT_SYMBOL(__vfs_setxattr);
 
@@ -281,7 +282,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 	int error;
 
 	if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
-		error = cap_convert_nscap(dentry, &value, size);
+		error = cap_convert_nscap(&init_user_ns, dentry, &value, size);
 		if (error < 0)
 			return error;
 		size = error;
@@ -450,7 +451,8 @@ __vfs_removexattr(struct dentry *dentry, const char *name)
 		return PTR_ERR(handler);
 	if (!handler->set)
 		return -EOPNOTSUPP;
-	return handler->set(handler, dentry, inode, name, NULL, 0, XATTR_REPLACE);
+	return handler->set(handler, &init_user_ns, dentry, inode, name, NULL,
+			    0, XATTR_REPLACE);
 }
 EXPORT_SYMBOL(__vfs_removexattr);
 
@@ -548,7 +550,8 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 		}
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
-			posix_acl_fix_xattr_from_user(kvalue, size);
+			posix_acl_fix_xattr_from_user(&init_user_ns, kvalue,
+						      size);
 	}
 
 	error = vfs_setxattr(d, kname, kvalue, size, flags);
@@ -642,7 +645,8 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 	if (error > 0) {
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
-			posix_acl_fix_xattr_to_user(kvalue, error);
+			posix_acl_fix_xattr_to_user(&init_user_ns, kvalue,
+						    error);
 		if (size && copy_to_user(value, kvalue, error))
 			error = -EFAULT;
 	} else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 779cb73b3d00..368351298bd5 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -252,7 +252,8 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		return error;
 
 	if (type == ACL_TYPE_ACCESS) {
-		error = posix_acl_update_mode(inode, &mode, &acl);
+		error = posix_acl_update_mode(&init_user_ns, inode, &mode,
+					      &acl);
 		if (error)
 			return error;
 		set_mode = true;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 08a478d25122..26d22edef741 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -807,7 +807,7 @@ xfs_setattr_nonsize(
 	 * 	     Posix ACL code seems to care about this issue either.
 	 */
 	if (mask & ATTR_MODE) {
-		error = posix_acl_chmod(inode, inode->i_mode);
+		error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 		if (error)
 			return error;
 	}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index bca48b308c02..12be32f66dc1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -38,9 +38,10 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
-		struct inode *inode, const char *name, const void *value,
-		size_t size, int flags)
+xfs_xattr_set(const struct xattr_handler *handler,
+	      struct user_namespace *mnt_userns, struct dentry *unused,
+	      struct inode *inode, const char *name, const void *value,
+	      size_t size, int flags)
 {
 	struct xfs_da_args	args = {
 		.dp		= XFS_I(inode),
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 62bfa3ed84d7..da21ef118b04 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -273,6 +273,7 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
 
-extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size);
+int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      const void **ivalue, size_t size);
 
 #endif /* !_LINUX_CAPABILITY_H */
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 85fb4c0c650a..6dcd8b8f6ab5 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -69,13 +69,15 @@ extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
 extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);
 
 extern struct posix_acl *get_posix_acl(struct inode *, int);
-extern int set_posix_acl(struct inode *, int, struct posix_acl *);
+extern int set_posix_acl(struct user_namespace *, struct inode *, int,
+			 struct posix_acl *);
 
 #ifdef CONFIG_FS_POSIX_ACL
-extern int posix_acl_chmod(struct inode *, umode_t);
+int posix_acl_chmod(struct user_namespace *, struct inode *, umode_t);
 extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
 		struct posix_acl **);
-extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **);
+int posix_acl_update_mode(struct user_namespace *, struct inode *, umode_t *,
+			  struct posix_acl **);
 
 extern int simple_set_acl(struct inode *, struct posix_acl *, int);
 extern int simple_acl_create(struct inode *, struct inode *);
@@ -95,7 +97,8 @@ static inline void cache_no_acl(struct inode *inode)
 	inode->i_default_acl = NULL;
 }
 #else
-static inline int posix_acl_chmod(struct inode *inode, umode_t mode)
+static inline int posix_acl_chmod(struct user_namespace *mnt_userns,
+				  struct inode *inode, umode_t mode)
 {
 	return 0;
 }
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index 2387709991b5..060e8d203181 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -33,13 +33,17 @@ posix_acl_xattr_count(size_t size)
 }
 
 #ifdef CONFIG_FS_POSIX_ACL
-void posix_acl_fix_xattr_from_user(void *value, size_t size);
-void posix_acl_fix_xattr_to_user(void *value, size_t size);
+void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
+				   void *value, size_t size);
+void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
+				 void *value, size_t size);
 #else
-static inline void posix_acl_fix_xattr_from_user(void *value, size_t size)
+static inline void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
+						 void *value, size_t size)
 {
 }
-static inline void posix_acl_fix_xattr_to_user(void *value, size_t size)
+static inline void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
+					       void *value, size_t size)
 {
 }
 #endif
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 10b4dc2709f0..260c9bcb0edb 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -34,7 +34,8 @@ struct xattr_handler {
 	int (*get)(const struct xattr_handler *, struct dentry *dentry,
 		   struct inode *inode, const char *name, void *buffer,
 		   size_t size);
-	int (*set)(const struct xattr_handler *, struct dentry *dentry,
+	int (*set)(const struct xattr_handler *,
+		   struct user_namespace *mnt_userns, struct dentry *dentry,
 		   struct inode *inode, const char *name, const void *buffer,
 		   size_t size, int flags);
 };
diff --git a/mm/shmem.c b/mm/shmem.c
index 1cb451e131ec..23b8e9c15a42 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1143,7 +1143,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 
 	setattr_copy(&init_user_ns, inode, attr);
 	if (attr->ia_valid & ATTR_MODE)
-		error = posix_acl_chmod(inode, inode->i_mode);
+		error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 	return error;
 }
 
@@ -3273,6 +3273,7 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+				   struct user_namespace *mnt_userns,
 				   struct dentry *unused, struct inode *inode,
 				   const char *name, const void *value,
 				   size_t size, int flags)
diff --git a/net/socket.c b/net/socket.c
index 33e8b6c4e1d3..c76703c6f480 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -334,6 +334,7 @@ static const struct xattr_handler sockfs_xattr_handler = {
 };
 
 static int sockfs_security_xattr_set(const struct xattr_handler *handler,
+				     struct user_namespace *mnt_userns,
 				     struct dentry *dentry, struct inode *inode,
 				     const char *suffix, const void *value,
 				     size_t size, int flags)
diff --git a/security/commoncap.c b/security/commoncap.c
index 88ee345f7bfa..c3fd9b86ea9a 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -450,16 +450,33 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 	return size;
 }
 
+/**
+ * rootid_from_xattr - translate root uid of vfs caps
+ *
+ * @value:	vfs caps value which may be modified by this function
+ * @size:	size of @ivalue
+ * @task_ns:	user namespace of the caller
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ */
 static kuid_t rootid_from_xattr(const void *value, size_t size,
-				struct user_namespace *task_ns)
+				struct user_namespace *task_ns,
+				struct user_namespace *mnt_userns)
 {
 	const struct vfs_ns_cap_data *nscap = value;
+	kuid_t rootkid;
 	uid_t rootid = 0;
 
 	if (size == XATTR_CAPS_SZ_3)
 		rootid = le32_to_cpu(nscap->rootid);
 
-	return make_kuid(task_ns, rootid);
+	rootkid = make_kuid(task_ns, rootid);
+	return kuid_from_mnt(mnt_userns, rootkid);
 }
 
 static bool validheader(size_t size, const struct vfs_cap_data *cap)
@@ -467,13 +484,27 @@ static bool validheader(size_t size, const struct vfs_cap_data *cap)
 	return is_v2header(size, cap) || is_v3header(size, cap);
 }
 
-/*
+/**
+ * cap_convert_nscap - check vfs caps
+ *
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dentry:	used to retrieve inode to check permissions on
+ * @ivalue:	vfs caps value which may be modified by this function
+ * @size:	size of @ivalue
+ *
  * User requested a write of security.capability.  If needed, update the
  * xattr to change from v2 to v3, or to fixup the v3 rootid.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * If all is ok, we return the new size, on error return < 0.
  */
-int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size)
+int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      const void **ivalue, size_t size)
 {
 	struct vfs_ns_cap_data *nscap;
 	uid_t nsrootid;
@@ -489,14 +520,14 @@ int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size)
 		return -EINVAL;
 	if (!validheader(size, cap))
 		return -EINVAL;
-	if (!capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_SETFCAP))
+	if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
 		return -EPERM;
-	if (size == XATTR_CAPS_SZ_2)
+	if (size == XATTR_CAPS_SZ_2 && (mnt_userns == &init_user_ns))
 		if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
 			/* user is privileged, just write the v2 */
 			return size;
 
-	rootid = rootid_from_xattr(*ivalue, size, task_ns);
+	rootid = rootid_from_xattr(*ivalue, size, task_ns, mnt_userns);
 	if (!uid_valid(rootid))
 		return -EINVAL;
 
-- 
cgit v1.2.3


From c7c7a1a18af4c3bb7749d33e3df3acdf0a95bbb5 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@tycho.pizza>
Date: Thu, 21 Jan 2021 14:19:28 +0100
Subject: xattr: handle idmapped mounts

When interacting with extended attributes the vfs verifies that the
caller is privileged over the inode with which the extended attribute is
associated. For posix access and posix default extended attributes a uid
or gid can be stored on-disk. Let the functions handle posix extended
attributes on idmapped mounts. If the inode is accessed through an
idmapped mount we need to map it according to the mount's user
namespace. Afterwards the checks are identical to non-idmapped mounts.
This has no effect for e.g. security xattrs since they don't store uids
or gids and don't perform permission checks on them like posix acls do.

Link: https://lore.kernel.org/r/20210121131959.646623-10-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Tycho Andersen <tycho@tycho.pizza>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/cachefiles/xattr.c                 |  29 ++++----
 fs/ecryptfs/crypto.c                  |   4 +-
 fs/ecryptfs/inode.c                   |   5 +-
 fs/ecryptfs/mmap.c                    |   4 +-
 fs/nfsd/vfs.c                         |  14 ++--
 fs/overlayfs/copy_up.c                |  14 ++--
 fs/overlayfs/dir.c                    |   2 +-
 fs/overlayfs/inode.c                  |   9 +--
 fs/overlayfs/overlayfs.h              |   6 +-
 fs/overlayfs/super.c                  |   6 +-
 fs/xattr.c                            | 120 +++++++++++++++++++---------------
 include/linux/xattr.h                 |  27 +++++---
 security/apparmor/domain.c            |   4 +-
 security/commoncap.c                  |   6 +-
 security/integrity/evm/evm_crypto.c   |  11 ++--
 security/integrity/evm/evm_main.c     |   4 +-
 security/integrity/ima/ima_appraise.c |   8 +--
 security/selinux/hooks.c              |   3 +-
 security/smack/smack_lsm.c            |   8 ++-
 19 files changed, 160 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 72e42438f3d7..a591b5e09637 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -39,8 +39,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
 	_enter("%p{%s}", object, type);
 
 	/* attempt to install a type label directly */
-	ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
-			   XATTR_CREATE);
+	ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, type,
+			   2, XATTR_CREATE);
 	if (ret == 0) {
 		_debug("SET"); /* we succeeded */
 		goto error;
@@ -54,7 +54,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
 	}
 
 	/* read the current type label */
-	ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
+	ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, xtype,
+			   3);
 	if (ret < 0) {
 		if (ret == -ERANGE)
 			goto bad_type_length;
@@ -110,9 +111,8 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
 	_debug("SET #%u", auxdata->len);
 
 	clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags);
-	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
-			   &auxdata->type, auxdata->len,
-			   XATTR_CREATE);
+	ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
+			   &auxdata->type, auxdata->len, XATTR_CREATE);
 	if (ret < 0 && ret != -ENOMEM)
 		cachefiles_io_error_obj(
 			object,
@@ -140,9 +140,8 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
 	_debug("SET #%u", auxdata->len);
 
 	clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags);
-	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
-			   &auxdata->type, auxdata->len,
-			   XATTR_REPLACE);
+	ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
+			   &auxdata->type, auxdata->len, XATTR_REPLACE);
 	if (ret < 0 && ret != -ENOMEM)
 		cachefiles_io_error_obj(
 			object,
@@ -171,7 +170,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object)
 	if (!auxbuf)
 		return -ENOMEM;
 
-	xlen = vfs_getxattr(dentry, cachefiles_xattr_cache,
+	xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
 			    &auxbuf->type, 512 + 1);
 	ret = -ESTALE;
 	if (xlen < 1 ||
@@ -213,7 +212,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
 	}
 
 	/* read the current type label */
-	ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
+	ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
 			   &auxbuf->type, 512 + 1);
 	if (ret < 0) {
 		if (ret == -ENODATA)
@@ -270,9 +269,9 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
 		}
 
 		/* update the current label */
-		ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
-				   &auxdata->type, auxdata->len,
-				   XATTR_REPLACE);
+		ret = vfs_setxattr(&init_user_ns, dentry,
+				   cachefiles_xattr_cache, &auxdata->type,
+				   auxdata->len, XATTR_REPLACE);
 		if (ret < 0) {
 			cachefiles_io_error_obj(object,
 						"Can't update xattr on %lu"
@@ -309,7 +308,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
 {
 	int ret;
 
-	ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
+	ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache);
 	if (ret < 0) {
 		if (ret == -ENOENT || ret == -ENODATA)
 			ret = 0;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 0681540c48d9..943e523f4c9d 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1110,8 +1110,8 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
 	}
 
 	inode_lock(lower_inode);
-	rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME,
-			    page_virt, size, 0);
+	rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode,
+			    ECRYPTFS_XATTR_NAME, page_virt, size, 0);
 	if (!rc && ecryptfs_inode)
 		fsstack_copy_attr_all(ecryptfs_inode, lower_inode);
 	inode_unlock(lower_inode);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ac6472a82567..b9ccc4085d46 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1024,7 +1024,8 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode,
 		rc = -EOPNOTSUPP;
 		goto out;
 	}
-	rc = vfs_setxattr(lower_dentry, name, value, size, flags);
+	rc = vfs_setxattr(&init_user_ns, lower_dentry, name, value, size,
+			  flags);
 	if (!rc && inode)
 		fsstack_copy_attr_all(inode, d_inode(lower_dentry));
 out:
@@ -1089,7 +1090,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, struct inode *inode,
 		goto out;
 	}
 	inode_lock(lower_inode);
-	rc = __vfs_removexattr(lower_dentry, name);
+	rc = __vfs_removexattr(&init_user_ns, lower_dentry, name);
 	inode_unlock(lower_inode);
 out:
 	return rc;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 019572c6b39a..2f333a40ff4d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -426,8 +426,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
 	if (size < 0)
 		size = 8;
 	put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
-	rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME,
-			    xattr_virt, size, 0);
+	rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode,
+			    ECRYPTFS_XATTR_NAME, xattr_virt, size, 0);
 	inode_unlock(lower_inode);
 	if (rc)
 		printk(KERN_ERR "Error whilst attempting to write inode size "
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 1905b39be1c2..37d85046b4d6 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -499,7 +499,8 @@ int nfsd4_is_junction(struct dentry *dentry)
 		return 0;
 	if (!(inode->i_mode & S_ISVTX))
 		return 0;
-	if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0)
+	if (vfs_getxattr(&init_user_ns, dentry, NFSD_JUNCTION_XATTR_NAME,
+			 NULL, 0) <= 0)
 		return 0;
 	return 1;
 }
@@ -2149,7 +2150,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
 
 	inode_lock_shared(inode);
 
-	len = vfs_getxattr(dentry, name, NULL, 0);
+	len = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
 
 	/*
 	 * Zero-length attribute, just return.
@@ -2176,7 +2177,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
 		goto out;
 	}
 
-	len = vfs_getxattr(dentry, name, buf, len);
+	len = vfs_getxattr(&init_user_ns, dentry, name, buf, len);
 	if (len <= 0) {
 		kvfree(buf);
 		buf = NULL;
@@ -2283,7 +2284,8 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
 
 	fh_lock(fhp);
 
-	ret = __vfs_removexattr_locked(fhp->fh_dentry, name, NULL);
+	ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry,
+				       name, NULL);
 
 	fh_unlock(fhp);
 	fh_drop_write(fhp);
@@ -2307,8 +2309,8 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
 		return nfserrno(ret);
 	fh_lock(fhp);
 
-	ret = __vfs_setxattr_locked(fhp->fh_dentry, name, buf, len, flags,
-				    NULL);
+	ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf,
+				    len, flags, NULL);
 
 	fh_unlock(fhp);
 	fh_drop_write(fhp);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 3e9957ae19fa..f81b836c2256 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -85,9 +85,9 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
 		if (ovl_is_private_xattr(sb, name))
 			continue;
 retry:
-		size = vfs_getxattr(old, name, value, value_size);
+		size = vfs_getxattr(&init_user_ns, old, name, value, value_size);
 		if (size == -ERANGE)
-			size = vfs_getxattr(old, name, NULL, 0);
+			size = vfs_getxattr(&init_user_ns, old, name, NULL, 0);
 
 		if (size < 0) {
 			error = size;
@@ -114,7 +114,7 @@ retry:
 			error = 0;
 			continue; /* Discard */
 		}
-		error = vfs_setxattr(new, name, value, size, 0);
+		error = vfs_setxattr(&init_user_ns, new, name, value, size, 0);
 		if (error) {
 			if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
 				break;
@@ -795,7 +795,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value)
 	ssize_t res;
 	char *buf;
 
-	res = vfs_getxattr(dentry, name, NULL, 0);
+	res = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
 	if (res == -ENODATA || res == -EOPNOTSUPP)
 		res = 0;
 
@@ -804,7 +804,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value)
 		if (!buf)
 			return -ENOMEM;
 
-		res = vfs_getxattr(dentry, name, buf, res);
+		res = vfs_getxattr(&init_user_ns, dentry, name, buf, res);
 		if (res < 0)
 			kfree(buf);
 		else
@@ -846,8 +846,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
 	 * don't want that to happen for normal copy-up operation.
 	 */
 	if (capability) {
-		err = vfs_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
-				   capability, cap_size, 0);
+		err = vfs_setxattr(&init_user_ns, upperpath.dentry,
+				   XATTR_NAME_CAPS, capability, cap_size, 0);
 		if (err)
 			goto out_free;
 	}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 29840820a46c..d75c96cb18c3 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -449,7 +449,7 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name,
 	if (err < 0)
 		goto out_free;
 
-	err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE);
+	err = vfs_setxattr(&init_user_ns, upperdentry, name, buffer, size, XATTR_CREATE);
 out_free:
 	kfree(buffer);
 	return err;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 5aa66881dbd7..023fde466e3a 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -352,7 +352,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
 		goto out;
 
 	if (!value && !upperdentry) {
-		err = vfs_getxattr(realdentry, name, NULL, 0);
+		err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0);
 		if (err < 0)
 			goto out_drop_write;
 	}
@@ -367,10 +367,11 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
 
 	old_cred = ovl_override_creds(dentry->d_sb);
 	if (value)
-		err = vfs_setxattr(realdentry, name, value, size, flags);
+		err = vfs_setxattr(&init_user_ns, realdentry, name, value, size,
+				   flags);
 	else {
 		WARN_ON(flags != XATTR_REPLACE);
-		err = vfs_removexattr(realdentry, name);
+		err = vfs_removexattr(&init_user_ns, realdentry, name);
 	}
 	revert_creds(old_cred);
 
@@ -392,7 +393,7 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
 		ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);
 
 	old_cred = ovl_override_creds(dentry->d_sb);
-	res = vfs_getxattr(realdentry, name, value, size);
+	res = vfs_getxattr(&init_user_ns, realdentry, name, value, size);
 	revert_creds(old_cred);
 	return res;
 }
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index b487e48c7fd4..0002834f664a 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -186,7 +186,7 @@ static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry,
 				      size_t size)
 {
 	const char *name = ovl_xattr(ofs, ox);
-	return vfs_getxattr(dentry, name, value, size);
+	return vfs_getxattr(&init_user_ns, dentry, name, value, size);
 }
 
 static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
@@ -194,7 +194,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
 				  size_t size)
 {
 	const char *name = ovl_xattr(ofs, ox);
-	int err = vfs_setxattr(dentry, name, value, size, 0);
+	int err = vfs_setxattr(&init_user_ns, dentry, name, value, size, 0);
 	pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
 		 dentry, name, min((int)size, 48), value, size, err);
 	return err;
@@ -204,7 +204,7 @@ static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
 				     enum ovl_xattr ox)
 {
 	const char *name = ovl_xattr(ofs, ox);
-	int err = vfs_removexattr(dentry, name);
+	int err = vfs_removexattr(&init_user_ns, dentry, name);
 	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
 	return err;
 }
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e24c995c5fd4..8168ab2dda11 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -794,11 +794,13 @@ retry:
 		 * allowed as upper are limited to "normal" ones, where checking
 		 * for the above two errors is sufficient.
 		 */
-		err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT);
+		err = vfs_removexattr(&init_user_ns, work,
+				      XATTR_NAME_POSIX_ACL_DEFAULT);
 		if (err && err != -ENODATA && err != -EOPNOTSUPP)
 			goto out_dput;
 
-		err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS);
+		err = vfs_removexattr(&init_user_ns, work,
+				      XATTR_NAME_POSIX_ACL_ACCESS);
 		if (err && err != -ENODATA && err != -EOPNOTSUPP)
 			goto out_dput;
 
diff --git a/fs/xattr.c b/fs/xattr.c
index d777025121e0..a49541713b11 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -83,7 +83,8 @@ xattr_resolve_name(struct inode *inode, const char **name)
  * because different namespaces have very different rules.
  */
 static int
-xattr_permission(struct inode *inode, const char *name, int mask)
+xattr_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		 const char *name, int mask)
 {
 	/*
 	 * We can never set or remove an extended attribute on a read-only
@@ -128,11 +129,11 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
 		if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
 		    (mask & MAY_WRITE) &&
-		    !inode_owner_or_capable(&init_user_ns, inode))
+		    !inode_owner_or_capable(mnt_userns, inode))
 			return -EPERM;
 	}
 
-	return inode_permission(&init_user_ns, inode, mask);
+	return inode_permission(mnt_userns, inode, mask);
 }
 
 /*
@@ -163,8 +164,9 @@ xattr_supported_namespace(struct inode *inode, const char *prefix)
 EXPORT_SYMBOL(xattr_supported_namespace);
 
 int
-__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
-	       const void *value, size_t size, int flags)
+__vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+	       struct inode *inode, const char *name, const void *value,
+	       size_t size, int flags)
 {
 	const struct xattr_handler *handler;
 
@@ -175,7 +177,7 @@ __vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
 		return -EOPNOTSUPP;
 	if (size == 0)
 		value = "";  /* empty EA, do not remove */
-	return handler->set(handler, &init_user_ns, dentry, inode, name, value,
+	return handler->set(handler, mnt_userns, dentry, inode, name, value,
 			    size, flags);
 }
 EXPORT_SYMBOL(__vfs_setxattr);
@@ -184,6 +186,7 @@ EXPORT_SYMBOL(__vfs_setxattr);
  *  __vfs_setxattr_noperm - perform setxattr operation without performing
  *  permission checks.
  *
+ *  @mnt_userns - user namespace of the mount the inode was found from
  *  @dentry - object to perform setxattr on
  *  @name - xattr name to set
  *  @value - value to set @name to
@@ -196,8 +199,9 @@ EXPORT_SYMBOL(__vfs_setxattr);
  *  is executed. It also assumes that the caller will make the appropriate
  *  permission checks.
  */
-int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags)
+int __vfs_setxattr_noperm(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, const char *name,
+			  const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
 	int error = -EAGAIN;
@@ -207,7 +211,8 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
 	if (issec)
 		inode->i_flags &= ~S_NOSEC;
 	if (inode->i_opflags & IOP_XATTR) {
-		error = __vfs_setxattr(dentry, inode, name, value, size, flags);
+		error = __vfs_setxattr(mnt_userns, dentry, inode, name, value,
+				       size, flags);
 		if (!error) {
 			fsnotify_xattr(dentry);
 			security_inode_post_setxattr(dentry, name, value,
@@ -246,14 +251,14 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
  *  a delegation was broken on, NULL if none.
  */
 int
-__vfs_setxattr_locked(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags,
-		struct inode **delegated_inode)
+__vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      const char *name, const void *value, size_t size,
+		      int flags, struct inode **delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
 
-	error = xattr_permission(inode, name, MAY_WRITE);
+	error = xattr_permission(mnt_userns, inode, name, MAY_WRITE);
 	if (error)
 		return error;
 
@@ -265,7 +270,8 @@ __vfs_setxattr_locked(struct dentry *dentry, const char *name,
 	if (error)
 		goto out;
 
-	error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
+	error = __vfs_setxattr_noperm(mnt_userns, dentry, name, value,
+				      size, flags);
 
 out:
 	return error;
@@ -273,8 +279,8 @@ out:
 EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);
 
 int
-vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		size_t size, int flags)
+vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+	     const char *name, const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
 	struct inode *delegated_inode = NULL;
@@ -282,7 +288,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 	int error;
 
 	if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
-		error = cap_convert_nscap(&init_user_ns, dentry, &value, size);
+		error = cap_convert_nscap(mnt_userns, dentry, &value, size);
 		if (error < 0)
 			return error;
 		size = error;
@@ -290,8 +296,8 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 
 retry_deleg:
 	inode_lock(inode);
-	error = __vfs_setxattr_locked(dentry, name, value, size, flags,
-	    &delegated_inode);
+	error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size,
+				      flags, &delegated_inode);
 	inode_unlock(inode);
 
 	if (delegated_inode) {
@@ -341,15 +347,16 @@ out_noalloc:
  * Returns the result of alloc, if failed, or the getxattr operation.
  */
 ssize_t
-vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
-		   size_t xattr_size, gfp_t flags)
+vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   const char *name, char **xattr_value, size_t xattr_size,
+		   gfp_t flags)
 {
 	const struct xattr_handler *handler;
 	struct inode *inode = dentry->d_inode;
 	char *value = *xattr_value;
 	int error;
 
-	error = xattr_permission(inode, name, MAY_READ);
+	error = xattr_permission(mnt_userns, inode, name, MAY_READ);
 	if (error)
 		return error;
 
@@ -390,12 +397,13 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name,
 EXPORT_SYMBOL(__vfs_getxattr);
 
 ssize_t
-vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
+vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+	     const char *name, void *value, size_t size)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
 
-	error = xattr_permission(inode, name, MAY_READ);
+	error = xattr_permission(mnt_userns, inode, name, MAY_READ);
 	if (error)
 		return error;
 
@@ -441,7 +449,8 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size)
 EXPORT_SYMBOL_GPL(vfs_listxattr);
 
 int
-__vfs_removexattr(struct dentry *dentry, const char *name)
+__vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  const char *name)
 {
 	struct inode *inode = d_inode(dentry);
 	const struct xattr_handler *handler;
@@ -451,8 +460,8 @@ __vfs_removexattr(struct dentry *dentry, const char *name)
 		return PTR_ERR(handler);
 	if (!handler->set)
 		return -EOPNOTSUPP;
-	return handler->set(handler, &init_user_ns, dentry, inode, name, NULL,
-			    0, XATTR_REPLACE);
+	return handler->set(handler, mnt_userns, dentry, inode, name, NULL, 0,
+			    XATTR_REPLACE);
 }
 EXPORT_SYMBOL(__vfs_removexattr);
 
@@ -466,13 +475,14 @@ EXPORT_SYMBOL(__vfs_removexattr);
  *  a delegation was broken on, NULL if none.
  */
 int
-__vfs_removexattr_locked(struct dentry *dentry, const char *name,
-		struct inode **delegated_inode)
+__vfs_removexattr_locked(struct user_namespace *mnt_userns,
+			 struct dentry *dentry, const char *name,
+			 struct inode **delegated_inode)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
 
-	error = xattr_permission(inode, name, MAY_WRITE);
+	error = xattr_permission(mnt_userns, inode, name, MAY_WRITE);
 	if (error)
 		return error;
 
@@ -484,7 +494,7 @@ __vfs_removexattr_locked(struct dentry *dentry, const char *name,
 	if (error)
 		goto out;
 
-	error = __vfs_removexattr(dentry, name);
+	error = __vfs_removexattr(mnt_userns, dentry, name);
 
 	if (!error) {
 		fsnotify_xattr(dentry);
@@ -497,7 +507,8 @@ out:
 EXPORT_SYMBOL_GPL(__vfs_removexattr_locked);
 
 int
-vfs_removexattr(struct dentry *dentry, const char *name)
+vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		const char *name)
 {
 	struct inode *inode = dentry->d_inode;
 	struct inode *delegated_inode = NULL;
@@ -505,7 +516,8 @@ vfs_removexattr(struct dentry *dentry, const char *name)
 
 retry_deleg:
 	inode_lock(inode);
-	error = __vfs_removexattr_locked(dentry, name, &delegated_inode);
+	error = __vfs_removexattr_locked(mnt_userns, dentry,
+					 name, &delegated_inode);
 	inode_unlock(inode);
 
 	if (delegated_inode) {
@@ -522,8 +534,9 @@ EXPORT_SYMBOL_GPL(vfs_removexattr);
  * Extended attribute SET operations
  */
 static long
-setxattr(struct dentry *d, const char __user *name, const void __user *value,
-	 size_t size, int flags)
+setxattr(struct user_namespace *mnt_userns, struct dentry *d,
+	 const char __user *name, const void __user *value, size_t size,
+	 int flags)
 {
 	int error;
 	void *kvalue = NULL;
@@ -550,11 +563,10 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 		}
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
-			posix_acl_fix_xattr_from_user(&init_user_ns, kvalue,
-						      size);
+			posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size);
 	}
 
-	error = vfs_setxattr(d, kname, kvalue, size, flags);
+	error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags);
 out:
 	kvfree(kvalue);
 
@@ -567,13 +579,15 @@ static int path_setxattr(const char __user *pathname,
 {
 	struct path path;
 	int error;
+
 retry:
 	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 	if (error)
 		return error;
 	error = mnt_want_write(path.mnt);
 	if (!error) {
-		error = setxattr(path.dentry, name, value, size, flags);
+		error = setxattr(mnt_user_ns(path.mnt), path.dentry, name,
+				 value, size, flags);
 		mnt_drop_write(path.mnt);
 	}
 	path_put(&path);
@@ -609,7 +623,9 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
 	audit_file(f.file);
 	error = mnt_want_write_file(f.file);
 	if (!error) {
-		error = setxattr(f.file->f_path.dentry, name, value, size, flags);
+		error = setxattr(file_mnt_user_ns(f.file),
+				 f.file->f_path.dentry, name,
+				 value, size, flags);
 		mnt_drop_write_file(f.file);
 	}
 	fdput(f);
@@ -620,8 +636,8 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
  * Extended attribute GET operations
  */
 static ssize_t
-getxattr(struct dentry *d, const char __user *name, void __user *value,
-	 size_t size)
+getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+	 const char __user *name, void __user *value, size_t size)
 {
 	ssize_t error;
 	void *kvalue = NULL;
@@ -641,12 +657,11 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 			return -ENOMEM;
 	}
 
-	error = vfs_getxattr(d, kname, kvalue, size);
+	error = vfs_getxattr(mnt_userns, d, kname, kvalue, size);
 	if (error > 0) {
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
-			posix_acl_fix_xattr_to_user(&init_user_ns, kvalue,
-						    error);
+			posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error);
 		if (size && copy_to_user(value, kvalue, error))
 			error = -EFAULT;
 	} else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
@@ -670,7 +685,7 @@ retry:
 	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 	if (error)
 		return error;
-	error = getxattr(path.dentry, name, value, size);
+	error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size);
 	path_put(&path);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -700,7 +715,8 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
 	if (!f.file)
 		return error;
 	audit_file(f.file);
-	error = getxattr(f.file->f_path.dentry, name, value, size);
+	error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry,
+			 name, value, size);
 	fdput(f);
 	return error;
 }
@@ -784,7 +800,8 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
  * Extended attribute REMOVE operations
  */
 static long
-removexattr(struct dentry *d, const char __user *name)
+removexattr(struct user_namespace *mnt_userns, struct dentry *d,
+	    const char __user *name)
 {
 	int error;
 	char kname[XATTR_NAME_MAX + 1];
@@ -795,7 +812,7 @@ removexattr(struct dentry *d, const char __user *name)
 	if (error < 0)
 		return error;
 
-	return vfs_removexattr(d, kname);
+	return vfs_removexattr(mnt_userns, d, kname);
 }
 
 static int path_removexattr(const char __user *pathname,
@@ -809,7 +826,7 @@ retry:
 		return error;
 	error = mnt_want_write(path.mnt);
 	if (!error) {
-		error = removexattr(path.dentry, name);
+		error = removexattr(mnt_user_ns(path.mnt), path.dentry, name);
 		mnt_drop_write(path.mnt);
 	}
 	path_put(&path);
@@ -842,7 +859,8 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 	audit_file(f.file);
 	error = mnt_want_write_file(f.file);
 	if (!error) {
-		error = removexattr(f.file->f_path.dentry, name);
+		error = removexattr(file_mnt_user_ns(f.file),
+				    f.file->f_path.dentry, name);
 		mnt_drop_write_file(f.file);
 	}
 	fdput(f);
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 260c9bcb0edb..4c379d23ec6e 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
+#include <linux/user_namespace.h>
 #include <uapi/linux/xattr.h>
 
 struct inode;
@@ -49,18 +50,26 @@ struct xattr {
 };
 
 ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t);
-ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t);
+ssize_t vfs_getxattr(struct user_namespace *, struct dentry *, const char *,
+		     void *, size_t);
 ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size);
-int __vfs_setxattr(struct dentry *, struct inode *, const char *, const void *, size_t, int);
-int __vfs_setxattr_noperm(struct dentry *, const char *, const void *, size_t, int);
-int __vfs_setxattr_locked(struct dentry *, const char *, const void *, size_t, int, struct inode **);
-int vfs_setxattr(struct dentry *, const char *, const void *, size_t, int);
-int __vfs_removexattr(struct dentry *, const char *);
-int __vfs_removexattr_locked(struct dentry *, const char *, struct inode **);
-int vfs_removexattr(struct dentry *, const char *);
+int __vfs_setxattr(struct user_namespace *, struct dentry *, struct inode *,
+		   const char *, const void *, size_t, int);
+int __vfs_setxattr_noperm(struct user_namespace *, struct dentry *,
+			  const char *, const void *, size_t, int);
+int __vfs_setxattr_locked(struct user_namespace *, struct dentry *,
+			  const char *, const void *, size_t, int,
+			  struct inode **);
+int vfs_setxattr(struct user_namespace *, struct dentry *, const char *,
+		 const void *, size_t, int);
+int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *);
+int __vfs_removexattr_locked(struct user_namespace *, struct dentry *,
+			     const char *, struct inode **);
+int vfs_removexattr(struct user_namespace *, struct dentry *, const char *);
 
 ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
-ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name,
+ssize_t vfs_getxattr_alloc(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, const char *name,
 			   char **xattr_value, size_t size, gfp_t flags);
 
 int xattr_supported_namespace(struct inode *inode, const char *prefix);
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index f919ebd042fd..16f184bc48de 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -324,8 +324,8 @@ static int aa_xattrs_match(const struct linux_binprm *bprm,
 	d = bprm->file->f_path.dentry;
 
 	for (i = 0; i < profile->xattr_count; i++) {
-		size = vfs_getxattr_alloc(d, profile->xattrs[i], &value,
-					  value_size, GFP_KERNEL);
+		size = vfs_getxattr_alloc(&init_user_ns, d, profile->xattrs[i],
+					  &value, value_size, GFP_KERNEL);
 		if (size >= 0) {
 			u32 perm;
 
diff --git a/security/commoncap.c b/security/commoncap.c
index c3fd9b86ea9a..745dc1f2c97f 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -313,7 +313,7 @@ int cap_inode_killpriv(struct dentry *dentry)
 {
 	int error;
 
-	error = __vfs_removexattr(dentry, XATTR_NAME_CAPS);
+	error = __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_CAPS);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 	return error;
@@ -386,8 +386,8 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 		return -EINVAL;
 
 	size = sizeof(struct vfs_ns_cap_data);
-	ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
-				 &tmpbuf, size, GFP_NOFS);
+	ret = (int)vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_CAPS,
+				      &tmpbuf, size, GFP_NOFS);
 	dput(dentry);
 
 	if (ret < 0)
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index 168c3b78ac47..f720f78cbbb1 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -222,7 +222,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry,
 				ima_present = true;
 			continue;
 		}
-		size = vfs_getxattr_alloc(dentry, xattr->name,
+		size = vfs_getxattr_alloc(&init_user_ns, dentry, xattr->name,
 					  &xattr_value, xattr_size, GFP_NOFS);
 		if (size == -ENOMEM) {
 			error = -ENOMEM;
@@ -275,8 +275,8 @@ static int evm_is_immutable(struct dentry *dentry, struct inode *inode)
 		return 1;
 
 	/* Do this the hard way */
-	rc = vfs_getxattr_alloc(dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0,
-				GFP_NOFS);
+	rc = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_EVM,
+				(char **)&xattr_data, 0, GFP_NOFS);
 	if (rc <= 0) {
 		if (rc == -ENODATA)
 			return 0;
@@ -319,11 +319,12 @@ int evm_update_evmxattr(struct dentry *dentry, const char *xattr_name,
 			   xattr_value_len, &data);
 	if (rc == 0) {
 		data.hdr.xattr.sha1.type = EVM_XATTR_HMAC;
-		rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_EVM,
+		rc = __vfs_setxattr_noperm(&init_user_ns, dentry,
+					   XATTR_NAME_EVM,
 					   &data.hdr.xattr.data[1],
 					   SHA1_DIGEST_SIZE + 1, 0);
 	} else if (rc == -ENODATA && (inode->i_opflags & IOP_XATTR)) {
-		rc = __vfs_removexattr(dentry, XATTR_NAME_EVM);
+		rc = __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_EVM);
 	}
 	return rc;
 }
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 76d19146d74b..0de367aaa2d3 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -146,8 +146,8 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
 	/* if status is not PASS, try to check again - against -ENOMEM */
 
 	/* first need to know the sig type */
-	rc = vfs_getxattr_alloc(dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0,
-				GFP_NOFS);
+	rc = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_EVM,
+				(char **)&xattr_data, 0, GFP_NOFS);
 	if (rc <= 0) {
 		evm_status = INTEGRITY_FAIL;
 		if (rc == -ENODATA) {
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 8361941ee0a1..70b643c41c6b 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -94,7 +94,7 @@ static int ima_fix_xattr(struct dentry *dentry,
 		iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG;
 		iint->ima_hash->xattr.ng.algo = algo;
 	}
-	rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_IMA,
+	rc = __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_IMA,
 				   &iint->ima_hash->xattr.data[offset],
 				   (sizeof(iint->ima_hash->xattr) - offset) +
 				   iint->ima_hash->length, 0);
@@ -215,8 +215,8 @@ int ima_read_xattr(struct dentry *dentry,
 {
 	ssize_t ret;
 
-	ret = vfs_getxattr_alloc(dentry, XATTR_NAME_IMA, (char **)xattr_value,
-				 0, GFP_NOFS);
+	ret = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_IMA,
+				 (char **)xattr_value, 0, GFP_NOFS);
 	if (ret == -EOPNOTSUPP)
 		ret = 0;
 	return ret;
@@ -520,7 +520,7 @@ void ima_inode_post_setattr(struct dentry *dentry)
 
 	action = ima_must_appraise(inode, MAY_ACCESS, POST_SETATTR);
 	if (!action)
-		__vfs_removexattr(dentry, XATTR_NAME_IMA);
+		__vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_IMA);
 	iint = integrity_iint_find(inode);
 	if (iint) {
 		set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9d6d3da2caf2..2efedd7001b2 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6526,7 +6526,8 @@ static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen
  */
 static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
 {
-	return __vfs_setxattr_noperm(dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0);
+	return __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_SELINUX,
+				     ctx, ctxlen, 0);
 }
 
 static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index f69c3dd9a0c6..746e5743accc 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -3425,7 +3425,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
 			 */
 			if (isp->smk_flags & SMK_INODE_CHANGED) {
 				isp->smk_flags &= ~SMK_INODE_CHANGED;
-				rc = __vfs_setxattr(dp, inode,
+				rc = __vfs_setxattr(&init_user_ns, dp, inode,
 					XATTR_NAME_SMACKTRANSMUTE,
 					TRANS_TRUE, TRANS_TRUE_SIZE,
 					0);
@@ -4597,12 +4597,14 @@ static int smack_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
 
 static int smack_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
 {
-	return smack_inode_setsecurity(inode, XATTR_SMACK_SUFFIX, ctx, ctxlen, 0);
+	return smack_inode_setsecurity(inode, XATTR_SMACK_SUFFIX, ctx,
+				       ctxlen, 0);
 }
 
 static int smack_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
 {
-	return __vfs_setxattr_noperm(dentry, XATTR_NAME_SMACK, ctx, ctxlen, 0);
+	return __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_SMACK,
+				     ctx, ctxlen, 0);
 }
 
 static int smack_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
-- 
cgit v1.2.3


From 71bc356f93a1c589fad13f7487258f89c417976e Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:29 +0100
Subject: commoncap: handle idmapped mounts

When interacting with user namespace and non-user namespace aware
filesystem capabilities the vfs will perform various security checks to
determine whether or not the filesystem capabilities can be used by the
caller, whether they need to be removed and so on. The main
infrastructure for this resides in the capability codepaths but they are
called through the LSM security infrastructure even though they are not
technically an LSM or optional. This extends the existing security hooks
security_inode_removexattr(), security_inode_killpriv(),
security_inode_getsecurity() to pass down the mount's user namespace and
makes them aware of idmapped mounts.

In order to actually get filesystem capabilities from disk the
capability infrastructure exposes the get_vfs_caps_from_disk() helper.
For user namespace aware filesystem capabilities a root uid is stored
alongside the capabilities.

In order to determine whether the caller can make use of the filesystem
capability or whether it needs to be ignored it is translated according
to the superblock's user namespace. If it can be translated to uid 0
according to that id mapping the caller can use the filesystem
capabilities stored on disk. If we are accessing the inode that holds
the filesystem capabilities through an idmapped mount we map the root
uid according to the mount's user namespace. Afterwards the checks are
identical to non-idmapped mounts: reading filesystem caps from disk
enforces that the root uid associated with the filesystem capability
must have a mapping in the superblock's user namespace and that the
caller is either in the same user namespace or is a descendant of the
superblock's user namespace. For filesystems that are mountable inside
user namespace the caller can just mount the filesystem and won't
usually need to idmap it. If they do want to idmap it they can create an
idmapped mount and mark it with a user namespace they created and which
is thus a descendant of s_user_ns. For filesystems that are not
mountable inside user namespaces the descendant rule is trivially true
because the s_user_ns will be the initial user namespace.

If the initial user namespace is passed nothing changes so non-idmapped
mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-11-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/attr.c                     |  2 +-
 fs/xattr.c                    | 18 ++++++++-----
 include/linux/capability.h    |  4 ++-
 include/linux/lsm_hook_defs.h | 15 ++++++-----
 include/linux/lsm_hooks.h     |  1 +
 include/linux/security.h      | 54 +++++++++++++++++++++++--------------
 kernel/auditsc.c              |  5 ++--
 security/commoncap.c          | 62 ++++++++++++++++++++++++++++++++++---------
 security/security.c           | 25 ++++++++++-------
 security/selinux/hooks.c      | 20 ++++++++------
 security/smack/smack_lsm.c    | 14 +++++-----
 11 files changed, 146 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index f4543d2abdfb..c2d3bc0869d4 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -143,7 +143,7 @@ kill_priv:
 	if (ia_valid & ATTR_KILL_PRIV) {
 		int error;
 
-		error = security_inode_killpriv(dentry);
+		error = security_inode_killpriv(mnt_userns, dentry);
 		if (error)
 			return error;
 	}
diff --git a/fs/xattr.c b/fs/xattr.c
index a49541713b11..79c439d31eaa 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -262,7 +262,8 @@ __vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (error)
 		return error;
 
-	error = security_inode_setxattr(dentry, name, value, size, flags);
+	error = security_inode_setxattr(mnt_userns, dentry, name, value, size,
+					flags);
 	if (error)
 		goto out;
 
@@ -313,18 +314,20 @@ retry_deleg:
 EXPORT_SYMBOL_GPL(vfs_setxattr);
 
 static ssize_t
-xattr_getsecurity(struct inode *inode, const char *name, void *value,
-			size_t size)
+xattr_getsecurity(struct user_namespace *mnt_userns, struct inode *inode,
+		  const char *name, void *value, size_t size)
 {
 	void *buffer = NULL;
 	ssize_t len;
 
 	if (!value || !size) {
-		len = security_inode_getsecurity(inode, name, &buffer, false);
+		len = security_inode_getsecurity(mnt_userns, inode, name,
+						 &buffer, false);
 		goto out_noalloc;
 	}
 
-	len = security_inode_getsecurity(inode, name, &buffer, true);
+	len = security_inode_getsecurity(mnt_userns, inode, name, &buffer,
+					 true);
 	if (len < 0)
 		return len;
 	if (size < len) {
@@ -414,7 +417,8 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 				XATTR_SECURITY_PREFIX_LEN)) {
 		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
-		int ret = xattr_getsecurity(inode, suffix, value, size);
+		int ret = xattr_getsecurity(mnt_userns, inode, suffix, value,
+					    size);
 		/*
 		 * Only overwrite the return value if a security module
 		 * is actually active.
@@ -486,7 +490,7 @@ __vfs_removexattr_locked(struct user_namespace *mnt_userns,
 	if (error)
 		return error;
 
-	error = security_inode_removexattr(dentry, name);
+	error = security_inode_removexattr(mnt_userns, dentry, name);
 	if (error)
 		goto out;
 
diff --git a/include/linux/capability.h b/include/linux/capability.h
index da21ef118b04..65efb74c3585 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -271,7 +271,9 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
 }
 
 /* audit system wants to get cap info from files as well */
-extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
+int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
+			   const struct dentry *dentry,
+			   struct cpu_vfs_cap_data *cpu_caps);
 
 int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
 		      const void **ivalue, size_t size);
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 7aaa753b8608..df4cdad6681e 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -133,17 +133,20 @@ LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode,
 LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask)
 LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr)
 LSM_HOOK(int, 0, inode_getattr, const struct path *path)
-LSM_HOOK(int, 0, inode_setxattr, struct dentry *dentry, const char *name,
-	 const void *value, size_t size, int flags)
+LSM_HOOK(int, 0, inode_setxattr, struct user_namespace *mnt_userns,
+	 struct dentry *dentry, const char *name, const void *value,
+	 size_t size, int flags)
 LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry,
 	 const char *name, const void *value, size_t size, int flags)
 LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name)
 LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry)
-LSM_HOOK(int, 0, inode_removexattr, struct dentry *dentry, const char *name)
+LSM_HOOK(int, 0, inode_removexattr, struct user_namespace *mnt_userns,
+	 struct dentry *dentry, const char *name)
 LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry)
-LSM_HOOK(int, 0, inode_killpriv, struct dentry *dentry)
-LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct inode *inode,
-	 const char *name, void **buffer, bool alloc)
+LSM_HOOK(int, 0, inode_killpriv, struct user_namespace *mnt_userns,
+	 struct dentry *dentry)
+LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct user_namespace *mnt_userns,
+	 struct inode *inode, const char *name, void **buffer, bool alloc)
 LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode,
 	 const char *name, const void *value, size_t size, int flags)
 LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer,
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a19adef1f088..98a5e263aabb 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -444,6 +444,7 @@
  * @inode_killpriv:
  *	The setuid bit is being removed.  Remove similar security labels.
  *	Called with the dentry->d_inode->i_mutex held.
+ *	@mnt_userns: user namespace of the mount
  *	@dentry is the dentry being changed.
  *	Return 0 on success.  If error is returned, then the operation
  *	causing setuid bit removal is failed.
diff --git a/include/linux/security.h b/include/linux/security.h
index c35ea0ffccd9..4e4f6c31e413 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -145,13 +145,16 @@ extern int cap_capset(struct cred *new, const struct cred *old,
 		      const kernel_cap_t *inheritable,
 		      const kernel_cap_t *permitted);
 extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
-extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
-			      const void *value, size_t size, int flags);
-extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
-extern int cap_inode_need_killpriv(struct dentry *dentry);
-extern int cap_inode_killpriv(struct dentry *dentry);
-extern int cap_inode_getsecurity(struct inode *inode, const char *name,
-				 void **buffer, bool alloc);
+int cap_inode_setxattr(struct dentry *dentry, const char *name,
+		       const void *value, size_t size, int flags);
+int cap_inode_removexattr(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, const char *name);
+int cap_inode_need_killpriv(struct dentry *dentry);
+int cap_inode_killpriv(struct user_namespace *mnt_userns,
+		       struct dentry *dentry);
+int cap_inode_getsecurity(struct user_namespace *mnt_userns,
+			  struct inode *inode, const char *name, void **buffer,
+			  bool alloc);
 extern int cap_mmap_addr(unsigned long addr);
 extern int cap_mmap_file(struct file *file, unsigned long reqprot,
 			 unsigned long prot, unsigned long flags);
@@ -345,16 +348,21 @@ int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
 int security_inode_permission(struct inode *inode, int mask);
 int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
 int security_inode_getattr(const struct path *path);
-int security_inode_setxattr(struct dentry *dentry, const char *name,
+int security_inode_setxattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, const char *name,
 			    const void *value, size_t size, int flags);
 void security_inode_post_setxattr(struct dentry *dentry, const char *name,
 				  const void *value, size_t size, int flags);
 int security_inode_getxattr(struct dentry *dentry, const char *name);
 int security_inode_listxattr(struct dentry *dentry);
-int security_inode_removexattr(struct dentry *dentry, const char *name);
+int security_inode_removexattr(struct user_namespace *mnt_userns,
+			       struct dentry *dentry, const char *name);
 int security_inode_need_killpriv(struct dentry *dentry);
-int security_inode_killpriv(struct dentry *dentry);
-int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc);
+int security_inode_killpriv(struct user_namespace *mnt_userns,
+			    struct dentry *dentry);
+int security_inode_getsecurity(struct user_namespace *mnt_userns,
+			       struct inode *inode, const char *name,
+			       void **buffer, bool alloc);
 int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags);
 int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size);
 void security_inode_getsecid(struct inode *inode, u32 *secid);
@@ -831,8 +839,9 @@ static inline int security_inode_getattr(const struct path *path)
 	return 0;
 }
 
-static inline int security_inode_setxattr(struct dentry *dentry,
-		const char *name, const void *value, size_t size, int flags)
+static inline int security_inode_setxattr(struct user_namespace *mnt_userns,
+		struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
 {
 	return cap_inode_setxattr(dentry, name, value, size, flags);
 }
@@ -852,10 +861,11 @@ static inline int security_inode_listxattr(struct dentry *dentry)
 	return 0;
 }
 
-static inline int security_inode_removexattr(struct dentry *dentry,
-			const char *name)
+static inline int security_inode_removexattr(struct user_namespace *mnt_userns,
+					     struct dentry *dentry,
+					     const char *name)
 {
-	return cap_inode_removexattr(dentry, name);
+	return cap_inode_removexattr(mnt_userns, dentry, name);
 }
 
 static inline int security_inode_need_killpriv(struct dentry *dentry)
@@ -863,14 +873,18 @@ static inline int security_inode_need_killpriv(struct dentry *dentry)
 	return cap_inode_need_killpriv(dentry);
 }
 
-static inline int security_inode_killpriv(struct dentry *dentry)
+static inline int security_inode_killpriv(struct user_namespace *mnt_userns,
+					  struct dentry *dentry)
 {
-	return cap_inode_killpriv(dentry);
+	return cap_inode_killpriv(mnt_userns, dentry);
 }
 
-static inline int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+static inline int security_inode_getsecurity(struct user_namespace *mnt_userns,
+					     struct inode *inode,
+					     const char *name, void **buffer,
+					     bool alloc)
 {
-	return cap_inode_getsecurity(inode, name, buffer, alloc);
+	return cap_inode_getsecurity(mnt_userns, inode, name, buffer, alloc);
 }
 
 static inline int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ce8c9e2279ba..fdfdd71405ff 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1930,7 +1930,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
 	if (!dentry)
 		return 0;
 
-	rc = get_vfs_caps_from_disk(dentry, &caps);
+	rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
 	if (rc)
 		return rc;
 
@@ -2481,7 +2481,8 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 	ax->d.next = context->aux;
 	context->aux = (void *)ax;
 
-	get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+	get_vfs_caps_from_disk(&init_user_ns,
+			       bprm->file->f_path.dentry, &vcaps);
 
 	ax->fcap.permitted = vcaps.permitted;
 	ax->fcap.inheritable = vcaps.inheritable;
diff --git a/security/commoncap.c b/security/commoncap.c
index 745dc1f2c97f..234b074c2c58 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -303,17 +303,25 @@ int cap_inode_need_killpriv(struct dentry *dentry)
 
 /**
  * cap_inode_killpriv - Erase the security markings on an inode
- * @dentry: The inode/dentry to alter
+ *
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dentry:	The inode/dentry to alter
  *
  * Erase the privilege-enhancing security markings on an inode.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * Returns 0 if successful, -ve on error.
  */
-int cap_inode_killpriv(struct dentry *dentry)
+int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry *dentry)
 {
 	int error;
 
-	error = __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_CAPS);
+	error = __vfs_removexattr(mnt_userns, dentry, XATTR_NAME_CAPS);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 	return error;
@@ -366,7 +374,8 @@ static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
  * by the integrity subsystem, which really wants the unconverted values -
  * so that's good.
  */
-int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
+int cap_inode_getsecurity(struct user_namespace *mnt_userns,
+			  struct inode *inode, const char *name, void **buffer,
 			  bool alloc)
 {
 	int size, ret;
@@ -386,7 +395,7 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 		return -EINVAL;
 
 	size = sizeof(struct vfs_ns_cap_data);
-	ret = (int)vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_CAPS,
+	ret = (int)vfs_getxattr_alloc(mnt_userns, dentry, XATTR_NAME_CAPS,
 				      &tmpbuf, size, GFP_NOFS);
 	dput(dentry);
 
@@ -412,6 +421,9 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 	root = le32_to_cpu(nscap->rootid);
 	kroot = make_kuid(fs_ns, root);
 
+	/* If this is an idmapped mount shift the kuid. */
+	kroot = kuid_into_mnt(mnt_userns, kroot);
+
 	/* If the root kuid maps to a valid uid in current ns, then return
 	 * this as a nscap. */
 	mappedroot = from_kuid(current_user_ns(), kroot);
@@ -595,10 +607,24 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 	return *effective ? ret : 0;
 }
 
-/*
+/**
+ * get_vfs_caps_from_disk - retrieve vfs caps from disk
+ *
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dentry:	dentry from which @inode is retrieved
+ * @cpu_caps:	vfs capabilities
+ *
  * Extract the on-exec-apply capability sets for an executable file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
  */
-int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
+int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
+			   const struct dentry *dentry,
+			   struct cpu_vfs_cap_data *cpu_caps)
 {
 	struct inode *inode = d_backing_inode(dentry);
 	__u32 magic_etc;
@@ -654,6 +680,7 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
 	/* Limit the caps to the mounter of the filesystem
 	 * or the more limited uid specified in the xattr.
 	 */
+	rootkuid = kuid_into_mnt(mnt_userns, rootkuid);
 	if (!rootid_owns_currentns(rootkuid))
 		return -ENODATA;
 
@@ -699,7 +726,8 @@ static int get_file_caps(struct linux_binprm *bprm, struct file *file,
 	if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
 		return 0;
 
-	rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps);
+	rc = get_vfs_caps_from_disk(file_mnt_user_ns(file),
+				    file->f_path.dentry, &vcaps);
 	if (rc < 0) {
 		if (rc == -EINVAL)
 			printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
@@ -964,16 +992,25 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
 
 /**
  * cap_inode_removexattr - Determine whether an xattr may be removed
- * @dentry: The inode/dentry being altered
- * @name: The name of the xattr to be changed
+ *
+ * @mnt_userns:	User namespace of the mount the inode was found from
+ * @dentry:	The inode/dentry being altered
+ * @name:	The name of the xattr to be changed
  *
  * Determine whether an xattr may be removed from an inode, returning 0 if
  * permission is granted, -ve if denied.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before checking
+ * permissions. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
+ *
  * This is used to make sure security xattrs don't get removed by those who
  * aren't privileged to remove them.
  */
-int cap_inode_removexattr(struct dentry *dentry, const char *name)
+int cap_inode_removexattr(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, const char *name)
 {
 	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
 
@@ -987,8 +1024,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
 		struct inode *inode = d_backing_inode(dentry);
 		if (!inode)
 			return -EINVAL;
-		if (!capable_wrt_inode_uidgid(&init_user_ns, inode,
-					      CAP_SETFCAP))
+		if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
 			return -EPERM;
 		return 0;
 	}
diff --git a/security/security.c b/security/security.c
index 7b09cfbae94f..698a9f17bad7 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1280,7 +1280,8 @@ int security_inode_getattr(const struct path *path)
 	return call_int_hook(inode_getattr, 0, path);
 }
 
-int security_inode_setxattr(struct dentry *dentry, const char *name,
+int security_inode_setxattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, const char *name,
 			    const void *value, size_t size, int flags)
 {
 	int ret;
@@ -1291,8 +1292,8 @@ int security_inode_setxattr(struct dentry *dentry, const char *name,
 	 * SELinux and Smack integrate the cap call,
 	 * so assume that all LSMs supplying this call do so.
 	 */
-	ret = call_int_hook(inode_setxattr, 1, dentry, name, value, size,
-				flags);
+	ret = call_int_hook(inode_setxattr, 1, mnt_userns, dentry, name, value,
+			    size, flags);
 
 	if (ret == 1)
 		ret = cap_inode_setxattr(dentry, name, value, size, flags);
@@ -1327,7 +1328,8 @@ int security_inode_listxattr(struct dentry *dentry)
 	return call_int_hook(inode_listxattr, 0, dentry);
 }
 
-int security_inode_removexattr(struct dentry *dentry, const char *name)
+int security_inode_removexattr(struct user_namespace *mnt_userns,
+			       struct dentry *dentry, const char *name)
 {
 	int ret;
 
@@ -1337,9 +1339,9 @@ int security_inode_removexattr(struct dentry *dentry, const char *name)
 	 * SELinux and Smack integrate the cap call,
 	 * so assume that all LSMs supplying this call do so.
 	 */
-	ret = call_int_hook(inode_removexattr, 1, dentry, name);
+	ret = call_int_hook(inode_removexattr, 1, mnt_userns, dentry, name);
 	if (ret == 1)
-		ret = cap_inode_removexattr(dentry, name);
+		ret = cap_inode_removexattr(mnt_userns, dentry, name);
 	if (ret)
 		return ret;
 	ret = ima_inode_removexattr(dentry, name);
@@ -1353,12 +1355,15 @@ int security_inode_need_killpriv(struct dentry *dentry)
 	return call_int_hook(inode_need_killpriv, 0, dentry);
 }
 
-int security_inode_killpriv(struct dentry *dentry)
+int security_inode_killpriv(struct user_namespace *mnt_userns,
+			    struct dentry *dentry)
 {
-	return call_int_hook(inode_killpriv, 0, dentry);
+	return call_int_hook(inode_killpriv, 0, mnt_userns, dentry);
 }
 
-int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+int security_inode_getsecurity(struct user_namespace *mnt_userns,
+			       struct inode *inode, const char *name,
+			       void **buffer, bool alloc)
 {
 	struct security_hook_list *hp;
 	int rc;
@@ -1369,7 +1374,7 @@ int security_inode_getsecurity(struct inode *inode, const char *name, void **buf
 	 * Only one module will provide an attribute with a given name.
 	 */
 	hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) {
-		rc = hp->hook.inode_getsecurity(inode, name, buffer, alloc);
+		rc = hp->hook.inode_getsecurity(mnt_userns, inode, name, buffer, alloc);
 		if (rc != LSM_RET_DEFAULT(inode_getsecurity))
 			return rc;
 	}
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 2efedd7001b2..9719dd124221 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3119,7 +3119,8 @@ static bool has_cap_mac_admin(bool audit)
 	return true;
 }
 
-static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
+static int selinux_inode_setxattr(struct user_namespace *mnt_userns,
+				  struct dentry *dentry, const char *name,
 				  const void *value, size_t size, int flags)
 {
 	struct inode *inode = d_backing_inode(dentry);
@@ -3140,13 +3141,13 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
 	}
 
 	if (!selinux_initialized(&selinux_state))
-		return (inode_owner_or_capable(&init_user_ns, inode) ? 0 : -EPERM);
+		return (inode_owner_or_capable(mnt_userns, inode) ? 0 : -EPERM);
 
 	sbsec = inode->i_sb->s_security;
 	if (!(sbsec->flags & SBLABEL_MNT))
 		return -EOPNOTSUPP;
 
-	if (!inode_owner_or_capable(&init_user_ns, inode))
+	if (!inode_owner_or_capable(mnt_userns, inode))
 		return -EPERM;
 
 	ad.type = LSM_AUDIT_DATA_DENTRY;
@@ -3267,10 +3268,11 @@ static int selinux_inode_listxattr(struct dentry *dentry)
 	return dentry_has_perm(cred, dentry, FILE__GETATTR);
 }
 
-static int selinux_inode_removexattr(struct dentry *dentry, const char *name)
+static int selinux_inode_removexattr(struct user_namespace *mnt_userns,
+				     struct dentry *dentry, const char *name)
 {
 	if (strcmp(name, XATTR_NAME_SELINUX)) {
-		int rc = cap_inode_removexattr(dentry, name);
+		int rc = cap_inode_removexattr(mnt_userns, dentry, name);
 		if (rc)
 			return rc;
 
@@ -3336,7 +3338,9 @@ static int selinux_path_notify(const struct path *path, u64 mask,
  *
  * Permission check is handled by selinux_inode_getxattr hook.
  */
-static int selinux_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+static int selinux_inode_getsecurity(struct user_namespace *mnt_userns,
+				     struct inode *inode, const char *name,
+				     void **buffer, bool alloc)
 {
 	u32 size;
 	int error;
@@ -6533,8 +6537,8 @@ static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
 static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
 {
 	int len = 0;
-	len = selinux_inode_getsecurity(inode, XATTR_SELINUX_SUFFIX,
-						ctx, true);
+	len = selinux_inode_getsecurity(&init_user_ns, inode,
+					XATTR_SELINUX_SUFFIX, ctx, true);
 	if (len < 0)
 		return len;
 	*ctxlen = len;
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 746e5743accc..12a45e61c1a5 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1240,7 +1240,8 @@ static int smack_inode_getattr(const struct path *path)
  *
  * Returns 0 if access is permitted, an error code otherwise
  */
-static int smack_inode_setxattr(struct dentry *dentry, const char *name,
+static int smack_inode_setxattr(struct user_namespace *mnt_userns,
+				struct dentry *dentry, const char *name,
 				const void *value, size_t size, int flags)
 {
 	struct smk_audit_info ad;
@@ -1362,7 +1363,8 @@ static int smack_inode_getxattr(struct dentry *dentry, const char *name)
  *
  * Returns 0 if access is permitted, an error code otherwise
  */
-static int smack_inode_removexattr(struct dentry *dentry, const char *name)
+static int smack_inode_removexattr(struct user_namespace *mnt_userns,
+				   struct dentry *dentry, const char *name)
 {
 	struct inode_smack *isp;
 	struct smk_audit_info ad;
@@ -1377,7 +1379,7 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
 		if (!smack_privileged(CAP_MAC_ADMIN))
 			rc = -EPERM;
 	} else
-		rc = cap_inode_removexattr(dentry, name);
+		rc = cap_inode_removexattr(mnt_userns, dentry, name);
 
 	if (rc != 0)
 		return rc;
@@ -1420,9 +1422,9 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
  *
  * Returns the size of the attribute or an error code
  */
-static int smack_inode_getsecurity(struct inode *inode,
-				   const char *name, void **buffer,
-				   bool alloc)
+static int smack_inode_getsecurity(struct user_namespace *mnt_userns,
+				   struct inode *inode, const char *name,
+				   void **buffer, bool alloc)
 {
 	struct socket_smack *ssp;
 	struct socket *sock;
-- 
cgit v1.2.3


From 0d56a4518d5eaf595a24ab2202e171330bb2ed72 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:30 +0100
Subject: stat: handle idmapped mounts

The generic_fillattr() helper fills in the basic attributes associated
with an inode. Enable it to handle idmapped mounts. If the inode is
accessed through an idmapped mount map it into the mount's user
namespace before we store the uid and gid. If the initial user namespace
is passed nothing changes so non-idmapped mounts will see identical
behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-12-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/9p/vfs_inode.c      |  4 ++--
 fs/9p/vfs_inode_dotl.c |  4 ++--
 fs/afs/inode.c         |  2 +-
 fs/btrfs/inode.c       |  2 +-
 fs/ceph/inode.c        |  2 +-
 fs/cifs/inode.c        |  2 +-
 fs/coda/inode.c        |  2 +-
 fs/ecryptfs/inode.c    |  4 ++--
 fs/erofs/inode.c       |  2 +-
 fs/exfat/file.c        |  2 +-
 fs/ext2/inode.c        |  2 +-
 fs/ext4/inode.c        |  2 +-
 fs/f2fs/file.c         |  2 +-
 fs/fat/file.c          |  2 +-
 fs/fuse/dir.c          |  2 +-
 fs/gfs2/inode.c        |  2 +-
 fs/hfsplus/inode.c     |  2 +-
 fs/kernfs/inode.c      |  2 +-
 fs/libfs.c             |  4 ++--
 fs/minix/inode.c       |  2 +-
 fs/nfs/inode.c         |  2 +-
 fs/nfs/namespace.c     |  2 +-
 fs/ocfs2/file.c        |  2 +-
 fs/orangefs/inode.c    |  2 +-
 fs/proc/base.c         |  4 ++--
 fs/proc/generic.c      |  2 +-
 fs/proc/proc_net.c     |  2 +-
 fs/proc/proc_sysctl.c  |  2 +-
 fs/proc/root.c         |  2 +-
 fs/stat.c              | 20 ++++++++++++++------
 fs/sysv/itree.c        |  2 +-
 fs/ubifs/dir.c         |  2 +-
 fs/udf/symlink.c       |  2 +-
 fs/vboxsf/utils.c      |  2 +-
 include/linux/fs.h     |  2 +-
 mm/shmem.c             |  2 +-
 36 files changed, 54 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9c3ff6e9ab82..c21b146c8d91 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1027,7 +1027,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
 	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		generic_fillattr(d_inode(dentry), stat);
+		generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 		return 0;
 	}
 	fid = v9fs_fid_lookup(dentry);
@@ -1040,7 +1040,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
 		return PTR_ERR(st);
 
 	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
-	generic_fillattr(d_inode(dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 
 	p9stat_free(st);
 	kfree(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 302553101fcb..984f28315d2a 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -468,7 +468,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
 	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		generic_fillattr(d_inode(dentry), stat);
+		generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 		return 0;
 	}
 	fid = v9fs_fid_lookup(dentry);
@@ -485,7 +485,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
 		return PTR_ERR(st);
 
 	v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
-	generic_fillattr(d_inode(dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 	/* Change block size to what the server returned */
 	stat->blksize = st->st_blksize;
 
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index b0d7b892090d..795ee5cb3817 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -745,7 +745,7 @@ int afs_getattr(const struct path *path, struct kstat *stat,
 
 	do {
 		read_seqbegin_or_lock(&vnode->cb_lock, &seq);
-		generic_fillattr(inode, stat);
+		generic_fillattr(&init_user_ns, inode, stat);
 		if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
 		    stat->nlink > 0)
 			stat->nlink -= 1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6c18fb1a25af..a63faed171de 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8842,7 +8842,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
 				  STATX_ATTR_IMMUTABLE |
 				  STATX_ATTR_NODUMP);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->dev = BTRFS_I(inode)->root->anon_dev;
 
 	spin_lock(&BTRFS_I(inode)->lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 145e26a4ddbb..179a2bb88538 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2385,7 +2385,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
 			return err;
 	}
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->ino = ceph_present_inode(inode);
 
 	/*
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 27554f71f744..374abce7efaf 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2408,7 +2408,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
 			return rc;
 	}
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->blksize = cifs_sb->ctx->bsize;
 	stat->ino = CIFS_I(inode)->uniqueid;
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b1c70e2b9b1e..4d113e191cb8 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -256,7 +256,7 @@ int coda_getattr(const struct path *path, struct kstat *stat,
 {
 	int err = coda_revalidate_inode(d_inode(path->dentry));
 	if (!err)
-		generic_fillattr(d_inode(path->dentry), stat);
+		generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
 	return err;
 }
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index b9ccc4085d46..385b5e8741c0 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -977,7 +977,7 @@ static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat,
 
 	mount_crypt_stat = &ecryptfs_superblock_to_private(
 						dentry->d_sb)->mount_crypt_stat;
-	generic_fillattr(d_inode(dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
 		char *target;
 		size_t targetsiz;
@@ -1005,7 +1005,7 @@ static int ecryptfs_getattr(const struct path *path, struct kstat *stat,
 	if (!rc) {
 		fsstack_copy_attr_all(d_inode(dentry),
 				      ecryptfs_inode_to_lower(d_inode(dentry)));
-		generic_fillattr(d_inode(dentry), stat);
+		generic_fillattr(&init_user_ns, d_inode(dentry), stat);
 		stat->blocks = lower_stat.blocks;
 	}
 	return rc;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 3e21c0e8adae..083818063ac6 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -343,7 +343,7 @@ int erofs_getattr(const struct path *path, struct kstat *stat,
 	stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
 				  STATX_ATTR_IMMUTABLE);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index ace35aa8e64b..e9705b3295d3 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -273,7 +273,7 @@ int exfat_getattr(const struct path *path, struct kstat *stat,
 	struct inode *inode = d_backing_inode(path->dentry);
 	struct exfat_inode_info *ei = EXFAT_I(inode);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	exfat_truncate_atime(&stat->atime);
 	stat->result_mask |= STATX_BTIME;
 	stat->btime.tv_sec = ei->i_crtime.tv_sec;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 9de813635d8d..3d8acafca8ce 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1660,7 +1660,7 @@ int ext2_getattr(const struct path *path, struct kstat *stat,
 			STATX_ATTR_IMMUTABLE |
 			STATX_ATTR_NODUMP);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 24ea5851e90a..3a303d3f8423 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5571,7 +5571,7 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
 				  STATX_ATTR_NODUMP |
 				  STATX_ATTR_VERITY);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6ccdfe0606d9..44cd0dbdbb5d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -820,7 +820,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 				  STATX_ATTR_NODUMP |
 				  STATX_ATTR_VERITY);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	/* we need to show initial sectors used for inline_data/dentries */
 	if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 805b501467e9..f7e04f533d31 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -398,7 +398,7 @@ int fat_getattr(const struct path *path, struct kstat *stat,
 		u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
 
 	if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 74fdb6a7ebb3..d2e318ed9b26 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1087,7 +1087,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
 		forget_all_cached_acls(inode);
 		err = fuse_do_getattr(inode, stat, file);
 	} else if (stat) {
-		generic_fillattr(inode, stat);
+		generic_fillattr(&init_user_ns, inode, stat);
 		stat->mode = fi->orig_i_mode;
 		stat->ino = fi->orig_ino;
 	}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 728405d15a05..226b5b1dc1fa 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -2050,7 +2050,7 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat,
 				  STATX_ATTR_IMMUTABLE |
 				  STATX_ATTR_NODUMP);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	if (gfs2_holder_initialized(&gh))
 		gfs2_glock_dq_uninit(&gh);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index ffa137f8234e..642e067d8fe8 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -286,7 +286,7 @@ int hfsplus_getattr(const struct path *path, struct kstat *stat,
 	stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE |
 				 STATX_ATTR_NODUMP;
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 7e44052b42e1..032d3d7546d8 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -193,7 +193,7 @@ int kernfs_iop_getattr(const struct path *path, struct kstat *stat,
 	kernfs_refresh_inode(kn, inode);
 	mutex_unlock(&kernfs_mutex);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/libfs.c b/fs/libfs.c
index a73fe109403c..508e9ea8e6f3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -31,7 +31,7 @@ int simple_getattr(const struct path *path, struct kstat *stat,
 		   u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
 	return 0;
 }
@@ -1304,7 +1304,7 @@ static int empty_dir_getattr(const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 34f546404aa1..91c81d2fc90d 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -658,7 +658,7 @@ int minix_getattr(const struct path *path, struct kstat *stat,
 	struct super_block *sb = path->dentry->d_sb;
 	struct inode *inode = d_inode(path->dentry);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	if (INODE_VERSION(inode) == MINIX_V1)
 		stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb);
 	else
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 522aa10a1a3e..cab123ec1664 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -857,7 +857,7 @@ out_no_revalidate:
 	/* Only return attributes that were revalidated. */
 	stat->result_mask &= request_mask;
 out_no_update:
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
 	if (S_ISDIR(inode->i_mode))
 		stat->blksize = NFS_SERVER(inode)->dtsize;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 2bcbe38afe2e..55fc711e368b 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -213,7 +213,7 @@ nfs_namespace_getattr(const struct path *path, struct kstat *stat,
 {
 	if (NFS_FH(d_inode(path->dentry))->size != 0)
 		return nfs_getattr(path, stat, request_mask, query_flags);
-	generic_fillattr(d_inode(path->dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
 	return 0;
 }
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index cabf355b148f..a070d4c9b6ed 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1313,7 +1313,7 @@ int ocfs2_getattr(const struct path *path, struct kstat *stat,
 		goto bail;
 	}
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	/*
 	 * If there is inline data in the inode, the inode will normally not
 	 * have data blocks allocated (it may have an external xattr block).
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 563fe9ab8eb2..b94032f77e61 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -903,7 +903,7 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
 	ret = orangefs_inode_getattr(inode,
 	    request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0);
 	if (ret == 0) {
-		generic_fillattr(inode, stat);
+		generic_fillattr(&init_user_ns, inode, stat);
 
 		/* override block size reported to stat */
 		if (!(request_mask & STATX_SIZE))
diff --git a/fs/proc/base.c b/fs/proc/base.c
index bb4e63a3684f..d45aa68c1f17 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1934,7 +1934,7 @@ int pid_getattr(const struct path *path, struct kstat *stat,
 	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
 	struct task_struct *task;
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	stat->uid = GLOBAL_ROOT_UID;
 	stat->gid = GLOBAL_ROOT_GID;
@@ -3803,7 +3803,7 @@ static int proc_task_getattr(const struct path *path, struct kstat *stat,
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct task_struct *p = get_proc_task(inode);
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	if (p) {
 		stat->nlink += get_nr_threads(p);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 6d4fabab8aa7..0db96a761149 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -145,7 +145,7 @@ static int proc_getattr(const struct path *path, struct kstat *stat,
 		}
 	}
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	return 0;
 }
 
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 18601042af99..4aef49ccf571 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -297,7 +297,7 @@ static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
 
 	net = get_proc_task_net(inode);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	if (net != NULL) {
 		stat->nlink = net->proc_net->nlink;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ec67dbc1f705..87c828348140 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -840,7 +840,7 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat,
 	if (IS_ERR(head))
 		return PTR_ERR(head);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	if (table)
 		stat->mode = (stat->mode & S_IFMT) | table->mode;
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5e444d4f9717..244e4b6f15ef 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -311,7 +311,7 @@ void __init proc_root_init(void)
 static int proc_root_getattr(const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
-	generic_fillattr(d_inode(path->dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
 	stat->nlink = proc_root.nlink + nr_processes();
 	return 0;
 }
diff --git a/fs/stat.c b/fs/stat.c
index dacecdda2e79..2c471c2fd766 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -26,21 +26,29 @@
 
 /**
  * generic_fillattr - Fill in the basic attributes from the inode struct
- * @inode: Inode to use as the source
- * @stat: Where to fill in the attributes
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @inode:	Inode to use as the source
+ * @stat:	Where to fill in the attributes
  *
  * Fill in the basic attributes in the kstat structure from data that's to be
  * found on the VFS inode structure.  This is the default if no getattr inode
  * operation is supplied.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then
+ * take care to map the inode according to @mnt_userns before filling in the
+ * uid and gid filds. On non-idmapped mounts or if permission checking is to be
+ * performed on the raw inode simply passs init_user_ns.
  */
-void generic_fillattr(struct inode *inode, struct kstat *stat)
+void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode,
+		      struct kstat *stat)
 {
 	stat->dev = inode->i_sb->s_dev;
 	stat->ino = inode->i_ino;
 	stat->mode = inode->i_mode;
 	stat->nlink = inode->i_nlink;
-	stat->uid = inode->i_uid;
-	stat->gid = inode->i_gid;
+	stat->uid = i_uid_into_mnt(mnt_userns, inode);
+	stat->gid = i_gid_into_mnt(mnt_userns, inode);
 	stat->rdev = inode->i_rdev;
 	stat->size = i_size_read(inode);
 	stat->atime = inode->i_atime;
@@ -87,7 +95,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
 		return inode->i_op->getattr(path, stat, request_mask,
 					    query_flags);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(mnt_user_ns(path->mnt), inode, stat);
 	return 0;
 }
 EXPORT_SYMBOL(vfs_getattr_nosec);
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index bcb67b0cabe7..83cffab6955f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -445,7 +445,7 @@ int sysv_getattr(const struct path *path, struct kstat *stat,
 		 u32 request_mask, unsigned int flags)
 {
 	struct super_block *s = path->dentry->d_sb;
-	generic_fillattr(d_inode(path->dentry), stat);
+	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
 	stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
 	stat->blksize = s->s_blocksize;
 	return 0;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 694e7714545b..a8881ed61620 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1589,7 +1589,7 @@ int ubifs_getattr(const struct path *path, struct kstat *stat,
 				STATX_ATTR_ENCRYPTED |
 				STATX_ATTR_IMMUTABLE);
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	stat->blksize = UBIFS_BLOCK_SIZE;
 	stat->size = ui->ui_size;
 
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index c973db239604..54a44d1f023c 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -159,7 +159,7 @@ static int udf_symlink_getattr(const struct path *path, struct kstat *stat,
 	struct inode *inode = d_backing_inode(dentry);
 	struct page *page;
 
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 	page = read_mapping_page(inode->i_mapping, 0, NULL);
 	if (IS_ERR(page))
 		return PTR_ERR(page);
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 018057546067..d2cd1c99f48e 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -233,7 +233,7 @@ int vboxsf_getattr(const struct path *path, struct kstat *kstat,
 	if (err)
 		return err;
 
-	generic_fillattr(d_inode(dentry), kstat);
+	generic_fillattr(&init_user_ns, d_inode(dentry), kstat);
 	return 0;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e3ea1d7c3367..182641d8322f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3154,7 +3154,7 @@ extern int __page_symlink(struct inode *inode, const char *symname, int len,
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_link(void *);
-extern void generic_fillattr(struct inode *, struct kstat *);
+void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *);
 extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
 extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 void __inode_add_bytes(struct inode *inode, loff_t bytes);
diff --git a/mm/shmem.c b/mm/shmem.c
index 23b8e9c15a42..339d5530d3a9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1072,7 +1072,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat,
 		shmem_recalc_inode(inode);
 		spin_unlock_irq(&info->lock);
 	}
-	generic_fillattr(inode, stat);
+	generic_fillattr(&init_user_ns, inode, stat);
 
 	if (is_huge_enabled(sb_info))
 		stat->blksize = HPAGE_PMD_SIZE;
-- 
cgit v1.2.3


From ba73d98745be1c10dc3cce68e8d7b95012d07d05 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:31 +0100
Subject: namei: handle idmapped mounts in may_*() helpers

The may_follow_link(), may_linkat(), may_lookup(), may_open(),
may_o_create(), may_create_in_sticky(), may_delete(), and may_create()
helpers determine whether the caller is privileged enough to perform the
associated operations. Let them handle idmapped mounts by mapping the
inode or fsids according to the mount's user namespace. Afterwards the
checks are identical to non-idmapped inodes. The patch takes care to
retrieve the mount's user namespace right before performing permission
checks and passing it down into the fileystem so the user namespace
can't change in between by someone idmapping a mount that is currently
not idmapped. If the initial user namespace is passed nothing changes so
non-idmapped mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-13-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: James Morris <jamorris@linux.microsoft.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/btrfs/ioctl.c   |   5 +-
 fs/init.c          |   2 +-
 fs/inode.c         |   2 +-
 fs/internal.h      |   2 +-
 fs/namei.c         | 148 +++++++++++++++++++++++++++++++++--------------------
 fs/xattr.c         |   2 +-
 include/linux/fs.h |  14 +++--
 7 files changed, 108 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1f763c60415b..56f53d692fa2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -927,8 +927,9 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 		return error;
 	if (IS_APPEND(dir))
 		return -EPERM;
-	if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
-	    IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
+	if (check_sticky(&init_user_ns, dir, d_inode(victim)) ||
+	    IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
+	    IS_SWAPFILE(d_inode(victim)))
 		return -EPERM;
 	if (isdir) {
 		if (!d_is_dir(victim))
diff --git a/fs/init.c b/fs/init.c
index 02723bea8499..891284f8a443 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -181,7 +181,7 @@ int __init init_link(const char *oldname, const char *newname)
 	error = -EXDEV;
 	if (old_path.mnt != new_path.mnt)
 		goto out_dput;
-	error = may_linkat(&old_path);
+	error = may_linkat(&init_user_ns, &old_path);
 	if (unlikely(error))
 		goto out_dput;
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
diff --git a/fs/inode.c b/fs/inode.c
index 49b512592dcd..46116ef44c9f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1796,7 +1796,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
 	/* Atime updates will likely cause i_uid and i_gid to be written
 	 * back improprely if their true value is unknown to the vfs.
 	 */
-	if (HAS_UNMAPPED_ID(inode))
+	if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode))
 		return false;
 
 	if (IS_NOATIME(inode))
diff --git a/fs/internal.h b/fs/internal.h
index 77c50befbfbe..6c8a4eddc7e6 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,7 +73,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct path *);
 long do_rmdir(int dfd, struct filename *name);
 long do_unlinkat(int dfd, struct filename *name);
-int may_linkat(struct path *link);
+int may_linkat(struct user_namespace *mnt_userns, struct path *link);
 int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
 		 struct filename *newname, unsigned int flags);
 
diff --git a/fs/namei.c b/fs/namei.c
index 04b001ddade3..93fa7d803fb2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -506,7 +506,7 @@ int inode_permission(struct user_namespace *mnt_userns,
 		 * written back improperly if their true value is unknown
 		 * to the vfs.
 		 */
-		if (HAS_UNMAPPED_ID(inode))
+		if (HAS_UNMAPPED_ID(mnt_userns, inode))
 			return -EACCES;
 	}
 
@@ -1004,11 +1004,16 @@ int sysctl_protected_regular __read_mostly;
  */
 static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
 {
+	struct user_namespace *mnt_userns;
+	kuid_t i_uid;
+
 	if (!sysctl_protected_symlinks)
 		return 0;
 
+	mnt_userns = mnt_user_ns(nd->path.mnt);
+	i_uid = i_uid_into_mnt(mnt_userns, inode);
 	/* Allowed if owner and follower match. */
-	if (uid_eq(current_cred()->fsuid, inode->i_uid))
+	if (uid_eq(current_cred()->fsuid, i_uid))
 		return 0;
 
 	/* Allowed if parent directory not sticky and world-writable. */
@@ -1016,7 +1021,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod
 		return 0;
 
 	/* Allowed if parent directory and link owner match. */
-	if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid))
+	if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
 		return 0;
 
 	if (nd->flags & LOOKUP_RCU)
@@ -1029,6 +1034,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod
 
 /**
  * safe_hardlink_source - Check for safe hardlink conditions
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: the source inode to hardlink from
  *
  * Return false if at least one of the following conditions:
@@ -1039,7 +1045,8 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod
  *
  * Otherwise returns true.
  */
-static bool safe_hardlink_source(struct inode *inode)
+static bool safe_hardlink_source(struct user_namespace *mnt_userns,
+				 struct inode *inode)
 {
 	umode_t mode = inode->i_mode;
 
@@ -1056,7 +1063,7 @@ static bool safe_hardlink_source(struct inode *inode)
 		return false;
 
 	/* Hardlinking to unreadable or unwritable sources is dangerous. */
-	if (inode_permission(&init_user_ns, inode, MAY_READ | MAY_WRITE))
+	if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
 		return false;
 
 	return true;
@@ -1064,6 +1071,7 @@ static bool safe_hardlink_source(struct inode *inode)
 
 /**
  * may_linkat - Check permissions for creating a hardlink
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @link: the source to hardlink from
  *
  * Block hardlink when all of:
@@ -1072,14 +1080,21 @@ static bool safe_hardlink_source(struct inode *inode)
  *  - hardlink source is unsafe (see safe_hardlink_source() above)
  *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ *
  * Returns 0 if successful, -ve on error.
  */
-int may_linkat(struct path *link)
+int may_linkat(struct user_namespace *mnt_userns, struct path *link)
 {
 	struct inode *inode = link->dentry->d_inode;
 
 	/* Inode writeback is not safe when the uid or gid are invalid. */
-	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+	if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
 		return -EOVERFLOW;
 
 	if (!sysctl_protected_hardlinks)
@@ -1088,8 +1103,8 @@ int may_linkat(struct path *link)
 	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
 	 * otherwise, it must be a safe source.
 	 */
-	if (safe_hardlink_source(inode) ||
-	    inode_owner_or_capable(&init_user_ns, inode))
+	if (safe_hardlink_source(mnt_userns, inode) ||
+	    inode_owner_or_capable(mnt_userns, inode))
 		return 0;
 
 	audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
@@ -1100,6 +1115,7 @@ int may_linkat(struct path *link)
  * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
  *			  should be allowed, or not, on files that already
  *			  exist.
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dir_mode: mode bits of directory
  * @dir_uid: owner of directory
  * @inode: the inode of the file to open
@@ -1115,16 +1131,25 @@ int may_linkat(struct path *link)
  * the directory doesn't have to be world writable: being group writable will
  * be enough.
  *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ *
  * Returns 0 if the open is allowed, -ve on error.
  */
-static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
-				struct inode * const inode)
+static int may_create_in_sticky(struct user_namespace *mnt_userns,
+				struct nameidata *nd, struct inode *const inode)
 {
+	umode_t dir_mode = nd->dir_mode;
+	kuid_t dir_uid = nd->dir_uid;
+
 	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
 	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
 	    likely(!(dir_mode & S_ISVTX)) ||
-	    uid_eq(inode->i_uid, dir_uid) ||
-	    uid_eq(current_fsuid(), inode->i_uid))
+	    uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
+	    uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
 		return 0;
 
 	if (likely(dir_mode & 0002) ||
@@ -1614,17 +1639,18 @@ static struct dentry *lookup_slow(const struct qstr *name,
 	return res;
 }
 
-static inline int may_lookup(struct nameidata *nd)
+static inline int may_lookup(struct user_namespace *mnt_userns,
+			     struct nameidata *nd)
 {
 	if (nd->flags & LOOKUP_RCU) {
-		int err = inode_permission(&init_user_ns, nd->inode,
+		int err = inode_permission(mnt_userns, nd->inode,
 					   MAY_EXEC | MAY_NOT_BLOCK);
 		if (err != -ECHILD)
 			return err;
 		if (unlazy_walk(nd))
 			return -ECHILD;
 	}
-	return inode_permission(&init_user_ns, nd->inode, MAY_EXEC);
+	return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
 }
 
 static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
@@ -2177,7 +2203,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		u64 hash_len;
 		int type;
 
-		err = may_lookup(nd);
+		err = may_lookup(&init_user_ns, nd);
 		if (err)
 			return err;
 
@@ -2225,7 +2251,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 OK:
 			/* pathname or trailing symlink, done */
 			if (!depth) {
-				nd->dir_uid = nd->inode->i_uid;
+				nd->dir_uid = i_uid_into_mnt(&init_user_ns, nd->inode);
 				nd->dir_mode = nd->inode->i_mode;
 				nd->flags &= ~LOOKUP_PARENT;
 				return 0;
@@ -2703,15 +2729,16 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 }
 EXPORT_SYMBOL(user_path_at_empty);
 
-int __check_sticky(struct inode *dir, struct inode *inode)
+int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
+		   struct inode *inode)
 {
 	kuid_t fsuid = current_fsuid();
 
-	if (uid_eq(inode->i_uid, fsuid))
+	if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
 		return 0;
-	if (uid_eq(dir->i_uid, fsuid))
+	if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
 		return 0;
-	return !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FOWNER);
+	return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
 }
 EXPORT_SYMBOL(__check_sticky);
 
@@ -2735,7 +2762,8 @@ EXPORT_SYMBOL(__check_sticky);
  * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
  *     nfs_async_unlink().
  */
-static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
+static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *victim, bool isdir)
 {
 	struct inode *inode = d_backing_inode(victim);
 	int error;
@@ -2747,19 +2775,21 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 	BUG_ON(victim->d_parent->d_inode != dir);
 
 	/* Inode writeback is not safe when the uid or gid are invalid. */
-	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+	if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+	    !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
 		return -EOVERFLOW;
 
 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
-	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 	if (IS_APPEND(dir))
 		return -EPERM;
 
-	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
-	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
+	if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
+	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
+	    HAS_UNMAPPED_ID(mnt_userns, inode))
 		return -EPERM;
 	if (isdir) {
 		if (!d_is_dir(victim))
@@ -2784,7 +2814,8 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
  *  4. We should have write and exec permissions on dir
  *  5. We can't do it if dir is immutable (done in permission())
  */
-static inline int may_create(struct inode *dir, struct dentry *child)
+static inline int may_create(struct user_namespace *mnt_userns,
+			     struct inode *dir, struct dentry *child)
 {
 	struct user_namespace *s_user_ns;
 	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
@@ -2793,10 +2824,10 @@ static inline int may_create(struct inode *dir, struct dentry *child)
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
 	s_user_ns = dir->i_sb->s_user_ns;
-	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
-	    !kgid_has_mapping(s_user_ns, current_fsgid()))
+	if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
+	    !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
 		return -EOVERFLOW;
-	return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+	return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 }
 
 /*
@@ -2846,7 +2877,7 @@ EXPORT_SYMBOL(unlock_rename);
 int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		bool want_excl)
 {
-	int error = may_create(dir, dentry);
+	int error = may_create(&init_user_ns, dir, dentry);
 	if (error)
 		return error;
 
@@ -2869,7 +2900,7 @@ int vfs_mkobj(struct dentry *dentry, umode_t mode,
 		void *arg)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
-	int error = may_create(dir, dentry);
+	int error = may_create(&init_user_ns, dir, dentry);
 	if (error)
 		return error;
 
@@ -2891,7 +2922,8 @@ bool may_open_dev(const struct path *path)
 		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
 }
 
-static int may_open(const struct path *path, int acc_mode, int flag)
+static int may_open(struct user_namespace *mnt_userns, const struct path *path,
+		    int acc_mode, int flag)
 {
 	struct dentry *dentry = path->dentry;
 	struct inode *inode = dentry->d_inode;
@@ -2926,7 +2958,7 @@ static int may_open(const struct path *path, int acc_mode, int flag)
 		break;
 	}
 
-	error = inode_permission(&init_user_ns, inode, MAY_OPEN | acc_mode);
+	error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
 	if (error)
 		return error;
 
@@ -2941,7 +2973,7 @@ static int may_open(const struct path *path, int acc_mode, int flag)
 	}
 
 	/* O_NOATIME can only be set by the owner or superuser */
-	if (flag & O_NOATIME && !inode_owner_or_capable(&init_user_ns, inode))
+	if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
 		return -EPERM;
 
 	return 0;
@@ -2976,7 +3008,9 @@ static inline int open_to_namei_flags(int flag)
 	return flag;
 }
 
-static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
+static int may_o_create(struct user_namespace *mnt_userns,
+			const struct path *dir, struct dentry *dentry,
+			umode_t mode)
 {
 	struct user_namespace *s_user_ns;
 	int error = security_path_mknod(dir, dentry, mode, 0);
@@ -2984,11 +3018,11 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m
 		return error;
 
 	s_user_ns = dir->dentry->d_sb->s_user_ns;
-	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
-	    !kgid_has_mapping(s_user_ns, current_fsgid()))
+	if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
+	    !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
 		return -EOVERFLOW;
 
-	error = inode_permission(&init_user_ns, dir->dentry->d_inode,
+	error = inode_permission(mnt_userns, dir->dentry->d_inode,
 				 MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
@@ -3121,7 +3155,8 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 		if (!IS_POSIXACL(dir->d_inode))
 			mode &= ~current_umask();
 		if (likely(got_write))
-			create_error = may_o_create(&nd->path, dentry, mode);
+			create_error = may_o_create(&init_user_ns, &nd->path,
+						    dentry, mode);
 		else
 			create_error = -EROFS;
 	}
@@ -3282,7 +3317,7 @@ static int do_open(struct nameidata *nd,
 			return -EEXIST;
 		if (d_is_dir(nd->path.dentry))
 			return -EISDIR;
-		error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
+		error = may_create_in_sticky(&init_user_ns, nd,
 					     d_backing_inode(nd->path.dentry));
 		if (unlikely(error))
 			return error;
@@ -3302,7 +3337,7 @@ static int do_open(struct nameidata *nd,
 			return error;
 		do_truncate = true;
 	}
-	error = may_open(&nd->path, acc_mode, open_flag);
+	error = may_open(&init_user_ns, &nd->path, acc_mode, open_flag);
 	if (!error && !(file->f_mode & FMODE_OPENED))
 		error = vfs_open(&nd->path, file);
 	if (!error)
@@ -3377,7 +3412,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	path.dentry = child;
 	audit_inode(nd->name, child, 0);
 	/* Don't check for other permissions, the inode was just created */
-	error = may_open(&path, 0, op->open_flag);
+	error = may_open(&init_user_ns, &path, 0, op->open_flag);
 	if (error)
 		goto out2;
 	file->f_path.mnt = path.mnt;
@@ -3584,7 +3619,7 @@ EXPORT_SYMBOL(user_path_create);
 int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
-	int error = may_create(dir, dentry);
+	int error = may_create(&init_user_ns, dir, dentry);
 
 	if (error)
 		return error;
@@ -3685,7 +3720,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
 
 int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	int error = may_create(dir, dentry);
+	int error = may_create(&init_user_ns, dir, dentry);
 	unsigned max_links = dir->i_sb->s_max_links;
 
 	if (error)
@@ -3746,7 +3781,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
 
 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	int error = may_delete(dir, dentry, 1);
+	int error = may_delete(&init_user_ns, dir, dentry, 1);
 
 	if (error)
 		return error;
@@ -3868,7 +3903,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
 {
 	struct inode *target = dentry->d_inode;
-	int error = may_delete(dir, dentry, 0);
+	int error = may_delete(&init_user_ns, dir, dentry, 0);
 
 	if (error)
 		return error;
@@ -4000,7 +4035,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 
 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 {
-	int error = may_create(dir, dentry);
+	int error = may_create(&init_user_ns, dir, dentry);
 
 	if (error)
 		return error;
@@ -4089,7 +4124,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	if (!inode)
 		return -ENOENT;
 
-	error = may_create(dir, new_dentry);
+	error = may_create(&init_user_ns, dir, new_dentry);
 	if (error)
 		return error;
 
@@ -4106,7 +4141,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	 * be writen back improperly if their true value is unknown to
 	 * the vfs.
 	 */
-	if (HAS_UNMAPPED_ID(inode))
+	if (HAS_UNMAPPED_ID(&init_user_ns, inode))
 		return -EPERM;
 	if (!dir->i_op->link)
 		return -EPERM;
@@ -4188,7 +4223,7 @@ retry:
 	error = -EXDEV;
 	if (old_path.mnt != new_path.mnt)
 		goto out_dput;
-	error = may_linkat(&old_path);
+	error = may_linkat(&init_user_ns, &old_path);
 	if (unlikely(error))
 		goto out_dput;
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
@@ -4281,6 +4316,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	       struct inode **delegated_inode, unsigned int flags)
 {
 	int error;
+	struct user_namespace *mnt_userns = &init_user_ns;
 	bool is_dir = d_is_dir(old_dentry);
 	struct inode *source = old_dentry->d_inode;
 	struct inode *target = new_dentry->d_inode;
@@ -4291,19 +4327,19 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (source == target)
 		return 0;
 
-	error = may_delete(old_dir, old_dentry, is_dir);
+	error = may_delete(mnt_userns, old_dir, old_dentry, is_dir);
 	if (error)
 		return error;
 
 	if (!target) {
-		error = may_create(new_dir, new_dentry);
+		error = may_create(mnt_userns, new_dir, new_dentry);
 	} else {
 		new_is_dir = d_is_dir(new_dentry);
 
 		if (!(flags & RENAME_EXCHANGE))
-			error = may_delete(new_dir, new_dentry, is_dir);
+			error = may_delete(mnt_userns, new_dir, new_dentry, is_dir);
 		else
-			error = may_delete(new_dir, new_dentry, new_is_dir);
+			error = may_delete(mnt_userns, new_dir, new_dentry, new_is_dir);
 	}
 	if (error)
 		return error;
diff --git a/fs/xattr.c b/fs/xattr.c
index 79c439d31eaa..b3444e06cded 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -98,7 +98,7 @@ xattr_permission(struct user_namespace *mnt_userns, struct inode *inode,
 		 * to be writen back improperly if their true value is
 		 * unknown to the vfs.
 		 */
-		if (HAS_UNMAPPED_ID(inode))
+		if (HAS_UNMAPPED_ID(mnt_userns, inode))
 			return -EPERM;
 	}
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 182641d8322f..a27884af7222 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2083,9 +2083,11 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 #define IS_WHITEOUT(inode)	(S_ISCHR(inode->i_mode) && \
 				 (inode)->i_rdev == WHITEOUT_DEV)
 
-static inline bool HAS_UNMAPPED_ID(struct inode *inode)
+static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns,
+				   struct inode *inode)
 {
-	return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
+	return !uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+	       !gid_valid(i_gid_into_mnt(mnt_userns, inode));
 }
 
 static inline enum rw_hint file_write_hint(struct file *file)
@@ -2823,7 +2825,8 @@ static inline int path_permission(const struct path *path, int mask)
 	return inode_permission(mnt_user_ns(path->mnt),
 				d_inode(path->dentry), mask);
 }
-extern int __check_sticky(struct inode *dir, struct inode *inode);
+int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
+		   struct inode *inode);
 
 static inline bool execute_ok(struct inode *inode)
 {
@@ -3442,12 +3445,13 @@ static inline bool is_sxid(umode_t mode)
 	return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
 }
 
-static inline int check_sticky(struct inode *dir, struct inode *inode)
+static inline int check_sticky(struct user_namespace *mnt_userns,
+			       struct inode *dir, struct inode *inode)
 {
 	if (!(dir->i_mode & S_ISVTX))
 		return 0;
 
-	return __check_sticky(dir, inode);
+	return __check_sticky(mnt_userns, dir, inode);
 }
 
 static inline void inode_has_no_xattr(struct inode *inode)
-- 
cgit v1.2.3


From 9fe61450972d3900bffb1dc26a17ebb9cdd92db2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:32 +0100
Subject: namei: introduce struct renamedata

In order to handle idmapped mounts we will extend the vfs rename helper
to take two new arguments in follow up patches. Since this operations
already takes a bunch of arguments add a simple struct renamedata and
make the current helper use it before we extend it.

Link: https://lore.kernel.org/r/20210121131959.646623-14-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/cachefiles/namei.c    |  9 +++++++--
 fs/ecryptfs/inode.c      | 10 +++++++---
 fs/namei.c               | 21 +++++++++++++++------
 fs/nfsd/vfs.c            |  8 +++++++-
 fs/overlayfs/overlayfs.h |  9 ++++++++-
 include/linux/fs.h       | 12 +++++++++++-
 6 files changed, 55 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ecc8ecbbfa5a..7b987de0babe 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -412,9 +412,14 @@ try_again:
 	if (ret < 0) {
 		cachefiles_io_error(cache, "Rename security error %d", ret);
 	} else {
+		struct renamedata rd = {
+			.old_dir	= d_inode(dir),
+			.old_dentry	= rep,
+			.new_dir	= d_inode(cache->graveyard),
+			.new_dentry	= grave,
+		};
 		trace_cachefiles_rename(object, rep, grave, why);
-		ret = vfs_rename(d_inode(dir), rep,
-				 d_inode(cache->graveyard), grave, NULL, 0);
+		ret = vfs_rename(&rd);
 		if (ret != 0 && ret != -ENOMEM)
 			cachefiles_io_error(cache,
 					    "Rename failed with error %d", ret);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 385b5e8741c0..ff48abb09679 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -590,6 +590,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_new_dir_dentry;
 	struct dentry *trap;
 	struct inode *target_inode;
+	struct renamedata rd = {};
 
 	if (flags)
 		return -EINVAL;
@@ -619,9 +620,12 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		rc = -ENOTEMPTY;
 		goto out_lock;
 	}
-	rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry,
-			d_inode(lower_new_dir_dentry), lower_new_dentry,
-			NULL, 0);
+
+	rd.old_dir	= d_inode(lower_old_dir_dentry);
+	rd.old_dentry	= lower_old_dentry;
+	rd.new_dir	= d_inode(lower_new_dir_dentry);
+	rd.new_dentry	= lower_new_dentry;
+	rc = vfs_rename(&rd);
 	if (rc)
 		goto out_lock;
 	if (target_inode)
diff --git a/fs/namei.c b/fs/namei.c
index 93fa7d803fb2..38ab51881247 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4311,12 +4311,15 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *	   ->i_mutex on parents, which works but leads to some truly excessive
  *	   locking].
  */
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-	       struct inode *new_dir, struct dentry *new_dentry,
-	       struct inode **delegated_inode, unsigned int flags)
+int vfs_rename(struct renamedata *rd)
 {
 	int error;
 	struct user_namespace *mnt_userns = &init_user_ns;
+	struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
+	struct dentry *old_dentry = rd->old_dentry;
+	struct dentry *new_dentry = rd->new_dentry;
+	struct inode **delegated_inode = rd->delegated_inode;
+	unsigned int flags = rd->flags;
 	bool is_dir = d_is_dir(old_dentry);
 	struct inode *source = old_dentry->d_inode;
 	struct inode *target = new_dentry->d_inode;
@@ -4442,6 +4445,7 @@ EXPORT_SYMBOL(vfs_rename);
 int do_renameat2(int olddfd, struct filename *from, int newdfd,
 		 struct filename *to, unsigned int flags)
 {
+	struct renamedata rd;
 	struct dentry *old_dentry, *new_dentry;
 	struct dentry *trap;
 	struct path old_path, new_path;
@@ -4545,9 +4549,14 @@ retry_deleg:
 				     &new_path, new_dentry, flags);
 	if (error)
 		goto exit5;
-	error = vfs_rename(old_path.dentry->d_inode, old_dentry,
-			   new_path.dentry->d_inode, new_dentry,
-			   &delegated_inode, flags);
+
+	rd.old_dir	   = old_path.dentry->d_inode;
+	rd.old_dentry	   = old_dentry;
+	rd.new_dir	   = new_path.dentry->d_inode;
+	rd.new_dentry	   = new_dentry;
+	rd.delegated_inode = &delegated_inode;
+	rd.flags	   = flags;
+	error = vfs_rename(&rd);
 exit5:
 	dput(new_dentry);
 exit4:
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 37d85046b4d6..f7d83ff2b44e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1798,7 +1798,13 @@ retry:
 		close_cached = true;
 		goto out_dput_old;
 	} else {
-		host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
+		struct renamedata rd = {
+			.old_dir	= fdir,
+			.old_dentry	= odentry,
+			.new_dir	= tdir,
+			.new_dentry	= ndentry,
+		};
+		host_err = vfs_rename(&rd);
 		if (!host_err) {
 			host_err = commit_metadata(tfhp);
 			if (!host_err)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0002834f664a..426899681df7 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -214,9 +214,16 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
 				unsigned int flags)
 {
 	int err;
+	struct renamedata rd = {
+		.old_dir 	= olddir,
+		.old_dentry 	= olddentry,
+		.new_dir 	= newdir,
+		.new_dentry 	= newdentry,
+		.flags 		= flags,
+	};
 
 	pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
-	err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
+	err = vfs_rename(&rd);
 	if (err) {
 		pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
 			 olddentry, newdentry, err);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a27884af7222..430e457f67f1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1775,7 +1775,17 @@ extern int vfs_symlink(struct inode *, struct dentry *, const char *);
 extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
 extern int vfs_rmdir(struct inode *, struct dentry *);
 extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
+
+struct renamedata {
+	struct inode *old_dir;
+	struct dentry *old_dentry;
+	struct inode *new_dir;
+	struct dentry *new_dentry;
+	struct inode **delegated_inode;
+	unsigned int flags;
+} __randomize_layout;
+
+int vfs_rename(struct renamedata *);
 
 static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
 {
-- 
cgit v1.2.3


From 6521f8917082928a4cb637eb64b77b5f2f5b30fc Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:33 +0100
Subject: namei: prepare for idmapped mounts

The various vfs_*() helpers are called by filesystems or by the vfs
itself to perform core operations such as create, link, mkdir, mknod, rename,
rmdir, tmpfile and unlink. Enable them to handle idmapped mounts. If the
inode is accessed through an idmapped mount map it into the
mount's user namespace and pass it down. Afterwards the checks and
operations are identical to non-idmapped mounts. If the initial user
namespace is passed nothing changes so non-idmapped mounts will see
identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-15-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 drivers/base/devtmpfs.c  |  11 ++-
 fs/cachefiles/namei.c    |  12 ++-
 fs/ecryptfs/inode.c      |  33 ++++---
 fs/init.c                |  14 +--
 fs/namei.c               | 227 ++++++++++++++++++++++++++++++++++++++---------
 fs/nfsd/nfs4recover.c    |   6 +-
 fs/nfsd/vfs.c            |  19 ++--
 fs/overlayfs/dir.c       |   4 +-
 fs/overlayfs/overlayfs.h |  20 +++--
 include/linux/fs.h       |  32 ++++---
 ipc/mqueue.c             |   3 +-
 net/unix/af_unix.c       |   3 +-
 12 files changed, 279 insertions(+), 105 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 2e0c3cdb4184..653c8c6ac7a7 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -162,7 +162,7 @@ static int dev_mkdir(const char *name, umode_t mode)
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	err = vfs_mkdir(d_inode(path.dentry), dentry, mode);
+	err = vfs_mkdir(&init_user_ns, d_inode(path.dentry), dentry, mode);
 	if (!err)
 		/* mark as kernel-created inode */
 		d_inode(dentry)->i_private = &thread;
@@ -212,7 +212,8 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	err = vfs_mknod(d_inode(path.dentry), dentry, mode, dev->devt);
+	err = vfs_mknod(&init_user_ns, d_inode(path.dentry), dentry, mode,
+			dev->devt);
 	if (!err) {
 		struct iattr newattrs;
 
@@ -242,7 +243,8 @@ static int dev_rmdir(const char *name)
 		return PTR_ERR(dentry);
 	if (d_really_is_positive(dentry)) {
 		if (d_inode(dentry)->i_private == &thread)
-			err = vfs_rmdir(d_inode(parent.dentry), dentry);
+			err = vfs_rmdir(&init_user_ns, d_inode(parent.dentry),
+					dentry);
 		else
 			err = -EPERM;
 	} else {
@@ -330,7 +332,8 @@ static int handle_remove(const char *nodename, struct device *dev)
 			inode_lock(d_inode(dentry));
 			notify_change(&init_user_ns, dentry, &newattrs, NULL);
 			inode_unlock(d_inode(dentry));
-			err = vfs_unlink(d_inode(parent.dentry), dentry, NULL);
+			err = vfs_unlink(&init_user_ns, d_inode(parent.dentry),
+					 dentry, NULL);
 			if (!err || err == -ENOENT)
 				deleted = 1;
 		}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7b987de0babe..7bf0732ae25c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -311,7 +311,8 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 			cachefiles_io_error(cache, "Unlink security error");
 		} else {
 			trace_cachefiles_unlink(object, rep, why);
-			ret = vfs_unlink(d_inode(dir), rep, NULL);
+			ret = vfs_unlink(&init_user_ns, d_inode(dir), rep,
+					 NULL);
 
 			if (preemptive)
 				cachefiles_mark_object_buried(cache, rep, why);
@@ -413,8 +414,10 @@ try_again:
 		cachefiles_io_error(cache, "Rename security error %d", ret);
 	} else {
 		struct renamedata rd = {
+			.old_mnt_userns	= &init_user_ns,
 			.old_dir	= d_inode(dir),
 			.old_dentry	= rep,
+			.new_mnt_userns	= &init_user_ns,
 			.new_dir	= d_inode(cache->graveyard),
 			.new_dentry	= grave,
 		};
@@ -566,7 +569,7 @@ lookup_again:
 			if (ret < 0)
 				goto create_error;
 			start = jiffies;
-			ret = vfs_mkdir(d_inode(dir), next, 0);
+			ret = vfs_mkdir(&init_user_ns, d_inode(dir), next, 0);
 			cachefiles_hist(cachefiles_mkdir_histogram, start);
 			if (!key)
 				trace_cachefiles_mkdir(object, next, ret);
@@ -602,7 +605,8 @@ lookup_again:
 			if (ret < 0)
 				goto create_error;
 			start = jiffies;
-			ret = vfs_create(d_inode(dir), next, S_IFREG, true);
+			ret = vfs_create(&init_user_ns, d_inode(dir), next,
+					 S_IFREG, true);
 			cachefiles_hist(cachefiles_create_histogram, start);
 			trace_cachefiles_create(object, next, ret);
 			if (ret < 0)
@@ -796,7 +800,7 @@ retry:
 		ret = security_path_mkdir(&path, subdir, 0700);
 		if (ret < 0)
 			goto mkdir_error;
-		ret = vfs_mkdir(d_inode(dir), subdir, 0700);
+		ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700);
 		if (ret < 0)
 			goto mkdir_error;
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ff48abb09679..73e3d47e7b2d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -141,7 +141,8 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
 	else if (d_unhashed(lower_dentry))
 		rc = -EINVAL;
 	else
-		rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
+		rc = vfs_unlink(&init_user_ns, lower_dir_inode, lower_dentry,
+				NULL);
 	if (rc) {
 		printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
 		goto out_unlock;
@@ -180,7 +181,8 @@ ecryptfs_do_create(struct inode *directory_inode,
 
 	lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true);
+	rc = vfs_create(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+			mode, true);
 	if (rc) {
 		printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
 		       "rc = [%d]\n", __func__, rc);
@@ -190,7 +192,8 @@ ecryptfs_do_create(struct inode *directory_inode,
 	inode = __ecryptfs_get_inode(d_inode(lower_dentry),
 				     directory_inode->i_sb);
 	if (IS_ERR(inode)) {
-		vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL);
+		vfs_unlink(&init_user_ns, d_inode(lower_dir_dentry),
+			   lower_dentry, NULL);
 		goto out_lock;
 	}
 	fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry));
@@ -436,8 +439,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 	dget(lower_old_dentry);
 	dget(lower_new_dentry);
 	lower_dir_dentry = lock_parent(lower_new_dentry);
-	rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
-		      lower_new_dentry, NULL);
+	rc = vfs_link(lower_old_dentry, &init_user_ns,
+		      d_inode(lower_dir_dentry), lower_new_dentry, NULL);
 	if (rc || d_really_is_negative(lower_new_dentry))
 		goto out_lock;
 	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
@@ -481,7 +484,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 						  strlen(symname));
 	if (rc)
 		goto out_lock;
-	rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry,
+	rc = vfs_symlink(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
 			 encoded_symname);
 	kfree(encoded_symname);
 	if (rc || d_really_is_negative(lower_dentry))
@@ -507,7 +510,8 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode);
+	rc = vfs_mkdir(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+		       mode);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out;
 	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
@@ -541,7 +545,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 	else if (d_unhashed(lower_dentry))
 		rc = -EINVAL;
 	else
-		rc = vfs_rmdir(lower_dir_inode, lower_dentry);
+		rc = vfs_rmdir(&init_user_ns, lower_dir_inode, lower_dentry);
 	if (!rc) {
 		clear_nlink(d_inode(dentry));
 		fsstack_copy_attr_times(dir, lower_dir_inode);
@@ -563,7 +567,8 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev);
+	rc = vfs_mknod(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+		       mode, dev);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out;
 	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
@@ -621,10 +626,12 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto out_lock;
 	}
 
-	rd.old_dir	= d_inode(lower_old_dir_dentry);
-	rd.old_dentry	= lower_old_dentry;
-	rd.new_dir	= d_inode(lower_new_dir_dentry);
-	rd.new_dentry	= lower_new_dentry;
+	rd.old_mnt_userns	= &init_user_ns;
+	rd.old_dir		= d_inode(lower_old_dir_dentry);
+	rd.old_dentry		= lower_old_dentry;
+	rd.new_mnt_userns	= &init_user_ns;
+	rd.new_dir		= d_inode(lower_new_dir_dentry);
+	rd.new_dentry		= lower_new_dentry;
 	rc = vfs_rename(&rd);
 	if (rc)
 		goto out_lock;
diff --git a/fs/init.c b/fs/init.c
index 891284f8a443..e65452750fa5 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -157,8 +157,8 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
 		mode &= ~current_umask();
 	error = security_path_mknod(&path, dentry, mode, dev);
 	if (!error)
-		error = vfs_mknod(path.dentry->d_inode, dentry, mode,
-				  new_decode_dev(dev));
+		error = vfs_mknod(&init_user_ns, path.dentry->d_inode, dentry,
+				  mode, new_decode_dev(dev));
 	done_path_create(&path, dentry);
 	return error;
 }
@@ -187,8 +187,8 @@ int __init init_link(const char *oldname, const char *newname)
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
 		goto out_dput;
-	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry,
-			 NULL);
+	error = vfs_link(old_path.dentry, &init_user_ns,
+			 new_path.dentry->d_inode, new_dentry, NULL);
 out_dput:
 	done_path_create(&new_path, new_dentry);
 out:
@@ -207,7 +207,8 @@ int __init init_symlink(const char *oldname, const char *newname)
 		return PTR_ERR(dentry);
 	error = security_path_symlink(&path, dentry, oldname);
 	if (!error)
-		error = vfs_symlink(path.dentry->d_inode, dentry, oldname);
+		error = vfs_symlink(&init_user_ns, path.dentry->d_inode, dentry,
+				    oldname);
 	done_path_create(&path, dentry);
 	return error;
 }
@@ -230,7 +231,8 @@ int __init init_mkdir(const char *pathname, umode_t mode)
 		mode &= ~current_umask();
 	error = security_path_mkdir(&path, dentry, mode);
 	if (!error)
-		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+		error = vfs_mkdir(&init_user_ns, path.dentry->d_inode, dentry,
+				  mode);
 	done_path_create(&path, dentry);
 	return error;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 38ab51881247..5c9f6f8e90c4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2874,10 +2874,26 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 }
 EXPORT_SYMBOL(unlock_rename);
 
-int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool want_excl)
+/**
+ * vfs_create - create new file
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dir:	inode of @dentry
+ * @dentry:	pointer to dentry of the base directory
+ * @mode:	mode of the new file
+ * @want_excl:	whether the file must not yet exist
+ *
+ * Create a new file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+	       struct dentry *dentry, umode_t mode, bool want_excl)
 {
-	int error = may_create(&init_user_ns, dir, dentry);
+	int error = may_create(mnt_userns, dir, dentry);
 	if (error)
 		return error;
 
@@ -3353,7 +3369,23 @@ static int do_open(struct nameidata *nd,
 	return error;
 }
 
-struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
+/**
+ * vfs_tmpfile - create tmpfile
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dentry:	pointer to dentry of the base directory
+ * @mode:	mode of the new tmpfile
+ * @open_flags:	flags
+ *
+ * Create a temporary file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, umode_t mode, int open_flag)
 {
 	struct dentry *child = NULL;
 	struct inode *dir = dentry->d_inode;
@@ -3361,7 +3393,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
 	int error;
 
 	/* we want directory to be writable */
-	error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+	error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		goto out_err;
 	error = -EOPNOTSUPP;
@@ -3396,6 +3428,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 		const struct open_flags *op,
 		struct file *file)
 {
+	struct user_namespace *mnt_userns;
 	struct dentry *child;
 	struct path path;
 	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
@@ -3404,7 +3437,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	error = mnt_want_write(path.mnt);
 	if (unlikely(error))
 		goto out;
-	child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
+	mnt_userns = mnt_user_ns(path.mnt);
+	child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
 	error = PTR_ERR(child);
 	if (IS_ERR(child))
 		goto out2;
@@ -3616,10 +3650,27 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname,
 }
 EXPORT_SYMBOL(user_path_create);
 
-int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+/**
+ * vfs_mknod - create device node or file
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dir:	inode of @dentry
+ * @dentry:	pointer to dentry of the base directory
+ * @mode:	mode of the new device node or file
+ * @dev:	device number of device to create
+ *
+ * Create a device node or file.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	      struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
-	int error = may_create(&init_user_ns, dir, dentry);
+	int error = may_create(mnt_userns, dir, dentry);
 
 	if (error)
 		return error;
@@ -3666,6 +3717,7 @@ static int may_mknod(umode_t mode)
 static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
 		unsigned int dev)
 {
+	struct user_namespace *mnt_userns;
 	struct dentry *dentry;
 	struct path path;
 	int error;
@@ -3684,18 +3736,22 @@ retry:
 	error = security_path_mknod(&path, dentry, mode, dev);
 	if (error)
 		goto out;
+
+	mnt_userns = mnt_user_ns(path.mnt);
 	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
-			error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+			error = vfs_create(mnt_userns, path.dentry->d_inode,
+					   dentry, mode, true);
 			if (!error)
 				ima_post_path_mknod(dentry);
 			break;
 		case S_IFCHR: case S_IFBLK:
-			error = vfs_mknod(path.dentry->d_inode,dentry,mode,
-					new_decode_dev(dev));
+			error = vfs_mknod(mnt_userns, path.dentry->d_inode,
+					  dentry, mode, new_decode_dev(dev));
 			break;
 		case S_IFIFO: case S_IFSOCK:
-			error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
+			error = vfs_mknod(mnt_userns, path.dentry->d_inode,
+					  dentry, mode, 0);
 			break;
 	}
 out:
@@ -3718,9 +3774,25 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
 	return do_mknodat(AT_FDCWD, filename, mode, dev);
 }
 
-int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+/**
+ * vfs_mkdir - create directory
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dir:	inode of @dentry
+ * @dentry:	pointer to dentry of the base directory
+ * @mode:	mode of the new directory
+ *
+ * Create a directory.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+	      struct dentry *dentry, umode_t mode)
 {
-	int error = may_create(&init_user_ns, dir, dentry);
+	int error = may_create(mnt_userns, dir, dentry);
 	unsigned max_links = dir->i_sb->s_max_links;
 
 	if (error)
@@ -3759,8 +3831,11 @@ retry:
 	if (!IS_POSIXACL(path.dentry->d_inode))
 		mode &= ~current_umask();
 	error = security_path_mkdir(&path, dentry, mode);
-	if (!error)
-		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+	if (!error) {
+		struct user_namespace *mnt_userns;
+		mnt_userns = mnt_user_ns(path.mnt);
+		error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, mode);
+	}
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -3779,9 +3854,24 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
 	return do_mkdirat(AT_FDCWD, pathname, mode);
 }
 
-int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+/**
+ * vfs_rmdir - remove directory
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dir:	inode of @dentry
+ * @dentry:	pointer to dentry of the base directory
+ *
+ * Remove a directory.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry)
 {
-	int error = may_delete(&init_user_ns, dir, dentry, 1);
+	int error = may_delete(mnt_userns, dir, dentry, 1);
 
 	if (error)
 		return error;
@@ -3821,6 +3911,7 @@ EXPORT_SYMBOL(vfs_rmdir);
 
 long do_rmdir(int dfd, struct filename *name)
 {
+	struct user_namespace *mnt_userns;
 	int error = 0;
 	struct dentry *dentry;
 	struct path path;
@@ -3861,7 +3952,8 @@ retry:
 	error = security_path_rmdir(&path, dentry);
 	if (error)
 		goto exit3;
-	error = vfs_rmdir(path.dentry->d_inode, dentry);
+	mnt_userns = mnt_user_ns(path.mnt);
+	error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
 exit3:
 	dput(dentry);
 exit2:
@@ -3884,6 +3976,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 
 /**
  * vfs_unlink - unlink a filesystem object
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dir:	parent directory
  * @dentry:	victim
  * @delegated_inode: returns victim inode, if the inode is delegated.
@@ -3899,11 +3992,18 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
  */
-int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
+	       struct dentry *dentry, struct inode **delegated_inode)
 {
 	struct inode *target = dentry->d_inode;
-	int error = may_delete(&init_user_ns, dir, dentry, 0);
+	int error = may_delete(mnt_userns, dir, dentry, 0);
 
 	if (error)
 		return error;
@@ -3974,6 +4074,8 @@ retry_deleg:
 	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
+		struct user_namespace *mnt_userns;
+
 		/* Why not before? Because we want correct error value */
 		if (last.name[last.len])
 			goto slashes;
@@ -3984,7 +4086,8 @@ retry_deleg:
 		error = security_path_unlink(&path, dentry);
 		if (error)
 			goto exit2;
-		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
+		mnt_userns = mnt_user_ns(path.mnt);
+		error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, &delegated_inode);
 exit2:
 		dput(dentry);
 	}
@@ -4033,9 +4136,25 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 	return do_unlinkat(AT_FDCWD, getname(pathname));
 }
 
-int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+/**
+ * vfs_symlink - create symlink
+ * @mnt_userns:	user namespace of the mount the inode was found from
+ * @dir:	inode of @dentry
+ * @dentry:	pointer to dentry of the base directory
+ * @oldname:	name of the file to link to
+ *
+ * Create a symlink.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
+ */
+int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		struct dentry *dentry, const char *oldname)
 {
-	int error = may_create(&init_user_ns, dir, dentry);
+	int error = may_create(mnt_userns, dir, dentry);
 
 	if (error)
 		return error;
@@ -4073,8 +4192,13 @@ retry:
 		goto out_putname;
 
 	error = security_path_symlink(&path, dentry, from->name);
-	if (!error)
-		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+	if (!error) {
+		struct user_namespace *mnt_userns;
+
+		mnt_userns = mnt_user_ns(path.mnt);
+		error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
+				    from->name);
+	}
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -4099,6 +4223,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
 /**
  * vfs_link - create a new link
  * @old_dentry:	object to be linked
+ * @mnt_userns:	the user namespace of the mount
  * @dir:	new parent
  * @new_dentry:	where to create the new link
  * @delegated_inode: returns inode needing a delegation break
@@ -4114,8 +4239,16 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.
+ *
+ * If the inode has been found through an idmapped mount the user namespace of
+ * the vfsmount must be passed through @mnt_userns. This function will then take
+ * care to map the inode according to @mnt_userns before checking permissions.
+ * On non-idmapped mounts or if permission checking is to be performed on the
+ * raw inode simply passs init_user_ns.
  */
-int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
+	     struct inode *dir, struct dentry *new_dentry,
+	     struct inode **delegated_inode)
 {
 	struct inode *inode = old_dentry->d_inode;
 	unsigned max_links = dir->i_sb->s_max_links;
@@ -4124,7 +4257,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	if (!inode)
 		return -ENOENT;
 
-	error = may_create(&init_user_ns, dir, new_dentry);
+	error = may_create(mnt_userns, dir, new_dentry);
 	if (error)
 		return error;
 
@@ -4141,7 +4274,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	 * be writen back improperly if their true value is unknown to
 	 * the vfs.
 	 */
-	if (HAS_UNMAPPED_ID(&init_user_ns, inode))
+	if (HAS_UNMAPPED_ID(mnt_userns, inode))
 		return -EPERM;
 	if (!dir->i_op->link)
 		return -EPERM;
@@ -4188,6 +4321,7 @@ EXPORT_SYMBOL(vfs_link);
 static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
 	      const char __user *newname, int flags)
 {
+	struct user_namespace *mnt_userns;
 	struct dentry *new_dentry;
 	struct path old_path, new_path;
 	struct inode *delegated_inode = NULL;
@@ -4229,7 +4363,9 @@ retry:
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
 		goto out_dput;
-	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+	mnt_userns = mnt_user_ns(new_path.mnt);
+	error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
+			 new_dentry, &delegated_inode);
 out_dput:
 	done_path_create(&new_path, new_dentry);
 	if (delegated_inode) {
@@ -4263,12 +4399,14 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
 
 /**
  * vfs_rename - rename a filesystem object
- * @old_dir:	parent of source
- * @old_dentry:	source
- * @new_dir:	parent of destination
- * @new_dentry:	destination
- * @delegated_inode: returns an inode needing a delegation break
- * @flags:	rename flags
+ * @old_mnt_userns:	old user namespace of the mount the inode was found from
+ * @old_dir:		parent of source
+ * @old_dentry:		source
+ * @new_mnt_userns:	new user namespace of the mount the inode was found from
+ * @new_dir:		parent of destination
+ * @new_dentry:		destination
+ * @delegated_inode:	returns an inode needing a delegation break
+ * @flags:		rename flags
  *
  * The caller must hold multiple mutexes--see lock_rename()).
  *
@@ -4314,7 +4452,6 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
 int vfs_rename(struct renamedata *rd)
 {
 	int error;
-	struct user_namespace *mnt_userns = &init_user_ns;
 	struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
 	struct dentry *old_dentry = rd->old_dentry;
 	struct dentry *new_dentry = rd->new_dentry;
@@ -4330,19 +4467,21 @@ int vfs_rename(struct renamedata *rd)
 	if (source == target)
 		return 0;
 
-	error = may_delete(mnt_userns, old_dir, old_dentry, is_dir);
+	error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
 	if (error)
 		return error;
 
 	if (!target) {
-		error = may_create(mnt_userns, new_dir, new_dentry);
+		error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
 	} else {
 		new_is_dir = d_is_dir(new_dentry);
 
 		if (!(flags & RENAME_EXCHANGE))
-			error = may_delete(mnt_userns, new_dir, new_dentry, is_dir);
+			error = may_delete(rd->new_mnt_userns, new_dir,
+					   new_dentry, is_dir);
 		else
-			error = may_delete(mnt_userns, new_dir, new_dentry, new_is_dir);
+			error = may_delete(rd->new_mnt_userns, new_dir,
+					   new_dentry, new_is_dir);
 	}
 	if (error)
 		return error;
@@ -4356,13 +4495,13 @@ int vfs_rename(struct renamedata *rd)
 	 */
 	if (new_dir != old_dir) {
 		if (is_dir) {
-			error = inode_permission(&init_user_ns, source,
+			error = inode_permission(rd->old_mnt_userns, source,
 						 MAY_WRITE);
 			if (error)
 				return error;
 		}
 		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
-			error = inode_permission(&init_user_ns, target,
+			error = inode_permission(rd->new_mnt_userns, target,
 						 MAY_WRITE);
 			if (error)
 				return error;
@@ -4552,8 +4691,10 @@ retry_deleg:
 
 	rd.old_dir	   = old_path.dentry->d_inode;
 	rd.old_dentry	   = old_dentry;
+	rd.old_mnt_userns  = mnt_user_ns(old_path.mnt);
 	rd.new_dir	   = new_path.dentry->d_inode;
 	rd.new_dentry	   = new_dentry;
+	rd.new_mnt_userns  = mnt_user_ns(new_path.mnt);
 	rd.delegated_inode = &delegated_inode;
 	rd.flags	   = flags;
 	error = vfs_rename(&rd);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 186fa2c2c6ba..891395c6c7d3 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -233,7 +233,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		 * as well be forgiving and just succeed silently.
 		 */
 		goto out_put;
-	status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU);
+	status = vfs_mkdir(&init_user_ns, d_inode(dir), dentry, S_IRWXU);
 out_put:
 	dput(dentry);
 out_unlock:
@@ -353,7 +353,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
 	status = -ENOENT;
 	if (d_really_is_negative(dentry))
 		goto out;
-	status = vfs_rmdir(d_inode(dir), dentry);
+	status = vfs_rmdir(&init_user_ns, d_inode(dir), dentry);
 out:
 	dput(dentry);
 out_unlock:
@@ -443,7 +443,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 	if (nfs4_has_reclaimed_state(name, nn))
 		goto out_free;
 
-	status = vfs_rmdir(d_inode(parent), child);
+	status = vfs_rmdir(&init_user_ns, d_inode(parent), child);
 	if (status)
 		printk("failed to remove client recovery directory %pd\n",
 				child);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f7d83ff2b44e..fab873178140 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1255,12 +1255,12 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	host_err = 0;
 	switch (type) {
 	case S_IFREG:
-		host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+		host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
 		if (!host_err)
 			nfsd_check_ignore_resizing(iap);
 		break;
 	case S_IFDIR:
-		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+		host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode);
 		if (!host_err && unlikely(d_unhashed(dchild))) {
 			struct dentry *d;
 			d = lookup_one_len(dchild->d_name.name,
@@ -1288,7 +1288,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	case S_IFBLK:
 	case S_IFIFO:
 	case S_IFSOCK:
-		host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
+		host_err = vfs_mknod(&init_user_ns, dirp, dchild,
+				     iap->ia_mode, rdev);
 		break;
 	default:
 		printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
@@ -1486,7 +1487,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (!IS_POSIXACL(dirp))
 		iap->ia_mode &= ~current_umask();
 
-	host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+	host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
 	if (host_err < 0) {
 		fh_drop_write(fhp);
 		goto out_nfserr;
@@ -1610,7 +1611,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (IS_ERR(dnew))
 		goto out_nfserr;
 
-	host_err = vfs_symlink(d_inode(dentry), dnew, path);
+	host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path);
 	err = nfserrno(host_err);
 	if (!err)
 		err = nfserrno(commit_metadata(fhp));
@@ -1678,7 +1679,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	err = nfserr_noent;
 	if (d_really_is_negative(dold))
 		goto out_dput;
-	host_err = vfs_link(dold, dirp, dnew, NULL);
+	host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL);
 	if (!host_err) {
 		err = nfserrno(commit_metadata(ffhp));
 		if (!err)
@@ -1799,8 +1800,10 @@ retry:
 		goto out_dput_old;
 	} else {
 		struct renamedata rd = {
+			.old_mnt_userns	= &init_user_ns,
 			.old_dir	= fdir,
 			.old_dentry	= odentry,
+			.new_mnt_userns	= &init_user_ns,
 			.new_dir	= tdir,
 			.new_dentry	= ndentry,
 		};
@@ -1891,9 +1894,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	if (type != S_IFDIR) {
 		if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
 			nfsd_close_cached_files(rdentry);
-		host_err = vfs_unlink(dirp, rdentry, NULL);
+		host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
 	} else {
-		host_err = vfs_rmdir(dirp, rdentry);
+		host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
 	}
 
 	if (!host_err)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index d75c96cb18c3..6904cc2ed7bb 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -821,9 +821,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
 		goto out_dput_upper;
 
 	if (is_dir)
-		err = vfs_rmdir(dir, upper);
+		err = vfs_rmdir(&init_user_ns, dir, upper);
 	else
-		err = vfs_unlink(dir, upper, NULL);
+		err = vfs_unlink(&init_user_ns, dir, upper, NULL);
 	ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry));
 
 	/*
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 426899681df7..5e9eb46e741a 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -123,7 +123,7 @@ static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
 
 static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	int err = vfs_rmdir(dir, dentry);
+	int err = vfs_rmdir(&init_user_ns, dir, dentry);
 
 	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
 	return err;
@@ -131,7 +131,7 @@ static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
 
 static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
 {
-	int err = vfs_unlink(dir, dentry, NULL);
+	int err = vfs_unlink(&init_user_ns, dir, dentry, NULL);
 
 	pr_debug("unlink(%pd2) = %i\n", dentry, err);
 	return err;
@@ -140,7 +140,7 @@ static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
 static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
 			      struct dentry *new_dentry)
 {
-	int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+	int err = vfs_link(old_dentry, &init_user_ns, dir, new_dentry, NULL);
 
 	pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
 	return err;
@@ -149,7 +149,7 @@ static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
 static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
 				umode_t mode)
 {
-	int err = vfs_create(dir, dentry, mode, true);
+	int err = vfs_create(&init_user_ns, dir, dentry, mode, true);
 
 	pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
 	return err;
@@ -158,7 +158,7 @@ static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
 static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
 			       umode_t mode)
 {
-	int err = vfs_mkdir(dir, dentry, mode);
+	int err = vfs_mkdir(&init_user_ns, dir, dentry, mode);
 	pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
 	return err;
 }
@@ -166,7 +166,7 @@ static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
 static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
 			       umode_t mode, dev_t dev)
 {
-	int err = vfs_mknod(dir, dentry, mode, dev);
+	int err = vfs_mknod(&init_user_ns, dir, dentry, mode, dev);
 
 	pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
 	return err;
@@ -175,7 +175,7 @@ static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
 static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
 				 const char *oldname)
 {
-	int err = vfs_symlink(dir, dentry, oldname);
+	int err = vfs_symlink(&init_user_ns, dir, dentry, oldname);
 
 	pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
 	return err;
@@ -215,8 +215,10 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
 {
 	int err;
 	struct renamedata rd = {
+		.old_mnt_userns	= &init_user_ns,
 		.old_dir 	= olddir,
 		.old_dentry 	= olddentry,
+		.new_mnt_userns	= &init_user_ns,
 		.new_dir 	= newdir,
 		.new_dentry 	= newdentry,
 		.flags 		= flags,
@@ -233,14 +235,14 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
 
 static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
 {
-	int err = vfs_whiteout(dir, dentry);
+	int err = vfs_whiteout(&init_user_ns, dir, dentry);
 	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
 	return err;
 }
 
 static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode)
 {
-	struct dentry *ret = vfs_tmpfile(dentry, mode, 0);
+	struct dentry *ret = vfs_tmpfile(&init_user_ns, dentry, mode, 0);
 	int err = PTR_ERR_OR_ZERO(ret);
 
 	pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 430e457f67f1..29d7b2fe7de4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1768,17 +1768,25 @@ bool inode_owner_or_capable(struct user_namespace *mnt_userns,
 /*
  * VFS helper functions..
  */
-extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
-extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
-extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
-extern int vfs_symlink(struct inode *, struct dentry *, const char *);
-extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
-extern int vfs_rmdir(struct inode *, struct dentry *);
-extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
+int vfs_create(struct user_namespace *, struct inode *,
+	       struct dentry *, umode_t, bool);
+int vfs_mkdir(struct user_namespace *, struct inode *,
+	      struct dentry *, umode_t);
+int vfs_mknod(struct user_namespace *, struct inode *, struct dentry *,
+              umode_t, dev_t);
+int vfs_symlink(struct user_namespace *, struct inode *,
+		struct dentry *, const char *);
+int vfs_link(struct dentry *, struct user_namespace *, struct inode *,
+	     struct dentry *, struct inode **);
+int vfs_rmdir(struct user_namespace *, struct inode *, struct dentry *);
+int vfs_unlink(struct user_namespace *, struct inode *, struct dentry *,
+	       struct inode **);
 
 struct renamedata {
+	struct user_namespace *old_mnt_userns;
 	struct inode *old_dir;
 	struct dentry *old_dentry;
+	struct user_namespace *new_mnt_userns;
 	struct inode *new_dir;
 	struct dentry *new_dentry;
 	struct inode **delegated_inode;
@@ -1787,13 +1795,15 @@ struct renamedata {
 
 int vfs_rename(struct renamedata *);
 
-static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+static inline int vfs_whiteout(struct user_namespace *mnt_userns,
+			       struct inode *dir, struct dentry *dentry)
 {
-	return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+	return vfs_mknod(mnt_userns, dir, dentry, S_IFCHR | WHITEOUT_MODE,
+			 WHITEOUT_DEV);
 }
 
-extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
-				  int open_flag);
+struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, umode_t mode, int open_flag);
 
 int vfs_mkobj(struct dentry *, umode_t,
 		int (*f)(struct dentry *, umode_t, void *),
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 693f01fe1216..fcd56e077733 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -965,7 +965,8 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 		err = -ENOENT;
 	} else {
 		ihold(inode);
-		err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
+		err = vfs_unlink(&init_user_ns, d_inode(dentry->d_parent),
+				 dentry, NULL);
 	}
 	dput(dentry);
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 18453d15dddf..9a1f3c04402e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -996,7 +996,8 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 	 */
 	err = security_path_mknod(&path, dentry, mode, 0);
 	if (!err) {
-		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
+		err = vfs_mknod(&init_user_ns, d_inode(path.dentry), dentry,
+				mode, 0);
 		if (!err) {
 			res->mnt = mntget(path.mnt);
 			res->dentry = dget(dentry);
-- 
cgit v1.2.3


From 643fe55a0679ae5582a1a2a1df86dc240292cd1b Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:34 +0100
Subject: open: handle idmapped mounts in do_truncate()

When truncating files the vfs will verify that the caller is privileged
over the inode. Extend it to handle idmapped mounts. If the inode is
accessed through an idmapped mount it is mapped according to the mount's
user namespace. Afterwards the permissions checks are identical to
non-idmapped mounts. If the initial user namespace is passed nothing
changes so non-idmapped mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-16-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/coredump.c      | 10 +++++++---
 fs/inode.c         |  7 ++++---
 fs/namei.c         |  2 +-
 fs/open.c          | 16 +++++++++-------
 include/linux/fs.h |  4 ++--
 5 files changed, 23 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/coredump.c b/fs/coredump.c
index a2f6ecc8e345..ae778937a1ff 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -703,6 +703,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 			goto close_fail;
 		}
 	} else {
+		struct user_namespace *mnt_userns;
 		struct inode *inode;
 		int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
 				 O_LARGEFILE | O_EXCL;
@@ -780,13 +781,15 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		 * a process dumps core while its cwd is e.g. on a vfat
 		 * filesystem.
 		 */
-		if (!uid_eq(inode->i_uid, current_fsuid()))
+		mnt_userns = file_mnt_user_ns(cprm.file);
+		if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), current_fsuid()))
 			goto close_fail;
 		if ((inode->i_mode & 0677) != 0600)
 			goto close_fail;
 		if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
 			goto close_fail;
-		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+		if (do_truncate(mnt_userns, cprm.file->f_path.dentry,
+				0, 0, cprm.file))
 			goto close_fail;
 	}
 
@@ -931,7 +934,8 @@ void dump_truncate(struct coredump_params *cprm)
 	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
 		offset = file->f_op->llseek(file, 0, SEEK_CUR);
 		if (i_size_read(file->f_mapping->host) < offset)
-			do_truncate(file->f_path.dentry, offset, 0, file);
+			do_truncate(file_mnt_user_ns(file), file->f_path.dentry,
+				    offset, 0, file);
 	}
 }
 EXPORT_SYMBOL(dump_truncate);
diff --git a/fs/inode.c b/fs/inode.c
index 46116ef44c9f..08151968c9ef 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1903,7 +1903,8 @@ int dentry_needs_remove_privs(struct dentry *dentry)
 	return mask;
 }
 
-static int __remove_privs(struct dentry *dentry, int kill)
+static int __remove_privs(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, int kill)
 {
 	struct iattr newattrs;
 
@@ -1912,7 +1913,7 @@ static int __remove_privs(struct dentry *dentry, int kill)
 	 * Note we call this on write, so notify_change will not
 	 * encounter any conflicting delegations:
 	 */
-	return notify_change(&init_user_ns, dentry, &newattrs, NULL);
+	return notify_change(mnt_userns, dentry, &newattrs, NULL);
 }
 
 /*
@@ -1939,7 +1940,7 @@ int file_remove_privs(struct file *file)
 	if (kill < 0)
 		return kill;
 	if (kill)
-		error = __remove_privs(dentry, kill);
+		error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
 	if (!error)
 		inode_has_no_xattr(inode);
 
diff --git a/fs/namei.c b/fs/namei.c
index 5c9f6f8e90c4..c8c083daf368 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3009,7 +3009,7 @@ static int handle_truncate(struct file *filp)
 	if (!error)
 		error = security_path_truncate(path);
 	if (!error) {
-		error = do_truncate(path->dentry, 0,
+		error = do_truncate(&init_user_ns, path->dentry, 0,
 				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
 				    filp);
 	}
diff --git a/fs/open.c b/fs/open.c
index c3e4dc43dd8d..8b3f3eb652d0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -35,8 +35,8 @@
 
 #include "internal.h"
 
-int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
-	struct file *filp)
+int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
+		loff_t length, unsigned int time_attrs, struct file *filp)
 {
 	int ret;
 	struct iattr newattrs;
@@ -61,13 +61,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 
 	inode_lock(dentry->d_inode);
 	/* Note any delegations or leases have already been broken: */
-	ret = notify_change(&init_user_ns, dentry, &newattrs, NULL);
+	ret = notify_change(mnt_userns, dentry, &newattrs, NULL);
 	inode_unlock(dentry->d_inode);
 	return ret;
 }
 
 long vfs_truncate(const struct path *path, loff_t length)
 {
+	struct user_namespace *mnt_userns;
 	struct inode *inode;
 	long error;
 
@@ -83,7 +84,8 @@ long vfs_truncate(const struct path *path, loff_t length)
 	if (error)
 		goto out;
 
-	error = inode_permission(&init_user_ns, inode, MAY_WRITE);
+	mnt_userns = mnt_user_ns(path->mnt);
+	error = inode_permission(mnt_userns, inode, MAY_WRITE);
 	if (error)
 		goto mnt_drop_write_and_out;
 
@@ -107,7 +109,7 @@ long vfs_truncate(const struct path *path, loff_t length)
 	if (!error)
 		error = security_path_truncate(path);
 	if (!error)
-		error = do_truncate(path->dentry, length, 0, NULL);
+		error = do_truncate(mnt_userns, path->dentry, length, 0, NULL);
 
 put_write_and_out:
 	put_write_access(inode);
@@ -186,13 +188,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 	/* Check IS_APPEND on real upper inode */
 	if (IS_APPEND(file_inode(f.file)))
 		goto out_putf;
-
 	sb_start_write(inode->i_sb);
 	error = locks_verify_truncate(inode, f.file, length);
 	if (!error)
 		error = security_path_truncate(&f.file->f_path);
 	if (!error)
-		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
+		error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
+				    ATTR_MTIME | ATTR_CTIME, f.file);
 	sb_end_write(inode->i_sb);
 out_putf:
 	fdput(f);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29d7b2fe7de4..f0601cca1930 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2593,8 +2593,8 @@ static inline struct user_namespace *file_mnt_user_ns(struct file *file)
 	return mnt_user_ns(file->f_path.mnt);
 }
 extern long vfs_truncate(const struct path *, loff_t);
-extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
-		       struct file *filp);
+int do_truncate(struct user_namespace *, struct dentry *, loff_t start,
+		unsigned int time_attrs, struct file *filp);
 extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
 			loff_t len);
 extern long do_sys_open(int dfd, const char __user *filename, int flags,
-- 
cgit v1.2.3


From 549c7297717c32ee53f156cd949e055e601f67bb Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:43 +0100
Subject: fs: make helpers idmap mount aware

Extend some inode methods with an additional user namespace argument. A
filesystem that is aware of idmapped mounts will receive the user
namespace the mount has been marked with. This can be used for
additional permission checking and also to enable filesystems to
translate between uids and gids if they need to. We have implemented all
relevant helpers in earlier patches.

As requested we simply extend the exisiting inode method instead of
introducing new ones. This is a little more code churn but it's mostly
mechanical and doesnt't leave us with additional inode methods.

Link: https://lore.kernel.org/r/20210121131959.646623-25-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 Documentation/filesystems/vfs.rst         | 19 ++++++-----
 arch/powerpc/platforms/cell/spufs/inode.c |  3 +-
 drivers/android/binderfs.c                |  6 ++--
 fs/9p/acl.c                               |  4 +--
 fs/9p/v9fs.h                              |  3 +-
 fs/9p/v9fs_vfs.h                          |  3 +-
 fs/9p/vfs_inode.c                         | 26 ++++++++-------
 fs/9p/vfs_inode_dotl.c                    | 31 +++++++++--------
 fs/adfs/adfs.h                            |  3 +-
 fs/adfs/inode.c                           |  3 +-
 fs/affs/affs.h                            | 24 +++++++++-----
 fs/affs/inode.c                           |  3 +-
 fs/affs/namei.c                           | 15 +++++----
 fs/afs/dir.c                              | 34 ++++++++++---------
 fs/afs/inode.c                            |  7 ++--
 fs/afs/internal.h                         |  7 ++--
 fs/afs/security.c                         |  3 +-
 fs/attr.c                                 |  4 +--
 fs/autofs/root.c                          | 17 ++++++----
 fs/bad_inode.c                            | 36 ++++++++++++--------
 fs/bfs/dir.c                              | 10 +++---
 fs/btrfs/acl.c                            |  3 +-
 fs/btrfs/ctree.h                          |  3 +-
 fs/btrfs/inode.c                          | 33 +++++++++++--------
 fs/ceph/acl.c                             |  3 +-
 fs/ceph/dir.c                             | 23 ++++++-------
 fs/ceph/inode.c                           | 10 +++---
 fs/ceph/super.h                           | 12 ++++---
 fs/cifs/cifsfs.c                          |  3 +-
 fs/cifs/cifsfs.h                          | 25 ++++++++------
 fs/cifs/dir.c                             |  8 ++---
 fs/cifs/inode.c                           | 16 +++++----
 fs/cifs/link.c                            |  3 +-
 fs/coda/coda_linux.h                      |  8 +++--
 fs/coda/dir.c                             | 18 ++++++----
 fs/coda/inode.c                           |  7 ++--
 fs/coda/pioctl.c                          |  6 ++--
 fs/configfs/configfs_internal.h           |  6 ++--
 fs/configfs/dir.c                         |  3 +-
 fs/configfs/inode.c                       |  5 +--
 fs/configfs/symlink.c                     |  3 +-
 fs/debugfs/inode.c                        |  9 ++---
 fs/ecryptfs/inode.c                       | 30 ++++++++++-------
 fs/efivarfs/inode.c                       |  4 +--
 fs/erofs/inode.c                          |  5 +--
 fs/erofs/internal.h                       |  5 +--
 fs/exfat/exfat_fs.h                       |  8 +++--
 fs/exfat/file.c                           |  8 +++--
 fs/exfat/namei.c                          | 14 ++++----
 fs/ext2/acl.c                             |  3 +-
 fs/ext2/acl.h                             |  3 +-
 fs/ext2/ext2.h                            |  5 +--
 fs/ext2/inode.c                           |  7 ++--
 fs/ext2/namei.c                           | 22 ++++++++-----
 fs/ext4/acl.c                             |  3 +-
 fs/ext4/acl.h                             |  3 +-
 fs/ext4/ext4.h                            |  9 +++--
 fs/ext4/inode.c                           | 12 ++++---
 fs/ext4/namei.c                           | 19 ++++++-----
 fs/f2fs/acl.c                             |  3 +-
 fs/f2fs/acl.h                             |  3 +-
 fs/f2fs/f2fs.h                            |  7 ++--
 fs/f2fs/file.c                            |  7 ++--
 fs/f2fs/namei.c                           | 21 +++++++-----
 fs/fat/fat.h                              |  6 ++--
 fs/fat/file.c                             |  9 ++---
 fs/fat/namei_msdos.c                      | 10 +++---
 fs/fat/namei_vfat.c                       | 13 ++++----
 fs/fuse/acl.c                             |  3 +-
 fs/fuse/dir.c                             | 37 ++++++++++++---------
 fs/fuse/fuse_i.h                          |  4 +--
 fs/gfs2/acl.c                             |  3 +-
 fs/gfs2/acl.h                             |  3 +-
 fs/gfs2/file.c                            |  2 +-
 fs/gfs2/inode.c                           | 53 +++++++++++++++++------------
 fs/gfs2/inode.h                           |  3 +-
 fs/hfs/dir.c                              | 13 ++++----
 fs/hfs/hfs_fs.h                           |  3 +-
 fs/hfs/inode.c                            |  6 ++--
 fs/hfsplus/dir.c                          | 22 +++++++------
 fs/hfsplus/hfsplus_fs.h                   |  5 +--
 fs/hfsplus/inode.c                        |  8 +++--
 fs/hostfs/hostfs_kern.c                   | 23 ++++++++-----
 fs/hpfs/hpfs_fn.h                         |  2 +-
 fs/hpfs/inode.c                           |  3 +-
 fs/hpfs/namei.c                           | 20 ++++++-----
 fs/hugetlbfs/inode.c                      | 29 +++++++++-------
 fs/jffs2/acl.c                            |  3 +-
 fs/jffs2/acl.h                            |  3 +-
 fs/jffs2/dir.c                            | 33 +++++++++++--------
 fs/jffs2/fs.c                             |  3 +-
 fs/jffs2/os-linux.h                       |  2 +-
 fs/jfs/acl.c                              |  3 +-
 fs/jfs/file.c                             |  3 +-
 fs/jfs/jfs_acl.h                          |  3 +-
 fs/jfs/jfs_inode.h                        |  2 +-
 fs/jfs/namei.c                            | 21 ++++++------
 fs/kernfs/dir.c                           |  6 ++--
 fs/kernfs/inode.c                         |  9 +++--
 fs/kernfs/kernfs-internal.h               |  9 +++--
 fs/libfs.c                                | 31 +++++++++--------
 fs/minix/file.c                           |  3 +-
 fs/minix/inode.c                          |  4 +--
 fs/minix/minix.h                          |  3 +-
 fs/minix/namei.c                          | 24 ++++++++------
 fs/namei.c                                | 55 ++++++++++++++++++-------------
 fs/nfs/dir.c                              | 23 ++++++++-----
 fs/nfs/inode.c                            |  7 ++--
 fs/nfs/internal.h                         | 14 +++++---
 fs/nfs/namespace.c                        | 13 +++++---
 fs/nfs/nfs3_fs.h                          |  3 +-
 fs/nfs/nfs3acl.c                          |  3 +-
 fs/nilfs2/inode.c                         |  6 ++--
 fs/nilfs2/namei.c                         | 19 ++++++-----
 fs/nilfs2/nilfs.h                         |  6 ++--
 fs/ntfs/inode.c                           |  4 ++-
 fs/ntfs/inode.h                           |  3 +-
 fs/ocfs2/acl.c                            |  3 +-
 fs/ocfs2/acl.h                            |  3 +-
 fs/ocfs2/dlmfs/dlmfs.c                    |  9 +++--
 fs/ocfs2/file.c                           | 10 +++---
 fs/ocfs2/file.h                           | 11 ++++---
 fs/ocfs2/namei.c                          | 19 +++++++----
 fs/omfs/dir.c                             | 13 ++++----
 fs/omfs/file.c                            |  3 +-
 fs/orangefs/acl.c                         |  3 +-
 fs/orangefs/inode.c                       | 10 +++---
 fs/orangefs/namei.c                       | 12 ++++---
 fs/orangefs/orangefs-kernel.h             | 13 +++++---
 fs/overlayfs/dir.c                        | 21 ++++++------
 fs/overlayfs/inode.c                      | 10 +++---
 fs/overlayfs/overlayfs.h                  | 10 +++---
 fs/overlayfs/super.c                      |  2 +-
 fs/posix_acl.c                            |  9 ++---
 fs/proc/base.c                            | 16 +++++----
 fs/proc/fd.c                              |  3 +-
 fs/proc/fd.h                              |  3 +-
 fs/proc/generic.c                         |  6 ++--
 fs/proc/internal.h                        |  6 ++--
 fs/proc/proc_net.c                        |  3 +-
 fs/proc/proc_sysctl.c                     |  9 +++--
 fs/proc/root.c                            |  3 +-
 fs/ramfs/file-nommu.c                     |  5 +--
 fs/ramfs/inode.c                          | 16 +++++----
 fs/reiserfs/acl.h                         |  3 +-
 fs/reiserfs/inode.c                       |  3 +-
 fs/reiserfs/namei.c                       | 19 ++++++-----
 fs/reiserfs/reiserfs.h                    |  3 +-
 fs/reiserfs/xattr.c                       | 11 ++++---
 fs/reiserfs/xattr.h                       |  3 +-
 fs/reiserfs/xattr_acl.c                   |  3 +-
 fs/stat.c                                 |  8 +++--
 fs/sysv/file.c                            |  3 +-
 fs/sysv/itree.c                           |  4 +--
 fs/sysv/namei.c                           | 21 +++++++-----
 fs/sysv/sysv.h                            |  3 +-
 fs/tracefs/inode.c                        |  4 ++-
 fs/ubifs/dir.c                            | 26 ++++++++-------
 fs/ubifs/file.c                           |  3 +-
 fs/ubifs/ubifs.h                          |  5 +--
 fs/udf/file.c                             |  3 +-
 fs/udf/namei.c                            | 24 +++++++-------
 fs/udf/symlink.c                          |  5 +--
 fs/ufs/inode.c                            |  3 +-
 fs/ufs/namei.c                            | 19 ++++++-----
 fs/ufs/ufs.h                              |  3 +-
 fs/vboxsf/dir.c                           | 12 ++++---
 fs/vboxsf/utils.c                         |  7 ++--
 fs/vboxsf/vfsmod.h                        |  8 +++--
 fs/xfs/xfs_acl.c                          |  3 +-
 fs/xfs/xfs_acl.h                          |  3 +-
 fs/xfs/xfs_iops.c                         | 52 ++++++++++++++++-------------
 fs/zonefs/super.c                         |  3 +-
 include/linux/fs.h                        | 39 ++++++++++++++--------
 include/linux/nfs_fs.h                    |  7 ++--
 include/linux/posix_acl.h                 |  3 +-
 ipc/mqueue.c                              |  4 +--
 kernel/bpf/inode.c                        |  7 ++--
 mm/shmem.c                                | 39 ++++++++++++++--------
 net/socket.c                              |  5 +--
 security/apparmor/apparmorfs.c            |  3 +-
 security/integrity/evm/evm_secfs.c        |  2 +-
 182 files changed, 1121 insertions(+), 756 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index ca52c82e5bb5..98290ef311ca 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -415,28 +415,29 @@ As of kernel 2.6.22, the following members are defined:
 .. code-block:: c
 
 	struct inode_operations {
-		int (*create) (struct inode *,struct dentry *, umode_t, bool);
+		int (*create) (struct user_namespace *, struct inode *,struct dentry *, umode_t, bool);
 		struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
 		int (*link) (struct dentry *,struct inode *,struct dentry *);
 		int (*unlink) (struct inode *,struct dentry *);
-		int (*symlink) (struct inode *,struct dentry *,const char *);
-		int (*mkdir) (struct inode *,struct dentry *,umode_t);
+		int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,const char *);
+		int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,umode_t);
 		int (*rmdir) (struct inode *,struct dentry *);
-		int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
-		int (*rename) (struct inode *, struct dentry *,
+		int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,umode_t,dev_t);
+		int (*rename) (struct user_namespace *, struct inode *, struct dentry *,
 			       struct inode *, struct dentry *, unsigned int);
 		int (*readlink) (struct dentry *, char __user *,int);
 		const char *(*get_link) (struct dentry *, struct inode *,
 					 struct delayed_call *);
-		int (*permission) (struct inode *, int);
+		int (*permission) (struct user_namespace *, struct inode *, int);
 		int (*get_acl)(struct inode *, int);
-		int (*setattr) (struct dentry *, struct iattr *);
-		int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+		int (*setattr) (struct user_namespace *, struct dentry *, struct iattr *);
+		int (*getattr) (struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int);
 		ssize_t (*listxattr) (struct dentry *, char *, size_t);
 		void (*update_time)(struct inode *, struct timespec *, int);
 		int (*atomic_open)(struct inode *, struct dentry *, struct file *,
 				   unsigned open_flag, umode_t create_mode);
-		int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+		int (*tmpfile) (struct user_namespace *, struct inode *, struct dentry *, umode_t);
+	        int (*set_acl)(struct user_namespace *, struct inode *, struct posix_acl *, int);
 	};
 
 Again, all methods are called without any locks being held, unless
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 3de526eb2275..b83a3670bd74 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -91,7 +91,8 @@ out:
 }
 
 static int
-spufs_setattr(struct dentry *dentry, struct iattr *attr)
+spufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+	      struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 
diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
index 7b4f154f07e6..e80ba93c62a9 100644
--- a/drivers/android/binderfs.c
+++ b/drivers/android/binderfs.c
@@ -355,7 +355,8 @@ static inline bool is_binderfs_control_device(const struct dentry *dentry)
 	return info->control_dentry == dentry;
 }
 
-static int binderfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int binderfs_rename(struct user_namespace *mnt_userns,
+			   struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry,
 			   unsigned int flags)
 {
@@ -363,7 +364,8 @@ static int binderfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	    is_binderfs_control_device(new_dentry))
 		return -EPERM;
 
-	return simple_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+	return simple_rename(&init_user_ns, old_dir, old_dentry, new_dir,
+			     new_dentry, flags);
 }
 
 static int binderfs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 1c14f18a6ec9..bb1b286c49ae 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -280,7 +280,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 			struct iattr iattr = { 0 };
 			struct posix_acl *old_acl = acl;
 
-			retval = posix_acl_update_mode(mnt_userns, inode,
+			retval = posix_acl_update_mode(&init_user_ns, inode,
 						       &iattr.ia_mode, &acl);
 			if (retval)
 				goto err_out;
@@ -299,7 +299,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 			 * What is the following setxattr update the
 			 * mode ?
 			 */
-			v9fs_vfs_setattr_dotl(dentry, &iattr);
+			v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr);
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 7b763776306e..4ca56c5dd637 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -135,7 +135,8 @@ extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 			unsigned int flags);
 extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
 extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
-extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+extern int v9fs_vfs_rename(struct user_namespace *mnt_userns,
+			   struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry,
 			   unsigned int flags);
 extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index fd2a2b040250..d44ade76966a 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -59,7 +59,8 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
 int v9fs_uflags2omode(int uflags, int extended);
 
 void v9fs_blank_wstat(struct p9_wstat *wstat);
-int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
+int v9fs_vfs_setattr_dotl(struct user_namespace *, struct dentry *,
+			  struct iattr *);
 int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
 			 int datasync);
 int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c21b146c8d91..648eb4c4cf7f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -676,8 +676,8 @@ error:
  */
 
 static int
-v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool excl)
+v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
 	u32 perm = unixmode2p9mode(v9ses, mode);
@@ -702,7 +702,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
  *
  */
 
-static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode)
 {
 	int err;
 	u32 perm;
@@ -907,9 +908,9 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
  */
 
 int
-v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		struct inode *new_dir, struct dentry *new_dentry,
-		unsigned int flags)
+v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		struct dentry *old_dentry, struct inode *new_dir,
+		struct dentry *new_dentry, unsigned int flags)
 {
 	int retval;
 	struct inode *old_inode;
@@ -1016,8 +1017,8 @@ done:
  */
 
 static int
-v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
 	struct v9fs_session_info *v9ses;
@@ -1054,7 +1055,8 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
  *
  */
 
-static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
+static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, struct iattr *iattr)
 {
 	int retval, use_dentry = 0;
 	struct v9fs_session_info *v9ses;
@@ -1295,7 +1297,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
  */
 
 static int
-v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+v9fs_vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		 struct dentry *dentry, const char *symname)
 {
 	p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n",
 		 dir->i_ino, dentry, symname);
@@ -1348,7 +1351,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
  */
 
 static int
-v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	       struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
 	int retval;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 984f28315d2a..1dc7af046615 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -33,8 +33,8 @@
 #include "acl.h"
 
 static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
-		    dev_t rdev);
+v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
+		    struct dentry *dentry, umode_t omode, dev_t rdev);
 
 /**
  * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
@@ -218,10 +218,10 @@ int v9fs_open_to_dotl_flags(int flags)
  */
 
 static int
-v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
-		bool excl)
+v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t omode, bool excl)
 {
-	return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0);
+	return v9fs_vfs_mknod_dotl(mnt_userns, dir, dentry, omode, 0);
 }
 
 static int
@@ -367,8 +367,9 @@ err_clunk_old_fid:
  *
  */
 
-static int v9fs_vfs_mkdir_dotl(struct inode *dir,
-			       struct dentry *dentry, umode_t omode)
+static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
+			       struct inode *dir, struct dentry *dentry,
+			       umode_t omode)
 {
 	int err;
 	struct v9fs_session_info *v9ses;
@@ -457,8 +458,9 @@ error:
 }
 
 static int
-v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns,
+		      const struct path *path, struct kstat *stat,
+		      u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
 	struct v9fs_session_info *v9ses;
@@ -540,7 +542,8 @@ static int v9fs_mapped_iattr_valid(int iattr_valid)
  *
  */
 
-int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, struct iattr *iattr)
 {
 	int retval, use_dentry = 0;
 	struct p9_fid *fid = NULL;
@@ -684,8 +687,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
 }
 
 static int
-v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
-		const char *symname)
+v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, const char *symname)
 {
 	int err;
 	kgid_t gid;
@@ -824,8 +827,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
  *
  */
 static int
-v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
-		dev_t rdev)
+v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
+		    struct dentry *dentry, umode_t omode, dev_t rdev)
 {
 	int err;
 	kgid_t gid;
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 699c4fa8b78b..06b7c92343ad 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -144,7 +144,8 @@ struct adfs_discmap {
 /* Inode stuff */
 struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
 int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
+int adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *attr);
 
 /* map.c */
 int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset);
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 278dcee6ae22..fb7ee026d101 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -292,7 +292,8 @@ out:
  * later.
  */
 int
-adfs_notify_change(struct dentry *dentry, struct iattr *attr)
+adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct super_block *sb = inode->i_sb;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a755bef7c4c7..bfa89e131ead 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -167,27 +167,33 @@ extern const struct export_operations affs_export_ops;
 extern int	affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len);
 extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int);
 extern int	affs_unlink(struct inode *dir, struct dentry *dentry);
-extern int	affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool);
-extern int	affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
+extern int	affs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool);
+extern int	affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode);
 extern int	affs_rmdir(struct inode *dir, struct dentry *dentry);
 extern int	affs_link(struct dentry *olddentry, struct inode *dir,
 			  struct dentry *dentry);
-extern int	affs_symlink(struct inode *dir, struct dentry *dentry,
-			     const char *symname);
-extern int	affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
-			    struct inode *new_dir, struct dentry *new_dentry,
-			    unsigned int flags);
+extern int	affs_symlink(struct user_namespace *mnt_userns,
+			struct inode *dir, struct dentry *dentry,
+			const char *symname);
+extern int	affs_rename2(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
+			unsigned int flags);
 
 /* inode.c */
 
 extern struct inode		*affs_new_inode(struct inode *dir);
-extern int			 affs_notify_change(struct dentry *dentry, struct iattr *attr);
+extern int			 affs_notify_change(struct user_namespace *mnt_userns,
+					struct dentry *dentry, struct iattr *attr);
 extern void			 affs_evict_inode(struct inode *inode);
 extern struct inode		*affs_iget(struct super_block *sb,
 					unsigned long ino);
 extern int			 affs_write_inode(struct inode *inode,
 					struct writeback_control *wbc);
-extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
+extern int			 affs_add_entry(struct inode *dir, struct inode *inode,
+					struct dentry *dentry, s32 type);
 
 /* file.c */
 
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 767e5bdfb703..2352a75bd9d6 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -216,7 +216,8 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
 }
 
 int
-affs_notify_change(struct dentry *dentry, struct iattr *attr)
+affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 41c5749f4db7..9ad22befce28 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -242,7 +242,8 @@ affs_unlink(struct inode *dir, struct dentry *dentry)
 }
 
 int
-affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+affs_create(struct user_namespace *mnt_userns, struct inode *dir,
+	    struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode	*inode;
@@ -273,7 +274,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
 }
 
 int
-affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+	   struct dentry *dentry, umode_t mode)
 {
 	struct inode		*inode;
 	int			 error;
@@ -311,7 +313,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
 }
 
 int
-affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+affs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+	     struct dentry *dentry, const char *symname)
 {
 	struct super_block	*sb = dir->i_sb;
 	struct buffer_head	*bh;
@@ -498,9 +501,9 @@ done:
 	return retval;
 }
 
-int affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
-			struct inode *new_dir, struct dentry *new_dentry,
-			unsigned int flags)
+int affs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+		 struct dentry *old_dentry, struct inode *new_dir,
+		 struct dentry *new_dentry, unsigned int flags)
 {
 
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 7bd659ad959e..714fcca9af99 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -28,18 +28,19 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int
 				  loff_t fpos, u64 ino, unsigned dtype);
 static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
 			      loff_t fpos, u64 ino, unsigned dtype);
-static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl);
-static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
+static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl);
+static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode);
 static int afs_rmdir(struct inode *dir, struct dentry *dentry);
 static int afs_unlink(struct inode *dir, struct dentry *dentry);
 static int afs_link(struct dentry *from, struct inode *dir,
 		    struct dentry *dentry);
-static int afs_symlink(struct inode *dir, struct dentry *dentry,
-		       const char *content);
-static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags);
+static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, const char *content);
+static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags);
 static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags);
 static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
 				   unsigned int length);
@@ -1325,7 +1326,8 @@ static const struct afs_operation_ops afs_mkdir_operation = {
 /*
  * create a directory on an AFS filesystem
  */
-static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode)
 {
 	struct afs_operation *op;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
@@ -1619,8 +1621,8 @@ static const struct afs_operation_ops afs_create_operation = {
 /*
  * create a regular file on an AFS filesystem
  */
-static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct afs_operation *op;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
@@ -1741,8 +1743,8 @@ static const struct afs_operation_ops afs_symlink_operation = {
 /*
  * create a symlink in an AFS filesystem
  */
-static int afs_symlink(struct inode *dir, struct dentry *dentry,
-		       const char *content)
+static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, const char *content)
 {
 	struct afs_operation *op;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
@@ -1876,9 +1878,9 @@ static const struct afs_operation_ops afs_rename_operation = {
 /*
  * rename a file in an AFS filesystem and/or move it between directories
  */
-static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct afs_operation *op;
 	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 795ee5cb3817..1156b2df28d3 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -734,8 +734,8 @@ error_unlock:
 /*
  * read the attributes of an inode
  */
-int afs_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int query_flags)
+int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct afs_vnode *vnode = AFS_FS_I(inode);
@@ -857,7 +857,8 @@ static const struct afs_operation_ops afs_setattr_operation = {
 /*
  * set the attributes of an inode
  */
-int afs_setattr(struct dentry *dentry, struct iattr *attr)
+int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *attr)
 {
 	struct afs_operation *op;
 	struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 0d150a29e39e..b626e38e9ab5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1149,8 +1149,9 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
 extern struct inode *afs_root_iget(struct super_block *, struct key *);
 extern bool afs_check_validity(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
-extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int);
-extern int afs_setattr(struct dentry *, struct iattr *);
+extern int afs_getattr(struct user_namespace *mnt_userns, const struct path *,
+		       struct kstat *, u32, unsigned int);
+extern int afs_setattr(struct user_namespace *mnt_userns, struct dentry *, struct iattr *);
 extern void afs_evict_inode(struct inode *);
 extern int afs_drop_inode(struct inode *);
 
@@ -1361,7 +1362,7 @@ extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
 extern struct key *afs_request_key_rcu(struct afs_cell *);
 extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *);
-extern int afs_permission(struct inode *, int);
+extern int afs_permission(struct user_namespace *, struct inode *, int);
 extern void __exit afs_clean_up_permit_cache(void);
 
 /*
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 9cf3102f370c..3c7a8fc4f93f 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -396,7 +396,8 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key,
  * - AFS ACLs are attached to directories only, and a file is controlled by its
  *   parent directory's ACL
  */
-int afs_permission(struct inode *inode, int mask)
+int afs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		   int mask)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_access_t access;
diff --git a/fs/attr.c b/fs/attr.c
index c2d3bc0869d4..41abd0d973d8 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -395,9 +395,9 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
 		return error;
 
 	if (inode->i_op->setattr)
-		error = inode->i_op->setattr(dentry, attr);
+		error = inode->i_op->setattr(mnt_userns, dentry, attr);
 	else
-		error = simple_setattr(dentry, attr);
+		error = simple_setattr(mnt_userns, dentry, attr);
 
 	if (!error) {
 		fsnotify_change(dentry, ia_valid);
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 5aaa1732bf1e..91fe4548c256 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -10,10 +10,12 @@
 
 #include "autofs_i.h"
 
-static int autofs_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs_dir_symlink(struct user_namespace *, struct inode *,
+			      struct dentry *, const char *);
 static int autofs_dir_unlink(struct inode *, struct dentry *);
 static int autofs_dir_rmdir(struct inode *, struct dentry *);
-static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static int autofs_dir_mkdir(struct user_namespace *, struct inode *,
+			    struct dentry *, umode_t);
 static long autofs_root_ioctl(struct file *, unsigned int, unsigned long);
 #ifdef CONFIG_COMPAT
 static long autofs_root_compat_ioctl(struct file *,
@@ -524,9 +526,9 @@ static struct dentry *autofs_lookup(struct inode *dir,
 	return NULL;
 }
 
-static int autofs_dir_symlink(struct inode *dir,
-			       struct dentry *dentry,
-			       const char *symname)
+static int autofs_dir_symlink(struct user_namespace *mnt_userns,
+			      struct inode *dir, struct dentry *dentry,
+			      const char *symname)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
@@ -715,8 +717,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	return 0;
 }
 
-static int autofs_dir_mkdir(struct inode *dir,
-			    struct dentry *dentry, umode_t mode)
+static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
+			    umode_t mode)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 54f0ce444272..48e16144c1f7 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -27,8 +27,9 @@ static const struct file_operations bad_file_ops =
 	.open		= bad_file_open,
 };
 
-static int bad_inode_create (struct inode *dir, struct dentry *dentry,
-		umode_t mode, bool excl)
+static int bad_inode_create(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
+			    umode_t mode, bool excl)
 {
 	return -EIO;
 }
@@ -50,14 +51,15 @@ static int bad_inode_unlink(struct inode *dir, struct dentry *dentry)
 	return -EIO;
 }
 
-static int bad_inode_symlink (struct inode *dir, struct dentry *dentry,
-		const char *symname)
+static int bad_inode_symlink(struct user_namespace *mnt_userns,
+			     struct inode *dir, struct dentry *dentry,
+			     const char *symname)
 {
 	return -EIO;
 }
 
-static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry,
-			umode_t mode)
+static int bad_inode_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode)
 {
 	return -EIO;
 }
@@ -67,13 +69,14 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
 	return -EIO;
 }
 
-static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
-			umode_t mode, dev_t rdev)
+static int bad_inode_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	return -EIO;
 }
 
-static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
+static int bad_inode_rename2(struct user_namespace *mnt_userns,
+			     struct inode *old_dir, struct dentry *old_dentry,
 			     struct inode *new_dir, struct dentry *new_dentry,
 			     unsigned int flags)
 {
@@ -86,18 +89,21 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
 	return -EIO;
 }
 
-static int bad_inode_permission(struct inode *inode, int mask)
+static int bad_inode_permission(struct user_namespace *mnt_userns,
+				struct inode *inode, int mask)
 {
 	return -EIO;
 }
 
-static int bad_inode_getattr(const struct path *path, struct kstat *stat,
+static int bad_inode_getattr(struct user_namespace *mnt_userns,
+			     const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
 	return -EIO;
 }
 
-static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
+static int bad_inode_setattr(struct user_namespace *mnt_userns,
+			     struct dentry *direntry, struct iattr *attrs)
 {
 	return -EIO;
 }
@@ -140,13 +146,15 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry,
 	return -EIO;
 }
 
-static int bad_inode_tmpfile(struct inode *inode, struct dentry *dentry,
+static int bad_inode_tmpfile(struct user_namespace *mnt_userns,
+			     struct inode *inode, struct dentry *dentry,
 			     umode_t mode)
 {
 	return -EIO;
 }
 
-static int bad_inode_set_acl(struct inode *inode, struct posix_acl *acl,
+static int bad_inode_set_acl(struct user_namespace *mnt_userns,
+			     struct inode *inode, struct posix_acl *acl,
 			     int type)
 {
 	return -EIO;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index be1335a8d25b..34d4f68f786b 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -75,8 +75,8 @@ const struct file_operations bfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-						bool excl)
+static int bfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	int err;
 	struct inode *inode;
@@ -199,9 +199,9 @@ out_brelse:
 	return error;
 }
 
-static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int bfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *old_inode, *new_inode;
 	struct buffer_head *old_bh, *new_bh;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index d12a5a8730a8..d95eb5c8cb37 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -107,7 +107,8 @@ out:
 	return ret;
 }
 
-int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		  struct posix_acl *acl, int type)
 {
 	int ret;
 	umode_t old_mode = inode->i_mode;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e6e37591f1de..9c0b43853cd2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3625,7 +3625,8 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
 struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
-int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		  struct posix_acl *acl, int type);
 int btrfs_init_acl(struct btrfs_trans_handle *trans,
 		   struct inode *inode, struct inode *dir);
 #else
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a63faed171de..c0b11db98e5e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5045,7 +5045,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 	return ret;
 }
 
-static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+			 struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -6352,8 +6353,8 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 	return err;
 }
 
-static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
-			umode_t mode, dev_t rdev)
+static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct btrfs_trans_handle *trans;
@@ -6416,8 +6417,8 @@ out_unlock:
 	return err;
 }
 
-static int btrfs_create(struct inode *dir, struct dentry *dentry,
-			umode_t mode, bool excl)
+static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct btrfs_trans_handle *trans;
@@ -6561,7 +6562,8 @@ fail:
 	return err;
 }
 
-static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct inode *inode = NULL;
@@ -8816,7 +8818,8 @@ fail:
 	return -ENOMEM;
 }
 
-static int btrfs_getattr(const struct path *path, struct kstat *stat,
+static int btrfs_getattr(struct user_namespace *mnt_userns,
+			 const struct path *path, struct kstat *stat,
 			 u32 request_mask, unsigned int flags)
 {
 	u64 delalloc_bytes;
@@ -9333,9 +9336,9 @@ out_notrans:
 	return ret;
 }
 
-static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
-			 struct inode *new_dir, struct dentry *new_dentry,
-			 unsigned int flags)
+static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+			 struct dentry *old_dentry, struct inode *new_dir,
+			 struct dentry *new_dentry, unsigned int flags)
 {
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return -EINVAL;
@@ -9543,8 +9546,8 @@ out:
 	return ret;
 }
 
-static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
-			 const char *symname)
+static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct btrfs_trans_handle *trans;
@@ -9878,7 +9881,8 @@ static int btrfs_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
-static int btrfs_permission(struct inode *inode, int mask)
+static int btrfs_permission(struct user_namespace *mnt_userns,
+			    struct inode *inode, int mask)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	umode_t mode = inode->i_mode;
@@ -9893,7 +9897,8 @@ static int btrfs_permission(struct inode *inode, int mask)
 	return generic_permission(&init_user_ns, inode, mask);
 }
 
-static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct btrfs_trans_handle *trans;
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 52a01ddbc4ac..529af59d9fd3 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -82,7 +82,8 @@ retry:
 	return acl;
 }
 
-int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	int ret = 0, size = 0;
 	const char *name = NULL;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 858ee7362ff5..83d9358854fb 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -823,8 +823,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
 	return PTR_ERR(result);
 }
 
-static int ceph_mknod(struct inode *dir, struct dentry *dentry,
-		      umode_t mode, dev_t rdev)
+static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
 	struct ceph_mds_request *req;
@@ -878,14 +878,14 @@ out:
 	return err;
 }
 
-static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		       bool excl)
+static int ceph_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
-	return ceph_mknod(dir, dentry, mode, 0);
+	return ceph_mknod(mnt_userns, dir, dentry, mode, 0);
 }
 
-static int ceph_symlink(struct inode *dir, struct dentry *dentry,
-			    const char *dest)
+static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, const char *dest)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
 	struct ceph_mds_request *req;
@@ -937,7 +937,8 @@ out:
 	return err;
 }
 
-static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
 	struct ceph_mds_request *req;
@@ -1183,9 +1184,9 @@ out:
 	return err;
 }
 
-static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry,
-		       unsigned int flags)
+static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
 	struct ceph_mds_request *req;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 179a2bb88538..d6ece56d40e8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2238,7 +2238,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
 /*
  * setattr
  */
-int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -2321,7 +2322,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
  * Check inode permissions.  We verify we have a valid value for
  * the AUTH cap, then call the generic handler.
  */
-int ceph_permission(struct inode *inode, int mask)
+int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask)
 {
 	int err;
 
@@ -2368,8 +2370,8 @@ static int statx_to_caps(u32 want, umode_t mode)
  * Get all the attributes. If we have sufficient caps for the requested attrs,
  * then we can avoid talking to the MDS at all.
  */
-int ceph_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct ceph_inode_info *ci = ceph_inode(inode);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b62d8fee3b86..1ef0a2a15817 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -973,10 +973,13 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
 {
 	return __ceph_do_getattr(inode, NULL, mask, force);
 }
-extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_permission(struct user_namespace *mnt_userns,
+			   struct inode *inode, int mask);
 extern int __ceph_setattr(struct inode *inode, struct iattr *attr);
-extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
-extern int ceph_getattr(const struct path *path, struct kstat *stat,
+extern int ceph_setattr(struct user_namespace *mnt_userns,
+			struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(struct user_namespace *mnt_userns,
+			const struct path *path, struct kstat *stat,
 			u32 request_mask, unsigned int flags);
 
 /* xattr.c */
@@ -1037,7 +1040,8 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 
 struct posix_acl *ceph_get_acl(struct inode *, int);
-int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ceph_set_acl(struct user_namespace *mnt_userns,
+		 struct inode *inode, struct posix_acl *acl, int type);
 int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
 		       struct ceph_acl_sec_ctx *as_ctx);
 void ceph_init_inode_acls(struct inode *inode,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ce14e6f8adb6..39e51dcf796f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -305,7 +305,8 @@ static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
 	return -EOPNOTSUPP;
 }
 
-static int cifs_permission(struct inode *inode, int mask)
+static int cifs_permission(struct user_namespace *mnt_userns,
+			   struct inode *inode, int mask)
 {
 	struct cifs_sb_info *cifs_sb;
 
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2307bb0f6147..71e9c6abb2a6 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -62,19 +62,22 @@ extern void cifs_sb_deactive(struct super_block *sb);
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
 extern struct inode *cifs_root_iget(struct super_block *);
-extern int cifs_create(struct inode *, struct dentry *, umode_t,
-		       bool excl);
+extern int cifs_create(struct user_namespace *, struct inode *,
+		       struct dentry *, umode_t, bool excl);
 extern int cifs_atomic_open(struct inode *, struct dentry *,
 			    struct file *, unsigned, umode_t);
 extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
 				  unsigned int);
 extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
 extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
-extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
-extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
+extern int cifs_mknod(struct user_namespace *, struct inode *, struct dentry *,
+		      umode_t, dev_t);
+extern int cifs_mkdir(struct user_namespace *, struct inode *, struct dentry *,
+		      umode_t);
 extern int cifs_rmdir(struct inode *, struct dentry *);
-extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
-			struct dentry *, unsigned int);
+extern int cifs_rename2(struct user_namespace *, struct inode *,
+			struct dentry *, struct inode *, struct dentry *,
+			unsigned int);
 extern int cifs_revalidate_file_attr(struct file *filp);
 extern int cifs_revalidate_dentry_attr(struct dentry *);
 extern int cifs_revalidate_file(struct file *filp);
@@ -82,8 +85,10 @@ extern int cifs_revalidate_dentry(struct dentry *);
 extern int cifs_invalidate_mapping(struct inode *inode);
 extern int cifs_revalidate_mapping(struct inode *inode);
 extern int cifs_zap_mapping(struct inode *inode);
-extern int cifs_getattr(const struct path *, struct kstat *, u32, unsigned int);
-extern int cifs_setattr(struct dentry *, struct iattr *);
+extern int cifs_getattr(struct user_namespace *, const struct path *,
+			struct kstat *, u32, unsigned int);
+extern int cifs_setattr(struct user_namespace *, struct dentry *,
+			struct iattr *);
 extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start,
 		       u64 len);
 
@@ -132,8 +137,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
 /* Functions related to symlinks */
 extern const char *cifs_get_link(struct dentry *, struct inode *,
 			struct delayed_call *);
-extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
-			const char *symname);
+extern int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
+			struct dentry *direntry, const char *symname);
 
 #ifdef CONFIG_CIFS_XATTR
 extern const struct xattr_handler *cifs_xattr_handlers[];
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 68900f1629bf..68f4f8536e6a 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -567,8 +567,8 @@ out_free_xid:
 	return rc;
 }
 
-int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
-		bool excl)
+int cifs_create(struct user_namespace *mnt_userns, struct inode *inode,
+		struct dentry *direntry, umode_t mode, bool excl)
 {
 	int rc;
 	unsigned int xid = get_xid();
@@ -611,8 +611,8 @@ out_free_xid:
 	return rc;
 }
 
-int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
-		dev_t device_number)
+int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode,
+	       struct dentry *direntry, umode_t mode, dev_t device_number)
 {
 	int rc = -EPERM;
 	unsigned int xid;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 374abce7efaf..3e9c7bb23f26 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1857,7 +1857,8 @@ posix_mkdir_get_info:
 	goto posix_mkdir_out;
 }
 
-int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
+int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
+	       struct dentry *direntry, umode_t mode)
 {
 	int rc = 0;
 	unsigned int xid;
@@ -2067,9 +2068,9 @@ do_rename_exit:
 }
 
 int
-cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
-	     struct inode *target_dir, struct dentry *target_dentry,
-	     unsigned int flags)
+cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
+	     struct dentry *source_dentry, struct inode *target_dir,
+	     struct dentry *target_dentry, unsigned int flags)
 {
 	char *from_name = NULL;
 	char *to_name = NULL;
@@ -2370,8 +2371,8 @@ int cifs_revalidate_dentry(struct dentry *dentry)
 	return cifs_revalidate_mapping(inode);
 }
 
-int cifs_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
@@ -2923,7 +2924,8 @@ cifs_setattr_exit:
 }
 
 int
-cifs_setattr(struct dentry *direntry, struct iattr *attrs)
+cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry,
+	     struct iattr *attrs)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
 	struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 94dab4309fbb..7c5878a645d9 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -661,7 +661,8 @@ cifs_get_link(struct dentry *direntry, struct inode *inode,
 }
 
 int
-cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
+cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
+	     struct dentry *direntry, const char *symname)
 {
 	int rc = -EOPNOTSUPP;
 	unsigned int xid;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index d5ebd36fb2cc..e7b27754ce78 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -46,10 +46,12 @@ extern const struct file_operations coda_ioctl_operations;
 /* operations shared over more than one file */
 int coda_open(struct inode *i, struct file *f);
 int coda_release(struct inode *i, struct file *f);
-int coda_permission(struct inode *inode, int mask);
+int coda_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask);
 int coda_revalidate_inode(struct inode *);
-int coda_getattr(const struct path *, struct kstat *, u32, unsigned int);
-int coda_setattr(struct dentry *, struct iattr *);
+int coda_getattr(struct user_namespace *, const struct path *, struct kstat *,
+		 u32, unsigned int);
+int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 
 /* this file:  heloers */
 char *coda_f2s(struct CodaFid *f);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index ca40c2556ba6..d69989c1bac3 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -73,7 +73,8 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig
 }
 
 
-int coda_permission(struct inode *inode, int mask)
+int coda_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask)
 {
 	int error;
 
@@ -132,7 +133,8 @@ static inline void coda_dir_drop_nlink(struct inode *dir)
 }
 
 /* creation routines: create, mknod, mkdir, link, symlink */
-static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, bool excl)
+static int coda_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *de, umode_t mode, bool excl)
 {
 	int error;
 	const char *name=de->d_name.name;
@@ -164,7 +166,8 @@ err_out:
 	return error;
 }
 
-static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode)
+static int coda_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *de, umode_t mode)
 {
 	struct inode *inode;
 	struct coda_vattr attrs;
@@ -225,7 +228,8 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
 }
 
 
-static int coda_symlink(struct inode *dir_inode, struct dentry *de,
+static int coda_symlink(struct user_namespace *mnt_userns,
+			struct inode *dir_inode, struct dentry *de,
 			const char *symname)
 {
 	const char *name = de->d_name.name;
@@ -291,9 +295,9 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 }
 
 /* rename */
-static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry,
-		       unsigned int flags)
+static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	const char *old_name = old_dentry->d_name.name;
 	const char *new_name = new_dentry->d_name.name;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 4d113e191cb8..d9f1bd7153df 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -251,8 +251,8 @@ static void coda_evict_inode(struct inode *inode)
 	coda_cache_clear_inode(inode);
 }
 
-int coda_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+int coda_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	int err = coda_revalidate_inode(d_inode(path->dentry));
 	if (!err)
@@ -260,7 +260,8 @@ int coda_getattr(const struct path *path, struct kstat *stat,
 	return err;
 }
 
-int coda_setattr(struct dentry *de, struct iattr *iattr)
+int coda_setattr(struct user_namespace *mnt_userns, struct dentry *de,
+		 struct iattr *iattr)
 {
 	struct inode *inode = d_inode(de);
 	struct coda_vattr vattr;
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 3aec27e5eb82..cb9fd59a688c 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -24,7 +24,8 @@
 #include "coda_linux.h"
 
 /* pioctl ops */
-static int coda_ioctl_permission(struct inode *inode, int mask);
+static int coda_ioctl_permission(struct user_namespace *mnt_userns,
+				 struct inode *inode, int mask);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
 			unsigned long user_data);
 
@@ -40,7 +41,8 @@ const struct file_operations coda_ioctl_operations = {
 };
 
 /* the coda pioctl inode ops */
-static int coda_ioctl_permission(struct inode *inode, int mask)
+static int coda_ioctl_permission(struct user_namespace *mnt_userns,
+				 struct inode *inode, int mask)
 {
 	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 22dce2d35a4b..9a3aed249692 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -79,7 +79,8 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
 
 extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
 extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
-extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr);
+extern int configfs_setattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, struct iattr *iattr);
 
 extern struct dentry *configfs_pin_fs(void);
 extern void configfs_release_fs(void);
@@ -92,7 +93,8 @@ extern const struct inode_operations configfs_root_inode_operations;
 extern const struct inode_operations configfs_symlink_inode_operations;
 extern const struct dentry_operations configfs_dentry_ops;
 
-extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
+extern int configfs_symlink(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
 			    const char *symname);
 extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
 
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index b839dd1b459f..b6098e02e20b 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1268,7 +1268,8 @@ out_root_unlock:
 }
 EXPORT_SYMBOL(configfs_depend_item_unlocked);
 
-static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int configfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode)
 {
 	int ret = 0;
 	int module_got = 0;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 8bd6a883c94c..42c348bb2903 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -40,7 +40,8 @@ static const struct inode_operations configfs_inode_operations ={
 	.setattr	= configfs_setattr,
 };
 
-int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
+int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		     struct iattr *iattr)
 {
 	struct inode * inode = d_inode(dentry);
 	struct configfs_dirent * sd = dentry->d_fsdata;
@@ -67,7 +68,7 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 	}
 	/* attributes were changed atleast once in past */
 
-	error = simple_setattr(dentry, iattr);
+	error = simple_setattr(mnt_userns, dentry, iattr);
 	if (error)
 		return error;
 
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 8ca36394fa30..77c854364e60 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -139,7 +139,8 @@ static int get_target(const char *symname, struct path *path,
 }
 
 
-int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+int configfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, const char *symname)
 {
 	int ret;
 	struct path path;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 2fcf66473436..c35249497b9b 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -42,13 +42,14 @@ static unsigned int debugfs_allow = DEFAULT_DEBUGFS_ALLOW_BITS;
  * so that we can use the file mode as part of a heuristic to determine whether
  * to lock down individual files.
  */
-static int debugfs_setattr(struct dentry *dentry, struct iattr *ia)
+static int debugfs_setattr(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, struct iattr *ia)
 {
 	int ret = security_locked_down(LOCKDOWN_DEBUGFS);
 
 	if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
 		return ret;
-	return simple_setattr(dentry, ia);
+	return simple_setattr(&init_user_ns, dentry, ia);
 }
 
 static const struct inode_operations debugfs_file_inode_operations = {
@@ -775,8 +776,8 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
 
 	take_dentry_name_snapshot(&old_name, old_dentry);
 
-	error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir),
-			      dentry, 0);
+	error = simple_rename(&init_user_ns, d_inode(old_dir), old_dentry,
+			      d_inode(new_dir), dentry, 0);
 	if (error) {
 		release_dentry_name_snapshot(&old_name);
 		goto exit;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 73e3d47e7b2d..55da9a91f51a 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -257,7 +257,8 @@ out:
  * Returns zero on success; non-zero on error condition
  */
 static int
-ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
+ecryptfs_create(struct user_namespace *mnt_userns,
+		struct inode *directory_inode, struct dentry *ecryptfs_dentry,
 		umode_t mode, bool excl)
 {
 	struct inode *ecryptfs_inode;
@@ -463,7 +464,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
 	return ecryptfs_do_unlink(dir, dentry, d_inode(dentry));
 }
 
-static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
+static int ecryptfs_symlink(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
 			    const char *symname)
 {
 	int rc;
@@ -502,7 +504,8 @@ out_lock:
 	return rc;
 }
 
-static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode)
 {
 	int rc;
 	struct dentry *lower_dentry;
@@ -559,7 +562,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 }
 
 static int
-ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	       struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	int rc;
 	struct dentry *lower_dentry;
@@ -584,9 +588,9 @@ out:
 }
 
 static int
-ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		struct inode *new_dir, struct dentry *new_dentry,
-		unsigned int flags)
+ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		struct dentry *old_dentry, struct inode *new_dir,
+		struct dentry *new_dentry, unsigned int flags)
 {
 	int rc;
 	struct dentry *lower_old_dentry;
@@ -874,7 +878,8 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 }
 
 static int
-ecryptfs_permission(struct inode *inode, int mask)
+ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask)
 {
 	return inode_permission(&init_user_ns,
 				ecryptfs_inode_to_lower(inode), mask);
@@ -892,7 +897,8 @@ ecryptfs_permission(struct inode *inode, int mask)
  * All other metadata changes will be passed right to the lower filesystem,
  * and we will just update our inode to look like the lower.
  */
-static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
+static int ecryptfs_setattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, struct iattr *ia)
 {
 	int rc = 0;
 	struct dentry *lower_dentry;
@@ -979,7 +985,8 @@ out:
 	return rc;
 }
 
-static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat,
+static int ecryptfs_getattr_link(struct user_namespace *mnt_userns,
+				 const struct path *path, struct kstat *stat,
 				 u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
@@ -1004,7 +1011,8 @@ static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat,
 	return rc;
 }
 
-static int ecryptfs_getattr(const struct path *path, struct kstat *stat,
+static int ecryptfs_getattr(struct user_namespace *mnt_userns,
+			    const struct path *path, struct kstat *stat,
 			    u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 0297ad95eb5c..14e2947975fd 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -66,8 +66,8 @@ bool efivarfs_valid_name(const char *str, int len)
 	return uuid_is_valid(s);
 }
 
-static int efivarfs_create(struct inode *dir, struct dentry *dentry,
-			  umode_t mode, bool excl)
+static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode = NULL;
 	struct efivar_entry *var;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 083818063ac6..119fdce1b520 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -331,8 +331,9 @@ struct inode *erofs_iget(struct super_block *sb,
 	return inode;
 }
 
-int erofs_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int query_flags)
+int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask,
+		  unsigned int query_flags)
 {
 	struct inode *const inode = d_inode(path->dentry);
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 67a7ec945686..351dae524a0c 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -373,8 +373,9 @@ extern const struct inode_operations erofs_symlink_iops;
 extern const struct inode_operations erofs_fast_symlink_iops;
 
 struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir);
-int erofs_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int query_flags);
+int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask,
+		  unsigned int query_flags);
 
 /* namei.c */
 extern const struct inode_operations erofs_dir_iops;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index b8f0e829ecbd..d905bb9cd2ca 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -416,9 +416,11 @@ int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count);
 extern const struct file_operations exfat_file_operations;
 int __exfat_truncate(struct inode *inode, loff_t new_size);
 void exfat_truncate(struct inode *inode, loff_t size);
-int exfat_setattr(struct dentry *dentry, struct iattr *attr);
-int exfat_getattr(const struct path *path, struct kstat *stat,
-		unsigned int request_mask, unsigned int query_flags);
+int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr);
+int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, unsigned int request_mask,
+		  unsigned int query_flags);
 int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
 
 /* namei.c */
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index e9705b3295d3..3aa6eb4de5e3 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -267,8 +267,9 @@ write_size:
 	mutex_unlock(&sbi->s_lock);
 }
 
-int exfat_getattr(const struct path *path, struct kstat *stat,
-		unsigned int request_mask, unsigned int query_flags)
+int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path,
+		  struct kstat *stat, unsigned int request_mask,
+		  unsigned int query_flags)
 {
 	struct inode *inode = d_backing_inode(path->dentry);
 	struct exfat_inode_info *ei = EXFAT_I(inode);
@@ -282,7 +283,8 @@ int exfat_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-int exfat_setattr(struct dentry *dentry, struct iattr *attr)
+int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr)
 {
 	struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 2932b23a3b6c..d9e8ec689c55 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -541,8 +541,8 @@ out:
 	return ret;
 }
 
-static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool excl)
+static int exfat_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode;
@@ -827,7 +827,8 @@ unlock:
 	return err;
 }
 
-static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode;
@@ -1318,9 +1319,10 @@ out:
 	return ret;
 }
 
-static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry,
-		struct inode *new_dir, struct dentry *new_dentry,
-		unsigned int flags)
+static int exfat_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
+			unsigned int flags)
 {
 	struct inode *old_inode, *new_inode;
 	struct super_block *sb = old_dir->i_sb;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 9031f7df2d48..b9a9db98e94b 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -216,7 +216,8 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
  * inode->i_mutex: down
  */
 int
-ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+	     struct posix_acl *acl, int type)
 {
 	int error;
 	int update_mode = 0;
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 0f01c759daac..917db5f6630a 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -56,7 +56,8 @@ static inline int ext2_acl_count(size_t size)
 
 /* acl.c */
 extern struct posix_acl *ext2_get_acl(struct inode *inode, int type);
-extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+			struct posix_acl *acl, int type);
 extern int ext2_init_acl (struct inode *, struct inode *);
 
 #else
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 2a4175fbaf5e..3309fb2d327a 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -764,8 +764,9 @@ extern struct inode *ext2_iget (struct super_block *, unsigned long);
 extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_evict_inode(struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
-extern int ext2_setattr (struct dentry *, struct iattr *);
-extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int);
+extern int ext2_setattr (struct user_namespace *, struct dentry *, struct iattr *);
+extern int ext2_getattr (struct user_namespace *, const struct path *,
+			 struct kstat *, u32, unsigned int);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 3d8acafca8ce..68178b2234bd 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1638,8 +1638,8 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
 
-int ext2_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int query_flags)
+int ext2_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1664,7 +1664,8 @@ int ext2_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
+int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index ea980f1e2e99..3367384d344d 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -100,7 +100,9 @@ struct dentry *ext2_get_parent(struct dentry *child)
  * If the create succeeds, we fill in the inode information
  * with d_instantiate(). 
  */
-static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl)
+static int ext2_create (struct user_namespace * mnt_userns,
+			struct inode * dir, struct dentry * dentry,
+			umode_t mode, bool excl)
 {
 	struct inode *inode;
 	int err;
@@ -118,7 +120,8 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 	return ext2_add_nondir(dentry, inode);
 }
 
-static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode = ext2_new_inode(dir, mode, NULL);
 	if (IS_ERR(inode))
@@ -131,7 +134,8 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return 0;
 }
 
-static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir,
+	struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode * inode;
 	int err;
@@ -151,8 +155,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode,
 	return err;
 }
 
-static int ext2_symlink (struct inode * dir, struct dentry * dentry,
-	const char * symname)
+static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir,
+	struct dentry * dentry, const char * symname)
 {
 	struct super_block * sb = dir->i_sb;
 	int err = -ENAMETOOLONG;
@@ -225,7 +229,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
 	return err;
 }
 
-static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
+static int ext2_mkdir(struct user_namespace * mnt_userns,
+	struct inode * dir, struct dentry * dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err;
@@ -315,8 +320,9 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry)
 	return err;
 }
 
-static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
-			struct inode * new_dir,	struct dentry * new_dentry,
+static int ext2_rename (struct user_namespace * mnt_userns,
+			struct inode * old_dir, struct dentry * old_dentry,
+			struct inode * new_dir, struct dentry * new_dentry,
 			unsigned int flags)
 {
 	struct inode * old_inode = d_inode(old_dentry);
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 7b0fb66bc04d..059434e0f36c 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -222,7 +222,8 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 
 int
-ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+	     struct posix_acl *acl, int type)
 {
 	handle_t *handle;
 	int error, credits, retries = 0;
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9b63f5416a2f..84b8942a57f2 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -56,7 +56,8 @@ static inline int ext4_acl_count(size_t size)
 
 /* acl.c */
 struct posix_acl *ext4_get_acl(struct inode *inode, int type);
-int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2866d249f3d2..3c750f5e8ebd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2877,11 +2877,14 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 	__ext4_iget((sb), (ino), (flags), __func__, __LINE__)
 
 extern int  ext4_write_inode(struct inode *, struct writeback_control *);
-extern int  ext4_setattr(struct dentry *, struct iattr *);
-extern int  ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int  ext4_setattr(struct user_namespace *, struct dentry *,
+			 struct iattr *);
+extern int  ext4_getattr(struct user_namespace *, const struct path *,
+			 struct kstat *, u32, unsigned int);
 extern void ext4_evict_inode(struct inode *);
 extern void ext4_clear_inode(struct inode *);
-extern int  ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int  ext4_file_getattr(struct user_namespace *, const struct path *,
+			      struct kstat *, u32, unsigned int);
 extern int  ext4_sync_inode(handle_t *, struct inode *);
 extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3a303d3f8423..ce45535336fa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5319,7 +5319,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
  *
  * Called with inode->i_mutex down.
  */
-int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error, rc = 0;
@@ -5535,8 +5536,8 @@ err_out:
 	return error;
 }
 
-int ext4_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int query_flags)
+int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct ext4_inode *raw_inode;
@@ -5575,13 +5576,14 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-int ext4_file_getattr(const struct path *path, struct kstat *stat,
+int ext4_file_getattr(struct user_namespace *mnt_userns,
+		      const struct path *path, struct kstat *stat,
 		      u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	u64 delalloc_blocks;
 
-	ext4_getattr(path, stat, request_mask, query_flags);
+	ext4_getattr(&init_user_ns, path, stat, request_mask, query_flags);
 
 	/*
 	 * If there is inline data in the inode, the inode will normally not
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index cf652ba3e74d..13dff80aedcb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2596,8 +2596,8 @@ static int ext4_add_nondir(handle_t *handle,
  * If the create succeeds, we fill in the inode information
  * with d_instantiate().
  */
-static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		       bool excl)
+static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -2631,8 +2631,8 @@ retry:
 	return err;
 }
 
-static int ext4_mknod(struct inode *dir, struct dentry *dentry,
-		      umode_t mode, dev_t rdev)
+static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -2665,7 +2665,8 @@ retry:
 	return err;
 }
 
-static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -2774,7 +2775,8 @@ out:
 	return err;
 }
 
-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -3292,7 +3294,7 @@ out_trace:
 	return retval;
 }
 
-static int ext4_symlink(struct inode *dir,
+static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
 			struct dentry *dentry, const char *symname)
 {
 	handle_t *handle;
@@ -4085,7 +4087,8 @@ end_rename:
 	return retval;
 }
 
-static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+static int ext4_rename2(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 6a95bf28f602..a19e86c9adac 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -249,7 +249,8 @@ static int __f2fs_set_acl(struct inode *inode, int type,
 	return error;
 }
 
-int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
 		return -EIO;
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 124868c13f80..986fd1bc780b 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -34,7 +34,8 @@ struct f2fs_acl_header {
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
 
 extern struct posix_acl *f2fs_get_acl(struct inode *, int);
-extern int f2fs_set_acl(struct inode *, struct posix_acl *, int);
+extern int f2fs_set_acl(struct user_namespace *, struct inode *,
+			struct posix_acl *, int);
 extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
 							struct page *);
 #else
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index bb11759191dc..c9002b1933f0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3135,9 +3135,10 @@ void f2fs_truncate_data_blocks(struct dnode_of_data *dn);
 int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock);
 int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock);
 int f2fs_truncate(struct inode *inode);
-int f2fs_getattr(const struct path *path, struct kstat *stat,
-			u32 request_mask, unsigned int flags);
-int f2fs_setattr(struct dentry *dentry, struct iattr *attr);
+int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags);
+int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr);
 int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
 void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
 int f2fs_precache_extents(struct inode *inode);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 44cd0dbdbb5d..8f1e97e7d242 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -783,8 +783,8 @@ int f2fs_truncate(struct inode *inode)
 	return 0;
 }
 
-int f2fs_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int query_flags)
+int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -859,7 +859,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns,
 #define __setattr_copy setattr_copy
 #endif
 
-int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
+int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int err;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index ad98926bacac..c061a67e43a3 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -314,8 +314,8 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
 	}
 }
 
-static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-						bool excl)
+static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
@@ -637,8 +637,8 @@ static const char *f2fs_get_link(struct dentry *dentry,
 	return link;
 }
 
-static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
-					const char *symname)
+static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
@@ -717,7 +717,8 @@ out_free_encrypted_link:
 	return err;
 }
 
-static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
@@ -770,8 +771,8 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
 	return -ENOTEMPTY;
 }
 
-static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
-				umode_t mode, dev_t rdev)
+static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
@@ -874,7 +875,8 @@ out:
 	return err;
 }
 
-static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 
@@ -1247,7 +1249,8 @@ out:
 	return err;
 }
 
-static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+static int f2fs_rename2(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 922a0c6ba46c..02d4d4234956 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -397,9 +397,11 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
 			      unsigned long arg);
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
-extern int fat_setattr(struct dentry *dentry, struct iattr *attr);
+extern int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *attr);
 extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
-extern int fat_getattr(const struct path *path, struct kstat *stat,
+extern int fat_getattr(struct user_namespace *mnt_userns,
+		       const struct path *path, struct kstat *stat,
 		       u32 request_mask, unsigned int flags);
 extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
 			  int datasync);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f7e04f533d31..dd73d1b70c55 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -95,7 +95,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
 		goto out_unlock_inode;
 
 	/* This MUST be done before doing anything irreversible... */
-	err = fat_setattr(file->f_path.dentry, &ia);
+	err = fat_setattr(file_mnt_user_ns(file), file->f_path.dentry, &ia);
 	if (err)
 		goto out_unlock_inode;
 
@@ -394,8 +394,8 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset)
 	fat_flush_inodes(inode->i_sb, inode, NULL);
 }
 
-int fat_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int flags)
+int fat_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	generic_fillattr(&init_user_ns, inode, stat);
@@ -466,7 +466,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 /* valid file mode bits */
 #define FAT_VALID_MODE	(S_IFREG | S_IFDIR | S_IRWXUGO)
 
-int fat_setattr(struct dentry *dentry, struct iattr *attr)
+int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *attr)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
 	struct inode *inode = d_inode(dentry);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 9d062886fbc1..a8f3375d9d10 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -261,8 +261,8 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
 }
 
 /***** Create a file */
-static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			bool excl)
+static int msdos_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode = NULL;
@@ -339,7 +339,8 @@ out:
 }
 
 /***** Make a directory */
-static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int msdos_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct fat_slot_info sinfo;
@@ -593,7 +594,8 @@ error_inode:
 }
 
 /***** Rename, a wrapper for rename_same_dir & rename_diff_dir */
-static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int msdos_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 0cdd0fb9f742..23936ecf79a5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -756,8 +756,8 @@ error:
 	return ERR_PTR(err);
 }
 
-static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		       bool excl)
+static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode;
@@ -846,7 +846,8 @@ out:
 	return err;
 }
 
-static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode;
@@ -892,9 +893,9 @@ out:
 	return err;
 }
 
-static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry,
-		       unsigned int flags)
+static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	struct buffer_head *dotdot_bh;
 	struct msdos_dir_entry *dotdot_de;
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index f529075a2ce8..e9c0f916349d 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -50,7 +50,8 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type)
 	return acl;
 }
 
-int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	const char *name;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d2e318ed9b26..06a18700a845 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -605,7 +605,8 @@ out_err:
 	return err;
 }
 
-static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *,
+		      umode_t, dev_t);
 static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
 			    struct file *file, unsigned flags,
 			    umode_t mode)
@@ -645,7 +646,7 @@ out_dput:
 	return err;
 
 mknod:
-	err = fuse_mknod(dir, entry, mode, 0);
+	err = fuse_mknod(&init_user_ns, dir, entry, mode, 0);
 	if (err)
 		goto out_dput;
 no_open:
@@ -715,8 +716,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
 	return err;
 }
 
-static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
-		      dev_t rdev)
+static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *entry, umode_t mode, dev_t rdev)
 {
 	struct fuse_mknod_in inarg;
 	struct fuse_mount *fm = get_fuse_mount(dir);
@@ -738,13 +739,14 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
 	return create_new_entry(fm, &args, dir, entry, mode);
 }
 
-static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
-		       bool excl)
+static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *entry, umode_t mode, bool excl)
 {
-	return fuse_mknod(dir, entry, mode, 0);
+	return fuse_mknod(&init_user_ns, dir, entry, mode, 0);
 }
 
-static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
+static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *entry, umode_t mode)
 {
 	struct fuse_mkdir_in inarg;
 	struct fuse_mount *fm = get_fuse_mount(dir);
@@ -765,8 +767,8 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
 	return create_new_entry(fm, &args, dir, entry, S_IFDIR);
 }
 
-static int fuse_symlink(struct inode *dir, struct dentry *entry,
-			const char *link)
+static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *entry, const char *link)
 {
 	struct fuse_mount *fm = get_fuse_mount(dir);
 	unsigned len = strlen(link) + 1;
@@ -908,9 +910,9 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
 	return err;
 }
 
-static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
-			struct inode *newdir, struct dentry *newent,
-			unsigned int flags)
+static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir,
+			struct dentry *oldent, struct inode *newdir,
+			struct dentry *newent, unsigned int flags)
 {
 	struct fuse_conn *fc = get_fuse_conn(olddir);
 	int err;
@@ -1249,7 +1251,8 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
  * access request is sent.  Execute permission is still checked
  * locally based on file mode.
  */
-static int fuse_permission(struct inode *inode, int mask)
+static int fuse_permission(struct user_namespace *mnt_userns,
+			   struct inode *inode, int mask)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	bool refreshed = false;
@@ -1757,7 +1760,8 @@ error:
 	return err;
 }
 
-static int fuse_setattr(struct dentry *entry, struct iattr *attr)
+static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry,
+			struct iattr *attr)
 {
 	struct inode *inode = d_inode(entry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1819,7 +1823,8 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 	return ret;
 }
 
-static int fuse_getattr(const struct path *path, struct kstat *stat,
+static int fuse_getattr(struct user_namespace *mnt_userns,
+			const struct path *path, struct kstat *stat,
 			u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7c4b8cb93f9f..68cca8d4db6e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1180,8 +1180,8 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
 
 struct posix_acl;
 struct posix_acl *fuse_get_acl(struct inode *inode, int type);
-int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-
+int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type);
 
 /* readdir.c */
 int fuse_readdir(struct file *file, struct dir_context *ctx);
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index ce88ef29eef0..9165d70ead07 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -106,7 +106,8 @@ out:
 	return error;
 }
 
-int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 61353a1501c5..eccc6a43326c 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,6 +13,7 @@
 
 extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
 extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+			struct posix_acl *acl, int type);
 
 #endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 1d994bdfffaa..8f5523822788 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -256,7 +256,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask,
 	    !capable(CAP_LINUX_IMMUTABLE))
 		goto out;
 	if (!IS_IMMUTABLE(inode)) {
-		error = gfs2_permission(inode, MAY_WRITE);
+		error = gfs2_permission(&init_user_ns, inode, MAY_WRITE);
 		if (error)
 			goto out;
 	}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 226b5b1dc1fa..cfac2c1e67fa 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -325,7 +325,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 	}
 
 	if (!is_root) {
-		error = gfs2_permission(dir, MAY_EXEC);
+		error = gfs2_permission(&init_user_ns, dir, MAY_EXEC);
 		if (error)
 			goto out;
 	}
@@ -355,7 +355,8 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 {
 	int error;
 
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+	error = gfs2_permission(&init_user_ns, &dip->i_inode,
+				MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 
@@ -843,8 +844,8 @@ fail:
  * Returns: errno
  */
 
-static int gfs2_create(struct inode *dir, struct dentry *dentry,
-		       umode_t mode, bool excl)
+static int gfs2_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl);
 }
@@ -951,7 +952,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (inode->i_nlink == 0)
 		goto out_gunlock;
 
-	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
+	error = gfs2_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		goto out_gunlock;
 
@@ -1068,7 +1069,8 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	if (IS_APPEND(&dip->i_inode))
 		return -EPERM;
 
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+	error = gfs2_permission(&init_user_ns, &dip->i_inode,
+				MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 
@@ -1204,8 +1206,8 @@ out_inodes:
  * Returns: errno
  */
 
-static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
-			const char *symname)
+static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	unsigned int size;
 
@@ -1225,7 +1227,8 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
  * Returns: errno
  */
 
-static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir));
 	return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0);
@@ -1240,8 +1243,8 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
  *
  */
 
-static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      dev_t dev)
+static int gfs2_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0);
 }
@@ -1490,7 +1493,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			}
 		}
 	} else {
-		error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
+		error = gfs2_permission(&init_user_ns, ndir,
+					MAY_WRITE | MAY_EXEC);
 		if (error)
 			goto out_gunlock;
 
@@ -1525,7 +1529,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	/* Check out the dir to be renamed */
 
 	if (dir_rename) {
-		error = gfs2_permission(d_inode(odentry), MAY_WRITE);
+		error = gfs2_permission(&init_user_ns, d_inode(odentry),
+					MAY_WRITE);
 		if (error)
 			goto out_gunlock;
 	}
@@ -1688,12 +1693,14 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry,
 		goto out_gunlock;
 
 	if (S_ISDIR(old_mode)) {
-		error = gfs2_permission(odentry->d_inode, MAY_WRITE);
+		error = gfs2_permission(&init_user_ns, odentry->d_inode,
+					MAY_WRITE);
 		if (error)
 			goto out_gunlock;
 	}
 	if (S_ISDIR(new_mode)) {
-		error = gfs2_permission(ndentry->d_inode, MAY_WRITE);
+		error = gfs2_permission(&init_user_ns, ndentry->d_inode,
+					MAY_WRITE);
 		if (error)
 			goto out_gunlock;
 	}
@@ -1747,9 +1754,9 @@ out:
 	return error;
 }
 
-static int gfs2_rename2(struct inode *odir, struct dentry *odentry,
-			struct inode *ndir, struct dentry *ndentry,
-			unsigned int flags)
+static int gfs2_rename2(struct user_namespace *mnt_userns, struct inode *odir,
+			struct dentry *odentry, struct inode *ndir,
+			struct dentry *ndentry, unsigned int flags)
 {
 	flags &= ~RENAME_NOREPLACE;
 
@@ -1833,7 +1840,8 @@ out:
  * Returns: errno
  */
 
-int gfs2_permission(struct inode *inode, int mask)
+int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask)
 {
 	struct gfs2_inode *ip;
 	struct gfs2_holder i_gh;
@@ -1963,7 +1971,8 @@ out:
  * Returns: errno
  */
 
-static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
+static int gfs2_setattr(struct user_namespace *mnt_userns,
+			struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -2008,6 +2017,7 @@ out:
 
 /**
  * gfs2_getattr - Read out an inode's attributes
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @path: Object to query
  * @stat: The inode's stats
  * @request_mask: Mask of STATX_xxx flags indicating the caller's interests
@@ -2022,7 +2032,8 @@ out:
  * Returns: errno
  */
 
-static int gfs2_getattr(const struct path *path, struct kstat *stat,
+static int gfs2_getattr(struct user_namespace *mnt_userns,
+			const struct path *path, struct kstat *stat,
 			u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 8073b8d2c7fa..c447bd5b3017 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,8 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip);
 
 extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 				  int is_root);
-extern int gfs2_permission(struct inode *inode, int mask);
+extern int gfs2_permission(struct user_namespace *mnt_userns,
+			   struct inode *inode, int mask);
 extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 3bf2ae0e467c..527f6e46cbe8 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -189,8 +189,8 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
  * a directory and return a corresponding inode, given the inode for
  * the directory and the name (and its length) of the new file.
  */
-static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int hfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode;
 	int res;
@@ -219,7 +219,8 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
  * in a directory, given the inode for the parent directory and the
  * name (and its length) of the new directory.
  */
-static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int hfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	int res;
@@ -279,9 +280,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
  * new file/directory.
  * XXX: how do you handle must_be dir?
  */
-static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int hfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	int res;
 
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index f71c384064c8..b8eb0322a3e5 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -204,7 +204,8 @@ extern const struct address_space_operations hfs_btree_aops;
 extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t);
 extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
 extern int hfs_write_inode(struct inode *, struct writeback_control *);
-extern int hfs_inode_setattr(struct dentry *, struct iattr *);
+extern int hfs_inode_setattr(struct user_namespace *, struct dentry *,
+			     struct iattr *);
 extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
 			__be32 log_size, __be32 phys_size, u32 clump_size);
 extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index c646218b72bf..3fc5cb346586 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -602,13 +602,15 @@ static int hfs_file_release(struct inode *inode, struct file *file)
  *     correspond to the same HFS file.
  */
 
-int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
+int hfs_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
 	int error;
 
-	error = setattr_prepare(&init_user_ns, dentry, attr); /* basic permission checks */
+	error = setattr_prepare(&init_user_ns, dentry,
+				attr); /* basic permission checks */
 	if (error)
 		return error;
 
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 29a9dcfbe81f..03e6c046faf4 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -434,8 +434,8 @@ out:
 	return res;
 }
 
-static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
-			   const char *symname)
+static int hfsplus_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, const char *symname)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
@@ -476,8 +476,8 @@ out:
 	return res;
 }
 
-static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
-			 umode_t mode, dev_t rdev)
+static int hfsplus_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
 	struct inode *inode;
@@ -517,18 +517,20 @@ out:
 	return res;
 }
 
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			  bool excl)
+static int hfsplus_create(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode, bool excl)
 {
-	return hfsplus_mknod(dir, dentry, mode, 0);
+	return hfsplus_mknod(&init_user_ns, dir, dentry, mode, 0);
 }
 
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int hfsplus_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode)
 {
-	return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
+	return hfsplus_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0);
 }
 
-static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int hfsplus_rename(struct user_namespace *mnt_userns,
+			  struct inode *old_dir, struct dentry *old_dentry,
 			  struct inode *new_dir, struct dentry *new_dentry,
 			  unsigned int flags)
 {
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index a92de5199ec3..12b20479ed2b 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -488,8 +488,9 @@ void hfsplus_inode_write_fork(struct inode *inode,
 			      struct hfsplus_fork_raw *fork);
 int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd);
 int hfsplus_cat_write_inode(struct inode *inode);
-int hfsplus_getattr(const struct path *path, struct kstat *stat,
-		    u32 request_mask, unsigned int query_flags);
+int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		    struct kstat *stat, u32 request_mask,
+		    unsigned int query_flags);
 int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 		       int datasync);
 
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 642e067d8fe8..7a937de9b2ad 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -241,7 +241,8 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
+static int hfsplus_setattr(struct user_namespace *mnt_userns,
+			   struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
@@ -270,8 +271,9 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
-int hfsplus_getattr(const struct path *path, struct kstat *stat,
-		    u32 request_mask, unsigned int query_flags)
+int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		    struct kstat *stat, u32 request_mask,
+		    unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 6970e29a5287..7c918cd816a3 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -555,8 +555,8 @@ static int read_name(struct inode *ino, char *name)
 	return 0;
 }
 
-static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			 bool excl)
+static int hostfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode;
 	char *name;
@@ -654,8 +654,8 @@ static int hostfs_unlink(struct inode *ino, struct dentry *dentry)
 	return err;
 }
 
-static int hostfs_symlink(struct inode *ino, struct dentry *dentry,
-			  const char *to)
+static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino,
+			  struct dentry *dentry, const char *to)
 {
 	char *file;
 	int err;
@@ -667,7 +667,8 @@ static int hostfs_symlink(struct inode *ino, struct dentry *dentry,
 	return err;
 }
 
-static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
+static int hostfs_mkdir(struct user_namespace *mnt_userns, struct inode *ino,
+			struct dentry *dentry, umode_t mode)
 {
 	char *file;
 	int err;
@@ -691,7 +692,8 @@ static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
 	return err;
 }
 
-static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	struct inode *inode;
 	char *name;
@@ -729,7 +731,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return err;
 }
 
-static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+static int hostfs_rename2(struct user_namespace *mnt_userns,
+			  struct inode *old_dir, struct dentry *old_dentry,
 			  struct inode *new_dir, struct dentry *new_dentry,
 			  unsigned int flags)
 {
@@ -757,7 +760,8 @@ static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
 	return err;
 }
 
-static int hostfs_permission(struct inode *ino, int desired)
+static int hostfs_permission(struct user_namespace *mnt_userns,
+			     struct inode *ino, int desired)
 {
 	char *name;
 	int r = 0, w = 0, x = 0, err;
@@ -783,7 +787,8 @@ static int hostfs_permission(struct inode *ino, int desired)
 	return err;
 }
 
-static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int hostfs_setattr(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct hostfs_iattr attrs;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 1cca83218fb5..167ec6884642 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -280,7 +280,7 @@ void hpfs_init_inode(struct inode *);
 void hpfs_read_inode(struct inode *);
 void hpfs_write_inode(struct inode *);
 void hpfs_write_inode_nolock(struct inode *);
-int hpfs_setattr(struct dentry *, struct iattr *);
+int hpfs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 void hpfs_write_if_changed(struct inode *);
 void hpfs_evict_inode(struct inode *);
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 8ba2152a78ba..82208cc28ebd 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -257,7 +257,8 @@ void hpfs_write_inode_nolock(struct inode *i)
 	brelse(bh);
 }
 
-int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
+int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error = -EINVAL;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 1aee39160ac5..d73f8a67168e 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -20,7 +20,8 @@ static void hpfs_update_directory_times(struct inode *dir)
 	hpfs_write_inode_nolock(dir);
 }
 
-static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int hpfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
@@ -128,7 +129,8 @@ bail:
 	return err;
 }
 
-static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+static int hpfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
@@ -215,7 +217,8 @@ bail:
 	return err;
 }
 
-static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+static int hpfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
@@ -289,7 +292,8 @@ bail:
 	return err;
 }
 
-static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink)
+static int hpfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, const char *symlink)
 {
 	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
@@ -506,10 +510,10 @@ fail:
 const struct address_space_operations hpfs_symlink_aops = {
 	.readpage	= hpfs_symlink_readpage
 };
-	
-static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry,
-		       unsigned int flags)
+
+static int hpfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	const unsigned char *old_name = old_dentry->d_name.name;
 	unsigned old_len = old_dentry->d_name.len;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 327e572b4e00..c5c32eb59498 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -751,7 +751,8 @@ out:
 	return error;
 }
 
-static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
+			     struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct hstate *h = hstate_inode(inode);
@@ -898,33 +899,39 @@ static int do_hugetlbfs_mknod(struct inode *dir,
 	return error;
 }
 
-static int hugetlbfs_mknod(struct inode *dir,
-			struct dentry *dentry, umode_t mode, dev_t dev)
+static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
 }
 
-static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode)
 {
-	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+	int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry,
+				     mode | S_IFDIR, 0);
 	if (!retval)
 		inc_nlink(dir);
 	return retval;
 }
 
-static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+static int hugetlbfs_create(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
+			    umode_t mode, bool excl)
 {
-	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
+	return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
 }
 
-static int hugetlbfs_tmpfile(struct inode *dir,
-			struct dentry *dentry, umode_t mode)
+static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns,
+			     struct inode *dir, struct dentry *dentry,
+			     umode_t mode)
 {
 	return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
 }
 
-static int hugetlbfs_symlink(struct inode *dir,
-			struct dentry *dentry, const char *symname)
+static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
+			     struct inode *dir, struct dentry *dentry,
+			     const char *symname)
 {
 	struct inode *inode;
 	int error = -ENOSPC;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 5f27ac593479..55a79df70d24 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -226,7 +226,8 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
 	return rc;
 }
 
-int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		  struct posix_acl *acl, int type)
 {
 	int rc, xprefix;
 
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 12d0271bdde3..62c50da9d493 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -28,7 +28,8 @@ struct jffs2_acl_header {
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 
 struct posix_acl *jffs2_get_acl(struct inode *inode, int type);
-int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		  struct posix_acl *acl, int type);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
 extern int jffs2_init_acl_post(struct inode *);
 
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 776493713153..c0aabbcbfd58 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -24,18 +24,21 @@
 
 static int jffs2_readdir (struct file *, struct dir_context *);
 
-static int jffs2_create (struct inode *,struct dentry *,umode_t,
-			 bool);
+static int jffs2_create (struct user_namespace *, struct inode *,
+		         struct dentry *, umode_t, bool);
 static struct dentry *jffs2_lookup (struct inode *,struct dentry *,
 				    unsigned int);
 static int jffs2_link (struct dentry *,struct inode *,struct dentry *);
 static int jffs2_unlink (struct inode *,struct dentry *);
-static int jffs2_symlink (struct inode *,struct dentry *,const char *);
-static int jffs2_mkdir (struct inode *,struct dentry *,umode_t);
+static int jffs2_symlink (struct user_namespace *, struct inode *,
+			  struct dentry *, const char *);
+static int jffs2_mkdir (struct user_namespace *, struct inode *,struct dentry *,
+			umode_t);
 static int jffs2_rmdir (struct inode *,struct dentry *);
-static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t);
-static int jffs2_rename (struct inode *, struct dentry *,
-			 struct inode *, struct dentry *,
+static int jffs2_mknod (struct user_namespace *, struct inode *,struct dentry *,
+			umode_t,dev_t);
+static int jffs2_rename (struct user_namespace *, struct inode *,
+			 struct dentry *, struct inode *, struct dentry *,
 			 unsigned int);
 
 const struct file_operations jffs2_dir_operations =
@@ -157,8 +160,8 @@ static int jffs2_readdir(struct file *file, struct dir_context *ctx)
 /***********************************************************************/
 
 
-static int jffs2_create(struct inode *dir_i, struct dentry *dentry,
-			umode_t mode, bool excl)
+static int jffs2_create(struct user_namespace *mnt_userns, struct inode *dir_i,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct jffs2_raw_inode *ri;
 	struct jffs2_inode_info *f, *dir_f;
@@ -276,7 +279,8 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
 
 /***********************************************************************/
 
-static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char *target)
+static int jffs2_symlink (struct user_namespace *mnt_userns, struct inode *dir_i,
+			  struct dentry *dentry, const char *target)
 {
 	struct jffs2_inode_info *f, *dir_f;
 	struct jffs2_sb_info *c;
@@ -438,7 +442,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 }
 
 
-static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode)
+static int jffs2_mkdir (struct user_namespace *mnt_userns, struct inode *dir_i,
+		        struct dentry *dentry, umode_t mode)
 {
 	struct jffs2_inode_info *f, *dir_f;
 	struct jffs2_sb_info *c;
@@ -609,7 +614,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
 	return ret;
 }
 
-static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev)
+static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i,
+		        struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct jffs2_inode_info *f, *dir_f;
 	struct jffs2_sb_info *c;
@@ -756,7 +762,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode
 	return ret;
 }
 
-static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
+static int jffs2_rename (struct user_namespace *mnt_userns,
+			 struct inode *old_dir_i, struct dentry *old_dentry,
 			 struct inode *new_dir_i, struct dentry *new_dentry,
 			 unsigned int flags)
 {
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index ee9f51bab4c6..2ac410477c4f 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -190,7 +190,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	return 0;
 }
 
-int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
+int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	int rc;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index ef1cfa61549e..173eccac691d 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -164,7 +164,7 @@ long jffs2_ioctl(struct file *, unsigned int, unsigned long);
 extern const struct inode_operations jffs2_symlink_inode_operations;
 
 /* fs.c */
-int jffs2_setattr (struct dentry *, struct iattr *);
+int jffs2_setattr (struct user_namespace *, struct dentry *, struct iattr *);
 int jffs2_do_setattr (struct inode *, struct iattr *);
 struct inode *jffs2_iget(struct super_block *, unsigned long);
 void jffs2_evict_inode (struct inode *);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index cf79a34bfada..43c285c3d2a7 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -91,7 +91,8 @@ out:
 	return rc;
 }
 
-int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		struct posix_acl *acl, int type)
 {
 	int rc;
 	tid_t tid;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 61c3b0c1fbf6..28b70e7c7dd4 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -85,7 +85,8 @@ static int jfs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
+int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	int rc;
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 9f8f92dd6f84..7ae389a7a366 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -8,7 +8,8 @@
 #ifdef CONFIG_JFS_POSIX_ACL
 
 struct posix_acl *jfs_get_acl(struct inode *inode, int type);
-int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		struct posix_acl *acl, int type);
 int jfs_init_acl(tid_t, struct inode *, struct inode *);
 
 #else
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 70a0d12e427e..01daa0cb0ae5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -26,7 +26,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
 	int fh_len, int fh_type);
 extern void jfs_set_inode_flags(struct inode *);
 extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
-extern int jfs_setattr(struct dentry *, struct iattr *);
+extern int jfs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 
 extern const struct address_space_operations jfs_aops;
 extern const struct inode_operations jfs_dir_inode_operations;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 7a55d14cc1af..9abed0d750e5 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -59,8 +59,8 @@ static inline void free_ea_wmap(struct inode *inode)
  * RETURN:	Errors from subroutines
  *
  */
-static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
-		bool excl)
+static int jfs_create(struct user_namespace *mnt_userns, struct inode *dip,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	int rc = 0;
 	tid_t tid;		/* transaction id */
@@ -192,7 +192,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
  * note:
  * EACCES: user needs search+write permission on the parent directory
  */
-static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
+static int jfs_mkdir(struct user_namespace *mnt_userns, struct inode *dip,
+		     struct dentry *dentry, umode_t mode)
 {
 	int rc = 0;
 	tid_t tid;		/* transaction id */
@@ -868,8 +869,8 @@ static int jfs_link(struct dentry *old_dentry,
  * an intermediate result whose length exceeds PATH_MAX [XPG4.2]
 */
 
-static int jfs_symlink(struct inode *dip, struct dentry *dentry,
-		const char *name)
+static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip,
+		       struct dentry *dentry, const char *name)
 {
 	int rc;
 	tid_t tid;
@@ -1058,9 +1059,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
  *
  * FUNCTION:	rename a file or directory
  */
-static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int jfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct btstack btstack;
 	ino_t ino;
@@ -1344,8 +1345,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
  *
  * FUNCTION:	Create a special file (device)
  */
-static int jfs_mknod(struct inode *dir, struct dentry *dentry,
-		umode_t mode, dev_t rdev)
+static int jfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct jfs_inode_info *jfs_ip;
 	struct btstack btstack;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 7a53eed69fef..7e0e62deab53 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1110,7 +1110,8 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
 	return ret;
 }
 
-static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
+static int kernfs_iop_mkdir(struct user_namespace *mnt_userns,
+			    struct inode *dir, struct dentry *dentry,
 			    umode_t mode)
 {
 	struct kernfs_node *parent = dir->i_private;
@@ -1147,7 +1148,8 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
 	return ret;
 }
 
-static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int kernfs_iop_rename(struct user_namespace *mnt_userns,
+			     struct inode *old_dir, struct dentry *old_dentry,
 			     struct inode *new_dir, struct dentry *new_dentry,
 			     unsigned int flags)
 {
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 032d3d7546d8..d73950fc3d57 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -112,7 +112,8 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 	return ret;
 }
 
-int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
+int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct kernfs_node *kn = inode->i_private;
@@ -183,7 +184,8 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
 		set_nlink(inode, kn->dir.subdirs + 2);
 }
 
-int kernfs_iop_getattr(const struct path *path, struct kstat *stat,
+int kernfs_iop_getattr(struct user_namespace *mnt_userns,
+		       const struct path *path, struct kstat *stat,
 		       u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
@@ -272,7 +274,8 @@ void kernfs_evict_inode(struct inode *inode)
 	kernfs_put(kn);
 }
 
-int kernfs_iop_permission(struct inode *inode, int mask)
+int kernfs_iop_permission(struct user_namespace *mnt_userns,
+			  struct inode *inode, int mask)
 {
 	struct kernfs_node *kn;
 
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 7ee97ef59184..ccc3b44f6306 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -89,9 +89,12 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
  */
 extern const struct xattr_handler *kernfs_xattr_handlers[];
 void kernfs_evict_inode(struct inode *inode);
-int kernfs_iop_permission(struct inode *inode, int mask);
-int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
-int kernfs_iop_getattr(const struct path *path, struct kstat *stat,
+int kernfs_iop_permission(struct user_namespace *mnt_userns,
+			  struct inode *inode, int mask);
+int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *iattr);
+int kernfs_iop_getattr(struct user_namespace *mnt_userns,
+		       const struct path *path, struct kstat *stat,
 		       u32 request_mask, unsigned int query_flags);
 ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
 int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
diff --git a/fs/libfs.c b/fs/libfs.c
index 508e9ea8e6f3..967aefda6ee3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -27,8 +27,9 @@
 
 #include "internal.h"
 
-int simple_getattr(const struct path *path, struct kstat *stat,
-		   u32 request_mask, unsigned int query_flags)
+int simple_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		   struct kstat *stat, u32 request_mask,
+		   unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	generic_fillattr(&init_user_ns, inode, stat);
@@ -447,9 +448,9 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
 }
 EXPORT_SYMBOL(simple_rmdir);
 
-int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
-		  struct inode *new_dir, struct dentry *new_dentry,
-		  unsigned int flags)
+int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		  struct dentry *old_dentry, struct inode *new_dir,
+		  struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *inode = d_inode(old_dentry);
 	int they_are_dirs = d_is_dir(old_dentry);
@@ -492,18 +493,19 @@ EXPORT_SYMBOL(simple_rename);
  * on simple regular filesystems.  Anything that needs to change on-disk
  * or wire state on size changes needs its own setattr method.
  */
-int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
 
-	error = setattr_prepare(&init_user_ns, dentry, iattr);
+	error = setattr_prepare(mnt_userns, dentry, iattr);
 	if (error)
 		return error;
 
 	if (iattr->ia_valid & ATTR_SIZE)
 		truncate_setsize(inode, iattr->ia_size);
-	setattr_copy(&init_user_ns, inode, iattr);
+	setattr_copy(mnt_userns, inode, iattr);
 	mark_inode_dirty(inode);
 	return 0;
 }
@@ -1300,7 +1302,8 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry,
 	return ERR_PTR(-ENOENT);
 }
 
-static int empty_dir_getattr(const struct path *path, struct kstat *stat,
+static int empty_dir_getattr(struct user_namespace *mnt_userns,
+			     const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
@@ -1308,7 +1311,8 @@ static int empty_dir_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
+static int empty_dir_setattr(struct user_namespace *mnt_userns,
+			     struct dentry *dentry, struct iattr *attr)
 {
 	return -EPERM;
 }
@@ -1318,14 +1322,9 @@ static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t siz
 	return -EOPNOTSUPP;
 }
 
-static int empty_dir_permission(struct inode *inode, int mask)
-{
-	return generic_permission(&init_user_ns, inode, mask);
-}
-
 static const struct inode_operations empty_dir_inode_operations = {
 	.lookup		= empty_dir_lookup,
-	.permission	= empty_dir_permission,
+	.permission	= generic_permission,
 	.setattr	= empty_dir_setattr,
 	.getattr	= empty_dir_getattr,
 	.listxattr	= empty_dir_listxattr,
diff --git a/fs/minix/file.c b/fs/minix/file.c
index f07acd268577..6a7bd2d9eec0 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -22,7 +22,8 @@ const struct file_operations minix_file_operations = {
 	.splice_read	= generic_file_splice_read,
 };
 
-static int minix_setattr(struct dentry *dentry, struct iattr *attr)
+static int minix_setattr(struct user_namespace *mnt_userns,
+			 struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 91c81d2fc90d..a532a99bbe81 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -652,8 +652,8 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return err;
 }
 
-int minix_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int flags)
+int minix_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct super_block *sb = path->dentry->d_sb;
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 168d45d3de73..202173368025 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -51,7 +51,8 @@ extern unsigned long minix_count_free_inodes(struct super_block *sb);
 extern int minix_new_block(struct inode * inode);
 extern void minix_free_block(struct inode *inode, unsigned long block);
 extern unsigned long minix_count_free_blocks(struct super_block *sb);
-extern int minix_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int minix_getattr(struct user_namespace *, const struct path *,
+			 struct kstat *, u32, unsigned int);
 extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
 
 extern void V1_minix_truncate(struct inode *);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 1a6084d2b02e..937fa5fae2b8 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -33,7 +33,8 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un
 	return d_splice_alias(inode, dentry);
 }
 
-static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	int error;
 	struct inode *inode;
@@ -51,7 +52,8 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode,
 	return error;
 }
 
-static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode)
 {
 	int error;
 	struct inode *inode = minix_new_inode(dir, mode, &error);
@@ -63,14 +65,14 @@ static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return error;
 }
 
-static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool excl)
+static int minix_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
-	return minix_mknod(dir, dentry, mode, 0);
+	return minix_mknod(mnt_userns, dir, dentry, mode, 0);
 }
 
-static int minix_symlink(struct inode * dir, struct dentry *dentry,
-	  const char * symname)
+static int minix_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	int err = -ENAMETOOLONG;
 	int i = strlen(symname)+1;
@@ -109,7 +111,8 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
 	return add_nondir(dentry, inode);
 }
 
-static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
+static int minix_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err;
@@ -181,8 +184,9 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
 	return err;
 }
 
-static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
-			struct inode * new_dir, struct dentry *new_dentry,
+static int minix_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
 	struct inode * old_inode = d_inode(old_dentry);
diff --git a/fs/namei.c b/fs/namei.c
index c8c083daf368..d9ceb75ac169 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -443,7 +443,7 @@ static inline int do_inode_permission(struct user_namespace *mnt_userns,
 {
 	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
 		if (likely(inode->i_op->permission))
-			return inode->i_op->permission(inode, mask);
+			return inode->i_op->permission(mnt_userns, inode, mask);
 
 		/* This gets set once for the inode lifetime */
 		spin_lock(&inode->i_lock);
@@ -2199,11 +2199,13 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 	/* At this point we know we have a real path component. */
 	for(;;) {
+		struct user_namespace *mnt_userns;
 		const char *link;
 		u64 hash_len;
 		int type;
 
-		err = may_lookup(&init_user_ns, nd);
+		mnt_userns = mnt_user_ns(nd->path.mnt);
+		err = may_lookup(mnt_userns, nd);
 		if (err)
 			return err;
 
@@ -2251,7 +2253,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 OK:
 			/* pathname or trailing symlink, done */
 			if (!depth) {
-				nd->dir_uid = i_uid_into_mnt(&init_user_ns, nd->inode);
+				nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
 				nd->dir_mode = nd->inode->i_mode;
 				nd->flags &= ~LOOKUP_PARENT;
 				return 0;
@@ -2904,7 +2906,7 @@ int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
 	error = security_inode_create(dir, dentry, mode);
 	if (error)
 		return error;
-	error = dir->i_op->create(dir, dentry, mode, want_excl);
+	error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl);
 	if (!error)
 		fsnotify_create(dir, dentry);
 	return error;
@@ -2995,7 +2997,7 @@ static int may_open(struct user_namespace *mnt_userns, const struct path *path,
 	return 0;
 }
 
-static int handle_truncate(struct file *filp)
+static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
 {
 	const struct path *path = &filp->f_path;
 	struct inode *inode = path->dentry->d_inode;
@@ -3009,7 +3011,7 @@ static int handle_truncate(struct file *filp)
 	if (!error)
 		error = security_path_truncate(path);
 	if (!error) {
-		error = do_truncate(&init_user_ns, path->dentry, 0,
+		error = do_truncate(mnt_userns, path->dentry, 0,
 				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
 				    filp);
 	}
@@ -3118,6 +3120,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 				  const struct open_flags *op,
 				  bool got_write)
 {
+	struct user_namespace *mnt_userns;
 	struct dentry *dir = nd->path.dentry;
 	struct inode *dir_inode = dir->d_inode;
 	int open_flag = op->open_flag;
@@ -3165,13 +3168,14 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	 */
 	if (unlikely(!got_write))
 		open_flag &= ~O_TRUNC;
+	mnt_userns = mnt_user_ns(nd->path.mnt);
 	if (open_flag & O_CREAT) {
 		if (open_flag & O_EXCL)
 			open_flag &= ~O_TRUNC;
 		if (!IS_POSIXACL(dir->d_inode))
 			mode &= ~current_umask();
 		if (likely(got_write))
-			create_error = may_o_create(&init_user_ns, &nd->path,
+			create_error = may_o_create(mnt_userns, &nd->path,
 						    dentry, mode);
 		else
 			create_error = -EROFS;
@@ -3207,8 +3211,9 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 			error = -EACCES;
 			goto out_dput;
 		}
-		error = dir_inode->i_op->create(dir_inode, dentry, mode,
-						open_flag & O_EXCL);
+
+		error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
+						mode, open_flag & O_EXCL);
 		if (error)
 			goto out_dput;
 	}
@@ -3316,6 +3321,7 @@ finish_lookup:
 static int do_open(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
+	struct user_namespace *mnt_userns;
 	int open_flag = op->open_flag;
 	bool do_truncate;
 	int acc_mode;
@@ -3328,12 +3334,13 @@ static int do_open(struct nameidata *nd,
 	}
 	if (!(file->f_mode & FMODE_CREATED))
 		audit_inode(nd->name, nd->path.dentry, 0);
+	mnt_userns = mnt_user_ns(nd->path.mnt);
 	if (open_flag & O_CREAT) {
 		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
 			return -EEXIST;
 		if (d_is_dir(nd->path.dentry))
 			return -EISDIR;
-		error = may_create_in_sticky(&init_user_ns, nd,
+		error = may_create_in_sticky(mnt_userns, nd,
 					     d_backing_inode(nd->path.dentry));
 		if (unlikely(error))
 			return error;
@@ -3353,13 +3360,13 @@ static int do_open(struct nameidata *nd,
 			return error;
 		do_truncate = true;
 	}
-	error = may_open(&init_user_ns, &nd->path, acc_mode, open_flag);
+	error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
 	if (!error && !(file->f_mode & FMODE_OPENED))
 		error = vfs_open(&nd->path, file);
 	if (!error)
 		error = ima_file_check(file, op->acc_mode);
 	if (!error && do_truncate)
-		error = handle_truncate(file);
+		error = handle_truncate(mnt_userns, file);
 	if (unlikely(error > 0)) {
 		WARN_ON(1);
 		error = -EINVAL;
@@ -3403,7 +3410,7 @@ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
 	child = d_alloc(dentry, &slash_name);
 	if (unlikely(!child))
 		goto out_err;
-	error = dir->i_op->tmpfile(dir, child, mode);
+	error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
 	if (error)
 		goto out_err;
 	error = -ENOENT;
@@ -3446,7 +3453,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	path.dentry = child;
 	audit_inode(nd->name, child, 0);
 	/* Don't check for other permissions, the inode was just created */
-	error = may_open(&init_user_ns, &path, 0, op->open_flag);
+	error = may_open(mnt_userns, &path, 0, op->open_flag);
 	if (error)
 		goto out2;
 	file->f_path.mnt = path.mnt;
@@ -3690,7 +3697,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
 	if (error)
 		return error;
 
-	error = dir->i_op->mknod(dir, dentry, mode, dev);
+	error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev);
 	if (!error)
 		fsnotify_create(dir, dentry);
 	return error;
@@ -3809,7 +3816,7 @@ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 	if (max_links && dir->i_nlink >= max_links)
 		return -EMLINK;
 
-	error = dir->i_op->mkdir(dir, dentry, mode);
+	error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode);
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
 	return error;
@@ -3834,7 +3841,8 @@ retry:
 	if (!error) {
 		struct user_namespace *mnt_userns;
 		mnt_userns = mnt_user_ns(path.mnt);
-		error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, mode);
+		error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
+				  mode);
 	}
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
@@ -4087,7 +4095,8 @@ retry_deleg:
 		if (error)
 			goto exit2;
 		mnt_userns = mnt_user_ns(path.mnt);
-		error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, &delegated_inode);
+		error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
+				   &delegated_inode);
 exit2:
 		dput(dentry);
 	}
@@ -4166,7 +4175,7 @@ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
 	if (error)
 		return error;
 
-	error = dir->i_op->symlink(dir, dentry, oldname);
+	error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname);
 	if (!error)
 		fsnotify_create(dir, dentry);
 	return error;
@@ -4357,13 +4366,13 @@ retry:
 	error = -EXDEV;
 	if (old_path.mnt != new_path.mnt)
 		goto out_dput;
-	error = may_linkat(&init_user_ns, &old_path);
+	mnt_userns = mnt_user_ns(new_path.mnt);
+	error = may_linkat(mnt_userns, &old_path);
 	if (unlikely(error))
 		goto out_dput;
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
 		goto out_dput;
-	mnt_userns = mnt_user_ns(new_path.mnt);
 	error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
 			 new_dentry, &delegated_inode);
 out_dput:
@@ -4542,8 +4551,8 @@ int vfs_rename(struct renamedata *rd)
 		if (error)
 			goto out;
 	}
-	error = old_dir->i_op->rename(old_dir, old_dentry,
-				       new_dir, new_dentry, flags);
+	error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
+				      new_dir, new_dentry, flags);
 	if (error)
 		goto out;
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 727e01a84503..19a9f434442f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2095,8 +2095,8 @@ EXPORT_SYMBOL_GPL(nfs_instantiate);
  * that the operation succeeded on the server, but an error in the
  * reply path made it appear to have failed.
  */
-int nfs_create(struct inode *dir, struct dentry *dentry,
-		umode_t mode, bool excl)
+int nfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+	       struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct iattr attr;
 	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
@@ -2124,7 +2124,8 @@ EXPORT_SYMBOL_GPL(nfs_create);
  * See comments for nfs_proc_create regarding failed operations.
  */
 int
-nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+nfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	  struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct iattr attr;
 	int status;
@@ -2150,7 +2151,8 @@ EXPORT_SYMBOL_GPL(nfs_mknod);
 /*
  * See comments for nfs_proc_create regarding failed operations.
  */
-int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+int nfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+	      struct dentry *dentry, umode_t mode)
 {
 	struct iattr attr;
 	int error;
@@ -2295,7 +2297,8 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
  * now have a new file handle and can instantiate an in-core NFS inode
  * and move the raw page into its mapping.
  */
-int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+int nfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		struct dentry *dentry, const char *symname)
 {
 	struct page *page;
 	char *kaddr;
@@ -2398,9 +2401,9 @@ EXPORT_SYMBOL_GPL(nfs_link);
  * If these conditions are met, we can drop the dentries before doing
  * the rename.
  */
-int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-	       struct inode *new_dir, struct dentry *new_dentry,
-	       unsigned int flags)
+int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+	       struct dentry *old_dentry, struct inode *new_dir,
+	       struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *old_inode = d_inode(old_dentry);
 	struct inode *new_inode = d_inode(new_dentry);
@@ -2939,7 +2942,9 @@ static int nfs_execute_ok(struct inode *inode, int mask)
 	return ret;
 }
 
-int nfs_permission(struct inode *inode, int mask)
+int nfs_permission(struct user_namespace *mnt_userns,
+		   struct inode *inode,
+		   int mask)
 {
 	const struct cred *cred = current_cred();
 	int res = 0;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index cab123ec1664..447e95974386 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -594,7 +594,8 @@ EXPORT_SYMBOL_GPL(nfs_fhget);
 #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)
 
 int
-nfs_setattr(struct dentry *dentry, struct iattr *attr)
+nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+	    struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct nfs_fattr *fattr;
@@ -787,8 +788,8 @@ static bool nfs_need_revalidate_inode(struct inode *inode)
 	return false;
 }
 
-int nfs_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int query_flags)
+int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct nfs_server *server = NFS_SERVER(inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 62d3189745cd..25fb43b69e5a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -378,14 +378,18 @@ extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
 extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
 					   struct shrink_control *sc);
 struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
-int nfs_create(struct inode *, struct dentry *, umode_t, bool);
-int nfs_mkdir(struct inode *, struct dentry *, umode_t);
+int nfs_create(struct user_namespace *, struct inode *, struct dentry *,
+	       umode_t, bool);
+int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry *,
+	      umode_t);
 int nfs_rmdir(struct inode *, struct dentry *);
 int nfs_unlink(struct inode *, struct dentry *);
-int nfs_symlink(struct inode *, struct dentry *, const char *);
+int nfs_symlink(struct user_namespace *, struct inode *, struct dentry *,
+		const char *);
 int nfs_link(struct dentry *, struct inode *, struct dentry *);
-int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
-int nfs_rename(struct inode *, struct dentry *,
+int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t,
+	      dev_t);
+int nfs_rename(struct user_namespace *, struct inode *, struct dentry *,
 	       struct inode *, struct dentry *, unsigned int);
 
 /* file.c */
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 55fc711e368b..93e60e921f92 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -208,20 +208,23 @@ out_fc:
 }
 
 static int
-nfs_namespace_getattr(const struct path *path, struct kstat *stat,
-			u32 request_mask, unsigned int query_flags)
+nfs_namespace_getattr(struct user_namespace *mnt_userns,
+		      const struct path *path, struct kstat *stat,
+		      u32 request_mask, unsigned int query_flags)
 {
 	if (NFS_FH(d_inode(path->dentry))->size != 0)
-		return nfs_getattr(path, stat, request_mask, query_flags);
+		return nfs_getattr(mnt_userns, path, stat, request_mask,
+				   query_flags);
 	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
 	return 0;
 }
 
 static int
-nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr)
+nfs_namespace_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		      struct iattr *attr)
 {
 	if (NFS_FH(d_inode(dentry))->size != 0)
-		return nfs_setattr(dentry, attr);
+		return nfs_setattr(mnt_userns, dentry, attr);
 	return -EACCES;
 }
 
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 1b950b66b3bb..c8a192802dda 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -12,7 +12,8 @@
  */
 #ifdef CONFIG_NFS_V3_ACL
 extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
-extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+			struct posix_acl *acl, int type);
 extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 		struct posix_acl *dfacl);
 extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index c6c863382f37..5604e807fc01 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -251,7 +251,8 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 
 }
 
-int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	struct posix_acl *orig = acl, *dfacl = NULL, *alloc;
 	int status;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8aad3c48092a..2e8eb263cf0f 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -805,7 +805,8 @@ void nilfs_evict_inode(struct inode *inode)
 	 */
 }
 
-int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
+int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *iattr)
 {
 	struct nilfs_transaction_info ti;
 	struct inode *inode = d_inode(dentry);
@@ -843,7 +844,8 @@ out_err:
 	return err;
 }
 
-int nilfs_permission(struct inode *inode, int mask)
+int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		     int mask)
 {
 	struct nilfs_root *root = NILFS_I(inode)->i_root;
 
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index a6ec7961d4f5..ecace5f96a95 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -72,8 +72,8 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
  * If the create succeeds, we fill in the inode information
  * with d_instantiate().
  */
-static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			bool excl)
+static int nilfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode;
 	struct nilfs_transaction_info ti;
@@ -100,7 +100,8 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 }
 
 static int
-nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+nilfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	    struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode *inode;
 	struct nilfs_transaction_info ti;
@@ -124,8 +125,8 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 	return err;
 }
 
-static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
-			 const char *symname)
+static int nilfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	struct nilfs_transaction_info ti;
 	struct super_block *sb = dir->i_sb;
@@ -201,7 +202,8 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
 	return err;
 }
 
-static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int nilfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	struct nilfs_transaction_info ti;
@@ -338,8 +340,9 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
 	return err;
 }
 
-static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-			struct inode *new_dir,	struct dentry *new_dentry,
+static int nilfs_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
 	struct inode *old_inode = d_inode(old_dentry);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f8450ee3fd06..c4a45a081ade 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -267,9 +267,11 @@ extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
 extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
-extern int nilfs_setattr(struct dentry *, struct iattr *);
+extern int nilfs_setattr(struct user_namespace *, struct dentry *,
+			 struct iattr *);
 extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
-int nilfs_permission(struct inode *inode, int mask);
+int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		     int mask);
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
 extern int nilfs_inode_dirty(struct inode *);
 int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 38f4cf1d4497..4435dbbc0b63 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2848,6 +2848,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
 
 /**
  * ntfs_setattr - called from notify_change() when an attribute is being changed
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dentry:	dentry whose attributes to change
  * @attr:	structure describing the attributes and the changes
  *
@@ -2860,7 +2861,8 @@ void ntfs_truncate_vfs(struct inode *vi) {
  *
  * Called with ->i_mutex held.
  */
-int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
+int ntfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	struct inode *vi = d_inode(dentry);
 	int err;
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 363e4e820673..6f78ee00f57f 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -289,7 +289,8 @@ extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
 extern int ntfs_truncate(struct inode *vi);
 extern void ntfs_truncate_vfs(struct inode *vi);
 
-extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ntfs_setattr(struct user_namespace *mnt_userns,
+			struct dentry *dentry, struct iattr *attr);
 
 extern int __ntfs_write_inode(struct inode *vi, int sync);
 
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 990756cee4bd..5259badabb56 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -262,7 +262,8 @@ static int ocfs2_set_acl(handle_t *handle,
 	return ret;
 }
 
-int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		      struct posix_acl *acl, int type)
 {
 	struct buffer_head *bh = NULL;
 	int status, had_lock;
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 127b13432146..4e86450917b2 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -19,7 +19,8 @@ struct ocfs2_acl_entry {
 };
 
 struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
-int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		      struct posix_acl *acl, int type);
 extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
 extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
 			  struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 9fa66cd1f622..b2870f1a31df 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -190,7 +190,8 @@ static int dlmfs_file_release(struct inode *inode,
  * We do ->setattr() just to override size changes.  Our size is the size
  * of the LVB and nothing else.
  */
-static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
+static int dlmfs_file_setattr(struct user_namespace *mnt_userns,
+			      struct dentry *dentry, struct iattr *attr)
 {
 	int error;
 	struct inode *inode = d_inode(dentry);
@@ -395,7 +396,8 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
  * File creation. Allocate an inode, and we're done..
  */
 /* SMP-safe */
-static int dlmfs_mkdir(struct inode * dir,
+static int dlmfs_mkdir(struct user_namespace * mnt_userns,
+		       struct inode * dir,
 		       struct dentry * dentry,
 		       umode_t mode)
 {
@@ -443,7 +445,8 @@ bail:
 	return status;
 }
 
-static int dlmfs_create(struct inode *dir,
+static int dlmfs_create(struct user_namespace *mnt_userns,
+			struct inode *dir,
 			struct dentry *dentry,
 			umode_t mode,
 			bool excl)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a070d4c9b6ed..e3039d973acd 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1112,7 +1112,8 @@ out:
 	return ret;
 }
 
-int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
+int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr)
 {
 	int status = 0, size_change;
 	int inode_locked = 0;
@@ -1298,8 +1299,8 @@ bail:
 	return status;
 }
 
-int ocfs2_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int flags)
+int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct super_block *sb = path->dentry->d_sb;
@@ -1330,7 +1331,8 @@ bail:
 	return err;
 }
 
-int ocfs2_permission(struct inode *inode, int mask)
+int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		     int mask)
 {
 	int ret, had_lock;
 	struct ocfs2_lock_holder oh;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 4832cbceba5b..8536cec5f122 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,10 +51,13 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
 			  u64 new_i_size, u64 zero_to);
 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 		      loff_t zero_to);
-int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
-int ocfs2_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int flags);
-int ocfs2_permission(struct inode *inode, int mask);
+int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr);
+int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask, unsigned int flags);
+int ocfs2_permission(struct user_namespace *mnt_userns,
+		     struct inode *inode,
+		     int mask);
 
 int ocfs2_should_update_atime(struct inode *inode,
 			      struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 908b79e1082b..3abdd36da2e2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -221,7 +221,8 @@ static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
 	iput(inode);
 }
 
-static int ocfs2_mknod(struct inode *dir,
+static int ocfs2_mknod(struct user_namespace *mnt_userns,
+		       struct inode *dir,
 		       struct dentry *dentry,
 		       umode_t mode,
 		       dev_t dev)
@@ -645,7 +646,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	return status;
 }
 
-static int ocfs2_mkdir(struct inode *dir,
+static int ocfs2_mkdir(struct user_namespace *mnt_userns,
+		       struct inode *dir,
 		       struct dentry *dentry,
 		       umode_t mode)
 {
@@ -653,14 +655,15 @@ static int ocfs2_mkdir(struct inode *dir,
 
 	trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
 			  OCFS2_I(dir)->ip_blkno, mode);
-	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
+	ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0);
 	if (ret)
 		mlog_errno(ret);
 
 	return ret;
 }
 
-static int ocfs2_create(struct inode *dir,
+static int ocfs2_create(struct user_namespace *mnt_userns,
+			struct inode *dir,
 			struct dentry *dentry,
 			umode_t mode,
 			bool excl)
@@ -669,7 +672,7 @@ static int ocfs2_create(struct inode *dir,
 
 	trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
 			   (unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
-	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
+	ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
 	if (ret)
 		mlog_errno(ret);
 
@@ -1195,7 +1198,8 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
 		ocfs2_inode_unlock(inode2, 1);
 }
 
-static int ocfs2_rename(struct inode *old_dir,
+static int ocfs2_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir,
 			struct dentry *old_dentry,
 			struct inode *new_dir,
 			struct dentry *new_dentry,
@@ -1784,7 +1788,8 @@ bail:
 	return status;
 }
 
-static int ocfs2_symlink(struct inode *dir,
+static int ocfs2_symlink(struct user_namespace *mnt_userns,
+			 struct inode *dir,
 			 struct dentry *dentry,
 			 const char *symname)
 {
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index a0f45651f3b7..c219f91f44e9 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -279,13 +279,14 @@ out_free_inode:
 	return err;
 }
 
-static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int omfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	return omfs_add_node(dir, dentry, mode | S_IFDIR);
 }
 
-static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool excl)
+static int omfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
 	return omfs_add_node(dir, dentry, mode | S_IFREG);
 }
@@ -369,9 +370,9 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
 	return true;
 }
 
-static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry,
-		       unsigned int flags)
+static int omfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *new_inode = d_inode(new_dentry);
 	struct inode *old_inode = d_inode(old_dentry);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 729339cd7902..11e733aab25d 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -343,7 +343,8 @@ const struct file_operations omfs_file_operations = {
 	.splice_read = generic_file_splice_read,
 };
 
-static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int omfs_setattr(struct user_namespace *mnt_userns,
+			struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 628921952d16..18852b9ed82b 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -116,7 +116,8 @@ out:
 	return error;
 }
 
-int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		     struct posix_acl *acl, int type)
 {
 	int error;
 	struct iattr iattr;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index b94032f77e61..5079cfafa8d7 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -871,7 +871,8 @@ out:
 /*
  * Change attributes of an object referenced by dentry.
  */
-int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
+int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		     struct iattr *iattr)
 {
 	int ret;
 	gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n",
@@ -890,8 +891,8 @@ out:
 /*
  * Obtain attributes of an object given a dentry
  */
-int orangefs_getattr(const struct path *path, struct kstat *stat,
-		     u32 request_mask, unsigned int flags)
+int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		     struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	int ret;
 	struct inode *inode = path->dentry->d_inode;
@@ -919,7 +920,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
 	return ret;
 }
 
-int orangefs_permission(struct inode *inode, int mask)
+int orangefs_permission(struct user_namespace *mnt_userns,
+			struct inode *inode, int mask)
 {
 	int ret;
 
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 3e7cf3d0a494..600e8eee541f 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -15,7 +15,8 @@
 /*
  * Get a newly allocated inode to go with a negative dentry.
  */
-static int orangefs_create(struct inode *dir,
+static int orangefs_create(struct user_namespace *mnt_userns,
+			struct inode *dir,
 			struct dentry *dentry,
 			umode_t mode,
 			bool exclusive)
@@ -215,7 +216,8 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
 	return ret;
 }
 
-static int orangefs_symlink(struct inode *dir,
+static int orangefs_symlink(struct user_namespace *mnt_userns,
+		         struct inode *dir,
 			 struct dentry *dentry,
 			 const char *symname)
 {
@@ -303,7 +305,8 @@ out:
 	return ret;
 }
 
-static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int orangefs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode)
 {
 	struct orangefs_inode_s *parent = ORANGEFS_I(dir);
 	struct orangefs_kernel_op_s *new_op;
@@ -372,7 +375,8 @@ out:
 	return ret;
 }
 
-static int orangefs_rename(struct inode *old_dir,
+static int orangefs_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir,
 			struct dentry *old_dentry,
 			struct inode *new_dir,
 			struct dentry *new_dentry,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index e12aeb9623d6..0e6b97682e41 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -107,7 +107,9 @@ extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
 extern const struct xattr_handler *orangefs_xattr_handlers[];
 
 extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type);
-extern int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int orangefs_set_acl(struct user_namespace *mnt_userns,
+			    struct inode *inode, struct posix_acl *acl,
+			    int type);
 
 /*
  * orangefs data structures
@@ -359,12 +361,13 @@ struct inode *orangefs_new_inode(struct super_block *sb,
 			      struct orangefs_object_kref *ref);
 
 int __orangefs_setattr(struct inode *, struct iattr *);
-int orangefs_setattr(struct dentry *, struct iattr *);
+int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 
-int orangefs_getattr(const struct path *path, struct kstat *stat,
-		     u32 request_mask, unsigned int flags);
+int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		     struct kstat *stat, u32 request_mask, unsigned int flags);
 
-int orangefs_permission(struct inode *inode, int mask);
+int orangefs_permission(struct user_namespace *mnt_userns,
+			struct inode *inode, int mask);
 
 int orangefs_update_time(struct inode *, struct timespec64 *, int);
 
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 6904cc2ed7bb..8b3be7342a8c 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -650,19 +650,20 @@ out:
 	return err;
 }
 
-static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
 }
 
-static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode)
 {
 	return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
 }
 
-static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-		     dev_t rdev)
+static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	/* Don't allow creation of "whiteout" on overlay */
 	if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
@@ -671,8 +672,8 @@ static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return ovl_create_object(dentry, mode, rdev, NULL);
 }
 
-static int ovl_symlink(struct inode *dir, struct dentry *dentry,
-		       const char *link)
+static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, const char *link)
 {
 	return ovl_create_object(dentry, S_IFLNK, 0, link);
 }
@@ -1069,9 +1070,9 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir)
 	return err;
 }
 
-static int ovl_rename(struct inode *olddir, struct dentry *old,
-		      struct inode *newdir, struct dentry *new,
-		      unsigned int flags)
+static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
+		      struct dentry *old, struct inode *newdir,
+		      struct dentry *new, unsigned int flags)
 {
 	int err;
 	struct dentry *old_upperdir;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 023fde466e3a..e78d45dfeaee 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -14,7 +14,8 @@
 #include "overlayfs.h"
 
 
-int ovl_setattr(struct dentry *dentry, struct iattr *attr)
+int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *attr)
 {
 	int err;
 	bool full_copy_up = false;
@@ -154,8 +155,8 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
 	return 0;
 }
 
-int ovl_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int flags)
+int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
 	enum ovl_path_type type;
@@ -277,7 +278,8 @@ out:
 	return err;
 }
 
-int ovl_permission(struct inode *inode, int mask)
+int ovl_permission(struct user_namespace *mnt_userns,
+		   struct inode *inode, int mask)
 {
 	struct inode *upperinode = ovl_inode_upper(inode);
 	struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 5e9eb46e741a..78b9d93a33c9 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -444,10 +444,12 @@ int ovl_set_nlink_lower(struct dentry *dentry);
 unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
 			   struct dentry *upperdentry,
 			   unsigned int fallback);
-int ovl_setattr(struct dentry *dentry, struct iattr *attr);
-int ovl_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int flags);
-int ovl_permission(struct inode *inode, int mask);
+int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *attr);
+int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int flags);
+int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		   int mask);
 int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
 		  const void *value, size_t size, int flags);
 int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 8168ab2dda11..c04612b19054 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1023,7 +1023,7 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
 	    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
 		struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
 
-		err = ovl_setattr(dentry, &iattr);
+		err = ovl_setattr(&init_user_ns, dentry, &iattr);
 		if (err)
 			return err;
 	}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index d31b60f5d40d..f3309a7edb49 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -593,7 +593,7 @@ int
 	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
 	if (ret)
 		return ret;
-	ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
+	ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS);
 	posix_acl_release(acl);
 	return ret;
 }
@@ -918,7 +918,7 @@ set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
 		if (ret)
 			return ret;
 	}
-	return inode->i_op->set_acl(inode, acl, type);
+	return inode->i_op->set_acl(mnt_userns, inode, acl, type);
 }
 EXPORT_SYMBOL(set_posix_acl);
 
@@ -966,12 +966,13 @@ const struct xattr_handler posix_acl_default_xattr_handler = {
 };
 EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
 
-int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		   struct posix_acl *acl, int type)
 {
 	int error;
 
 	if (type == ACL_TYPE_ACCESS) {
-		error = posix_acl_update_mode(&init_user_ns, inode,
+		error = posix_acl_update_mode(mnt_userns, inode,
 				&inode->i_mode, &acl);
 		if (error)
 			return error;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d45aa68c1f17..56bf14316122 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -685,7 +685,8 @@ static int proc_fd_access_allowed(struct inode *inode)
 	return allowed;
 }
 
-int proc_setattr(struct dentry *dentry, struct iattr *attr)
+int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		 struct iattr *attr)
 {
 	int error;
 	struct inode *inode = d_inode(dentry);
@@ -726,7 +727,8 @@ static bool has_pid_permissions(struct proc_fs_info *fs_info,
 }
 
 
-static int proc_pid_permission(struct inode *inode, int mask)
+static int proc_pid_permission(struct user_namespace *mnt_userns,
+			       struct inode *inode, int mask)
 {
 	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
 	struct task_struct *task;
@@ -1927,8 +1929,8 @@ out_unlock:
 	return NULL;
 }
 
-int pid_getattr(const struct path *path, struct kstat *stat,
-		u32 request_mask, unsigned int query_flags)
+int pid_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
@@ -3473,7 +3475,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
  * This function makes sure that the node is always accessible for members of
  * same thread group.
  */
-static int proc_tid_comm_permission(struct inode *inode, int mask)
+static int proc_tid_comm_permission(struct user_namespace *mnt_userns,
+				    struct inode *inode, int mask)
 {
 	bool is_same_tgroup;
 	struct task_struct *task;
@@ -3798,7 +3801,8 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 	return 0;
 }
 
-static int proc_task_getattr(const struct path *path, struct kstat *stat,
+static int proc_task_getattr(struct user_namespace *mnt_userns,
+			     const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d6e76461e135..07fc4fad2602 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -276,7 +276,8 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
-int proc_fd_permission(struct inode *inode, int mask)
+int proc_fd_permission(struct user_namespace *mnt_userns,
+		       struct inode *inode, int mask)
 {
 	struct task_struct *p;
 	int rv;
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
index f371a602bf58..c5a921a06a0b 100644
--- a/fs/proc/fd.h
+++ b/fs/proc/fd.h
@@ -10,7 +10,8 @@ extern const struct inode_operations proc_fd_inode_operations;
 extern const struct file_operations proc_fdinfo_operations;
 extern const struct inode_operations proc_fdinfo_inode_operations;
 
-extern int proc_fd_permission(struct inode *inode, int mask);
+extern int proc_fd_permission(struct user_namespace *mnt_userns,
+			      struct inode *inode, int mask);
 
 static inline unsigned int proc_fd(struct inode *inode)
 {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 0db96a761149..bc86aa87cc41 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -115,7 +115,8 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
 	return true;
 }
 
-static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
+static int proc_notify_change(struct user_namespace *mnt_userns,
+			      struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct proc_dir_entry *de = PDE(inode);
@@ -133,7 +134,8 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	return 0;
 }
 
-static int proc_getattr(const struct path *path, struct kstat *stat,
+static int proc_getattr(struct user_namespace *mnt_userns,
+			const struct path *path, struct kstat *stat,
 			u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f60b379dcdc7..03415f3fb3a8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -162,8 +162,10 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
  * base.c
  */
 extern const struct dentry_operations pid_dentry_operations;
-extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
-extern int proc_setattr(struct dentry *, struct iattr *);
+extern int pid_getattr(struct user_namespace *, const struct path *,
+		       struct kstat *, u32, unsigned int);
+extern int proc_setattr(struct user_namespace *, struct dentry *,
+			struct iattr *);
 extern void proc_pid_evict_inode(struct proc_inode *);
 extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
 extern void pid_update_inode(struct task_struct *, struct inode *);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4aef49ccf571..15c2e55d2ed2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -289,7 +289,8 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir,
 	return de;
 }
 
-static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat,
+static int proc_tgid_net_getattr(struct user_namespace *mnt_userns,
+				 const struct path *path, struct kstat *stat,
 				 u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 87c828348140..2daac06727d0 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -785,7 +785,8 @@ out:
 	return 0;
 }
 
-static int proc_sys_permission(struct inode *inode, int mask)
+static int proc_sys_permission(struct user_namespace *mnt_userns,
+			       struct inode *inode, int mask)
 {
 	/*
 	 * sysctl entries that are not writeable,
@@ -813,7 +814,8 @@ static int proc_sys_permission(struct inode *inode, int mask)
 	return error;
 }
 
-static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
+static int proc_sys_setattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
@@ -830,7 +832,8 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
-static int proc_sys_getattr(const struct path *path, struct kstat *stat,
+static int proc_sys_getattr(struct user_namespace *mnt_userns,
+			    const struct path *path, struct kstat *stat,
 			    u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 244e4b6f15ef..c7e3b1350ef8 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -308,7 +308,8 @@ void __init proc_root_init(void)
 	register_filesystem(&proc_fs_type);
 }
 
-static int proc_root_getattr(const struct path *path, struct kstat *stat,
+static int proc_root_getattr(struct user_namespace *mnt_userns,
+			     const struct path *path, struct kstat *stat,
 			     u32 request_mask, unsigned int query_flags)
 {
 	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index f0358fe410d3..ba3525ccc27e 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -22,7 +22,7 @@
 #include <linux/uaccess.h>
 #include "internal.h"
 
-static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
+static int ramfs_nommu_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 						   unsigned long addr,
 						   unsigned long len,
@@ -158,7 +158,8 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
  * handle a change of attributes
  * - we're specifically interested in a change of size
  */
-static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
+static int ramfs_nommu_setattr(struct user_namespace *mnt_userns,
+			       struct dentry *dentry, struct iattr *ia)
 {
 	struct inode *inode = d_inode(dentry);
 	unsigned int old_ia_valid = ia->ia_valid;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 3fd4326f36b5..3c2658c8fde0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -101,7 +101,8 @@ struct inode *ramfs_get_inode(struct super_block *sb,
  */
 /* SMP-safe */
 static int
-ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ramfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	    struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
 	int error = -ENOSPC;
@@ -115,20 +116,23 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 	return error;
 }
 
-static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
+static int ramfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
-	int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+	int retval = ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0);
 	if (!retval)
 		inc_nlink(dir);
 	return retval;
 }
 
-static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+static int ramfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
-	return ramfs_mknod(dir, dentry, mode | S_IFREG, 0);
+	return ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
 }
 
-static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
+static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	struct inode *inode;
 	int error = -ENOSPC;
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
index 0c1c847f992f..fd58618da360 100644
--- a/fs/reiserfs/acl.h
+++ b/fs/reiserfs/acl.h
@@ -49,7 +49,8 @@ static inline int reiserfs_acl_count(size_t size)
 
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
 struct posix_acl *reiserfs_get_acl(struct inode *inode, int type);
-int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		     struct posix_acl *acl, int type);
 int reiserfs_acl_chmod(struct inode *inode);
 int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
 				 struct inode *dir, struct dentry *dentry,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 944f2b487cf8..780bb90c1804 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3282,7 +3282,8 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	return ret;
 }
 
-int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
+int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		     struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	unsigned int ia_valid;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index a67a7d371725..e6eb05e2b2f1 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -619,8 +619,8 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
 	return dquot_initialize(inode);
 }
 
-static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			   bool excl)
+static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			   struct dentry *dentry, umode_t mode, bool excl)
 {
 	int retval;
 	struct inode *inode;
@@ -698,8 +698,8 @@ out_failed:
 	return retval;
 }
 
-static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-			  dev_t rdev)
+static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	int retval;
 	struct inode *inode;
@@ -781,7 +781,8 @@ out_failed:
 	return retval;
 }
 
-static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+			  struct dentry *dentry, umode_t mode)
 {
 	int retval;
 	struct inode *inode;
@@ -1094,8 +1095,9 @@ out_unlink:
 	return retval;
 }
 
-static int reiserfs_symlink(struct inode *parent_dir,
-			    struct dentry *dentry, const char *symname)
+static int reiserfs_symlink(struct user_namespace *mnt_userns,
+			    struct inode *parent_dir, struct dentry *dentry,
+			    const char *symname)
 {
 	int retval;
 	struct inode *inode;
@@ -1304,7 +1306,8 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
  * one path. If it holds 2 or more, it can get into endless waiting in
  * get_empty_nodes or its clones
  */
-static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int reiserfs_rename(struct user_namespace *mnt_userns,
+			   struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry,
 			   unsigned int flags)
 {
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index f69871516167..0ca2ac62e534 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -3102,7 +3102,8 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
 }
 
 void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
-int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
+int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		     struct iattr *attr);
 
 int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ec440d1957a1..bd073836e141 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -66,14 +66,14 @@
 static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 {
 	BUG_ON(!inode_is_locked(dir));
-	return dir->i_op->create(dir, dentry, mode, true);
+	return dir->i_op->create(&init_user_ns, dir, dentry, mode, true);
 }
 #endif
 
 static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	BUG_ON(!inode_is_locked(dir));
-	return dir->i_op->mkdir(dir, dentry, mode);
+	return dir->i_op->mkdir(&init_user_ns, dir, dentry, mode);
 }
 
 /*
@@ -352,7 +352,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data)
 	 * ATTR_MODE is set.
 	 */
 	attrs->ia_valid &= (ATTR_UID|ATTR_GID);
-	err = reiserfs_setattr(dentry, attrs);
+	err = reiserfs_setattr(&init_user_ns, dentry, attrs);
 	attrs->ia_valid = ia_valid;
 
 	return err;
@@ -604,7 +604,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 		inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
 		inode_dio_wait(d_inode(dentry));
 
-		err = reiserfs_setattr(dentry, &newattrs);
+		err = reiserfs_setattr(&init_user_ns, dentry, &newattrs);
 		inode_unlock(d_inode(dentry));
 	} else
 		update_ctime(inode);
@@ -948,7 +948,8 @@ static int xattr_mount_check(struct super_block *s)
 	return 0;
 }
 
-int reiserfs_permission(struct inode *inode, int mask)
+int reiserfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+			int mask)
 {
 	/*
 	 * We don't do permission checks on the internal objects.
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index c764352447ba..9b3b06da568c 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -16,7 +16,8 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
 int reiserfs_lookup_privroot(struct super_block *sb);
 int reiserfs_delete_xattrs(struct inode *inode);
 int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
-int reiserfs_permission(struct inode *inode, int mask);
+int reiserfs_permission(struct user_namespace *mnt_userns,
+			struct inode *inode, int mask);
 
 #ifdef CONFIG_REISERFS_FS_XATTR
 #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 4bf976bc7bad..a9547144a099 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -18,7 +18,8 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
 
 
 int
-reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type)
 {
 	int error, error2;
 	struct reiserfs_transaction_handle th;
diff --git a/fs/stat.c b/fs/stat.c
index 2c471c2fd766..fbc171d038aa 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,6 +75,7 @@ EXPORT_SYMBOL(generic_fillattr);
 int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
 		      u32 request_mask, unsigned int query_flags)
 {
+	struct user_namespace *mnt_userns;
 	struct inode *inode = d_backing_inode(path->dentry);
 
 	memset(stat, 0, sizeof(*stat));
@@ -91,11 +92,12 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
 	if (IS_DAX(inode))
 		stat->attributes |= STATX_ATTR_DAX;
 
+	mnt_userns = mnt_user_ns(path->mnt);
 	if (inode->i_op->getattr)
-		return inode->i_op->getattr(path, stat, request_mask,
-					    query_flags);
+		return inode->i_op->getattr(mnt_userns, path, stat,
+					    request_mask, query_flags);
 
-	generic_fillattr(mnt_user_ns(path->mnt), inode, stat);
+	generic_fillattr(mnt_userns, inode, stat);
 	return 0;
 }
 EXPORT_SYMBOL(vfs_getattr_nosec);
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index ca7e216b7b9e..90e00124ea07 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -29,7 +29,8 @@ const struct file_operations sysv_file_operations = {
 	.splice_read	= generic_file_splice_read,
 };
 
-static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
+static int sysv_setattr(struct user_namespace *mnt_userns,
+			struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 83cffab6955f..8b2e99b7bc9f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -441,8 +441,8 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
 	return blocks;
 }
 
-int sysv_getattr(const struct path *path, struct kstat *stat,
-		 u32 request_mask, unsigned int flags)
+int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		 struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	struct super_block *s = path->dentry->d_sb;
 	generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index ea2414b385ec..b2e6abc06a2d 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -41,7 +41,8 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un
 	return d_splice_alias(inode, dentry);
 }
 
-static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev)
+static int sysv_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode * inode;
 	int err;
@@ -60,13 +61,14 @@ static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode,
 	return err;
 }
 
-static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, bool excl)
+static int sysv_create(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, bool excl)
 {
-	return sysv_mknod(dir, dentry, mode, 0);
+	return sysv_mknod(&init_user_ns, dir, dentry, mode, 0);
 }
 
-static int sysv_symlink(struct inode * dir, struct dentry * dentry, 
-	const char * symname)
+static int sysv_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	int err = -ENAMETOOLONG;
 	int l = strlen(symname)+1;
@@ -108,7 +110,8 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
 	return add_nondir(dentry, inode);
 }
 
-static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
+static int sysv_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err;
@@ -186,9 +189,9 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
  * Anybody can rename anything with this: the permission checks are left to the
  * higher-level routines.
  */
-static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
-		       struct inode * new_dir, struct dentry * new_dentry,
-		       unsigned int flags)
+static int sysv_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		       struct dentry *old_dentry, struct inode *new_dir,
+		       struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode * old_inode = d_inode(old_dentry);
 	struct inode * new_inode = d_inode(new_dentry);
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 1cff585526b1..99ddf033da4f 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -141,7 +141,8 @@ extern struct inode *sysv_iget(struct super_block *, unsigned int);
 extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
 extern int sysv_sync_inode(struct inode *);
 extern void sysv_set_inode(struct inode *, dev_t);
-extern int sysv_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int sysv_getattr(struct user_namespace *, const struct path *,
+			struct kstat *, u32, unsigned int);
 extern int sysv_init_icache(void);
 extern void sysv_destroy_icache(void);
 
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 0ee8c6dfb036..4b83cbded559 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -67,7 +67,9 @@ static char *get_dname(struct dentry *dentry)
 	return name;
 }
 
-static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode)
+static int tracefs_syscall_mkdir(struct user_namespace *mnt_userns,
+				 struct inode *inode, struct dentry *dentry,
+				 umode_t mode)
 {
 	char *name;
 	int ret;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index a8881ed61620..d9d8d7794eff 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -280,8 +280,8 @@ static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry,
 	return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm);
 }
 
-static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			bool excl)
+static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode;
 	struct ubifs_info *c = dir->i_sb->s_fs_info;
@@ -441,8 +441,8 @@ out_budg:
 	return err;
 }
 
-static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry,
-			 umode_t mode)
+static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode)
 {
 	return do_tmpfile(dir, dentry, mode, NULL);
 }
@@ -942,7 +942,8 @@ out_fname:
 	return err;
 }
 
-static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	struct ubifs_inode *dir_ui = ubifs_inode(dir);
@@ -1013,8 +1014,8 @@ out_budg:
 	return err;
 }
 
-static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
-		       umode_t mode, dev_t rdev)
+static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode *inode;
 	struct ubifs_inode *ui;
@@ -1102,8 +1103,8 @@ out_budg:
 	return err;
 }
 
-static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
-			 const char *symname)
+static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	struct inode *inode;
 	struct ubifs_inode *ui;
@@ -1542,7 +1543,8 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
 	return err;
 }
 
-static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
+static int ubifs_rename(struct user_namespace *mnt_userns,
+			struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
@@ -1566,8 +1568,8 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
-int ubifs_getattr(const struct path *path, struct kstat *stat,
-		  u32 request_mask, unsigned int flags)
+int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		  struct kstat *stat, u32 request_mask, unsigned int flags)
 {
 	loff_t size;
 	struct inode *inode = d_inode(path->dentry);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 76ef392b1e41..0e4b4be3aa26 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1257,7 +1257,8 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	return err;
 }
 
-int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
+int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr)
 {
 	int err;
 	struct inode *inode = d_inode(dentry);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index fc2cdde3b549..7fdfdbda4b8a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1989,13 +1989,14 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc);
 
 /* file.c */
 int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
-int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
+int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr);
 int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags);
 
 /* dir.c */
 struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
 			      umode_t mode);
-int ubifs_getattr(const struct path *path, struct kstat *stat,
+int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat,
 		  u32 request_mask, unsigned int flags);
 int ubifs_check_dir_empty(struct inode *dir);
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7c7d161315c2..2846dcd92197 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -253,7 +253,8 @@ const struct file_operations udf_file_operations = {
 	.llseek			= generic_file_llseek,
 };
 
-static int udf_setattr(struct dentry *dentry, struct iattr *attr)
+static int udf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct super_block *sb = inode->i_sb;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index e169d8fe35b5..f146b3089f3d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -604,8 +604,8 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
 	return 0;
 }
 
-static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int udf_create(struct user_namespace *mnt_userns, struct inode *dir,
+		      struct dentry *dentry, umode_t mode, bool excl)
 {
 	struct inode *inode = udf_new_inode(dir, mode);
 
@@ -623,7 +623,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return udf_add_nondir(dentry, inode);
 }
 
-static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode = udf_new_inode(dir, mode);
 
@@ -642,8 +643,8 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return 0;
 }
 
-static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-		     dev_t rdev)
+static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode *inode;
 
@@ -658,7 +659,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return udf_add_nondir(dentry, inode);
 }
 
-static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int udf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	struct udf_fileident_bh fibh;
@@ -877,8 +879,8 @@ out:
 	return retval;
 }
 
-static int udf_symlink(struct inode *dir, struct dentry *dentry,
-		       const char *symname)
+static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, const char *symname)
 {
 	struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777);
 	struct pathComponent *pc;
@@ -1065,9 +1067,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
 /* Anybody can rename anything with this: the permission checks are left to the
  * higher-level routines.
  */
-static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *old_inode = d_inode(old_dentry);
 	struct inode *new_inode = d_inode(new_dentry);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 54a44d1f023c..9b223421a3c5 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -152,8 +152,9 @@ out_unmap:
 	return err;
 }
 
-static int udf_symlink_getattr(const struct path *path, struct kstat *stat,
-				u32 request_mask, unsigned int flags)
+static int udf_symlink_getattr(struct user_namespace *mnt_userns,
+			       const struct path *path, struct kstat *stat,
+			       u32 request_mask, unsigned int flags)
 {
 	struct dentry *dentry = path->dentry;
 	struct inode *inode = d_backing_inode(dentry);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 6b51f3b20143..debc282c1bb4 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -1211,7 +1211,8 @@ out:
 	return err;
 }
 
-int ufs_setattr(struct dentry *dentry, struct iattr *attr)
+int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	unsigned int ia_valid = attr->ia_valid;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 9ef40f100415..29d5a0e0c8f0 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -69,7 +69,8 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi
  * If the create succeeds, we fill in the inode information
  * with d_instantiate(). 
  */
-static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode,
+static int ufs_create (struct user_namespace * mnt_userns,
+		struct inode * dir, struct dentry * dentry, umode_t mode,
 		bool excl)
 {
 	struct inode *inode;
@@ -85,7 +86,8 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode,
 	return ufs_add_nondir(dentry, inode);
 }
 
-static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
+static int ufs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode *inode;
 	int err;
@@ -104,8 +106,8 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev
 	return err;
 }
 
-static int ufs_symlink (struct inode * dir, struct dentry * dentry,
-	const char * symname)
+static int ufs_symlink (struct user_namespace * mnt_userns, struct inode * dir,
+	struct dentry * dentry, const char * symname)
 {
 	struct super_block * sb = dir->i_sb;
 	int err;
@@ -164,7 +166,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
 	return error;
 }
 
-static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
+static int ufs_mkdir(struct user_namespace * mnt_userns, struct inode * dir,
+	struct dentry * dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err;
@@ -240,9 +243,9 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 	return err;
 }
 
-static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry,
-		      unsigned int flags)
+static int ufs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+		      struct dentry *old_dentry, struct inode *new_dir,
+		      struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *old_inode = d_inode(old_dentry);
 	struct inode *new_inode = d_inode(new_dentry);
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index b49e0efdf3d7..550f7c5a3636 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -123,7 +123,8 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long);
 extern int ufs_write_inode (struct inode *, struct writeback_control *);
 extern int ufs_sync_inode (struct inode *);
 extern void ufs_evict_inode (struct inode *);
-extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		       struct iattr *attr);
 
 /* namei.c */
 extern const struct file_operations ufs_dir_operations;
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 4d569f14a8d8..7aee0ec63ade 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -288,13 +288,15 @@ static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry,
 	return 0;
 }
 
-static int vboxsf_dir_mkfile(struct inode *parent, struct dentry *dentry,
+static int vboxsf_dir_mkfile(struct user_namespace *mnt_userns,
+			     struct inode *parent, struct dentry *dentry,
 			     umode_t mode, bool excl)
 {
 	return vboxsf_dir_create(parent, dentry, mode, 0);
 }
 
-static int vboxsf_dir_mkdir(struct inode *parent, struct dentry *dentry,
+static int vboxsf_dir_mkdir(struct user_namespace *mnt_userns,
+			    struct inode *parent, struct dentry *dentry,
 			    umode_t mode)
 {
 	return vboxsf_dir_create(parent, dentry, mode, 1);
@@ -332,7 +334,8 @@ static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry)
 	return 0;
 }
 
-static int vboxsf_dir_rename(struct inode *old_parent,
+static int vboxsf_dir_rename(struct user_namespace *mnt_userns,
+			     struct inode *old_parent,
 			     struct dentry *old_dentry,
 			     struct inode *new_parent,
 			     struct dentry *new_dentry,
@@ -374,7 +377,8 @@ err_put_old_path:
 	return err;
 }
 
-static int vboxsf_dir_symlink(struct inode *parent, struct dentry *dentry,
+static int vboxsf_dir_symlink(struct user_namespace *mnt_userns,
+			      struct inode *parent, struct dentry *dentry,
 			      const char *symname)
 {
 	struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent);
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index d2cd1c99f48e..3b847e3fba24 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -212,8 +212,8 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
 	return 0;
 }
 
-int vboxsf_getattr(const struct path *path, struct kstat *kstat,
-		   u32 request_mask, unsigned int flags)
+int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		   struct kstat *kstat, u32 request_mask, unsigned int flags)
 {
 	int err;
 	struct dentry *dentry = path->dentry;
@@ -237,7 +237,8 @@ int vboxsf_getattr(const struct path *path, struct kstat *kstat,
 	return 0;
 }
 
-int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr)
+int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   struct iattr *iattr)
 {
 	struct vboxsf_inode *sf_i = VBOXSF_I(d_inode(dentry));
 	struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb);
diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h
index 18f95b00fc33..760524e78c88 100644
--- a/fs/vboxsf/vfsmod.h
+++ b/fs/vboxsf/vfsmod.h
@@ -90,9 +90,11 @@ int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path,
 		struct shfl_fsobjinfo *info);
 int vboxsf_stat_dentry(struct dentry *dentry, struct shfl_fsobjinfo *info);
 int vboxsf_inode_revalidate(struct dentry *dentry);
-int vboxsf_getattr(const struct path *path, struct kstat *kstat,
-		   u32 request_mask, unsigned int query_flags);
-int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr);
+int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path,
+		   struct kstat *kstat, u32 request_mask,
+		   unsigned int query_flags);
+int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		   struct iattr *iattr);
 struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi,
 					    struct dentry *dentry);
 int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 368351298bd5..332e87153c6c 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -238,7 +238,8 @@ xfs_acl_set_mode(
 }
 
 int
-xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+	    struct posix_acl *acl, int type)
 {
 	umode_t mode;
 	bool set_mode = false;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index c042c0868016..7bdb3a4ed798 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -11,7 +11,8 @@ struct posix_acl;
 
 #ifdef CONFIG_XFS_POSIX_ACL
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
-extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		       struct posix_acl *acl, int type);
 extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 void xfs_forget_acl(struct inode *inode, const char *name);
 #else
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 26d22edef741..f5dfa128af64 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -220,29 +220,32 @@ xfs_generic_create(
 
 STATIC int
 xfs_vn_mknod(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	umode_t		mode,
-	dev_t		rdev)
+	struct user_namespace	*mnt_userns,
+	struct inode		*dir,
+	struct dentry		*dentry,
+	umode_t			mode,
+	dev_t			rdev)
 {
 	return xfs_generic_create(dir, dentry, mode, rdev, false);
 }
 
 STATIC int
 xfs_vn_create(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	umode_t		mode,
-	bool		flags)
+	struct user_namespace	*mnt_userns,
+	struct inode		*dir,
+	struct dentry		*dentry,
+	umode_t			mode,
+	bool			flags)
 {
 	return xfs_generic_create(dir, dentry, mode, 0, false);
 }
 
 STATIC int
 xfs_vn_mkdir(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	umode_t		mode)
+	struct user_namespace	*mnt_userns,
+	struct inode		*dir,
+	struct dentry		*dentry,
+	umode_t			mode)
 {
 	return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false);
 }
@@ -361,9 +364,10 @@ xfs_vn_unlink(
 
 STATIC int
 xfs_vn_symlink(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	const char	*symname)
+	struct user_namespace	*mnt_userns,
+	struct inode		*dir,
+	struct dentry		*dentry,
+	const char		*symname)
 {
 	struct inode	*inode;
 	struct xfs_inode *cip = NULL;
@@ -403,11 +407,12 @@ xfs_vn_symlink(
 
 STATIC int
 xfs_vn_rename(
-	struct inode	*odir,
-	struct dentry	*odentry,
-	struct inode	*ndir,
-	struct dentry	*ndentry,
-	unsigned int	flags)
+	struct user_namespace	*mnt_userns,
+	struct inode		*odir,
+	struct dentry		*odentry,
+	struct inode		*ndir,
+	struct dentry		*ndentry,
+	unsigned int		flags)
 {
 	struct inode	*new_inode = d_inode(ndentry);
 	int		omode = 0;
@@ -529,6 +534,7 @@ xfs_stat_blksize(
 
 STATIC int
 xfs_vn_getattr(
+	struct user_namespace	*mnt_userns,
 	const struct path	*path,
 	struct kstat		*stat,
 	u32			request_mask,
@@ -1047,6 +1053,7 @@ xfs_vn_setattr_size(
 
 STATIC int
 xfs_vn_setattr(
+	struct user_namespace	*mnt_userns,
 	struct dentry		*dentry,
 	struct iattr		*iattr)
 {
@@ -1144,9 +1151,10 @@ xfs_vn_fiemap(
 
 STATIC int
 xfs_vn_tmpfile(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	umode_t		mode)
+	struct user_namespace	*mnt_userns,
+	struct inode		*dir,
+	struct dentry		*dentry,
+	umode_t			mode)
 {
 	return xfs_generic_create(dir, dentry, mode, 0, true);
 }
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 8a1f69677784..76e45d66d4ce 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -480,7 +480,8 @@ unlock:
 	return ret;
 }
 
-static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr)
+static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
+				struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = d_inode(dentry);
 	int ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f0601cca1930..7762d3d75230 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1930,22 +1930,28 @@ struct file_operations {
 struct inode_operations {
 	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
 	const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
-	int (*permission) (struct inode *, int);
+	int (*permission) (struct user_namespace *, struct inode *, int);
 	struct posix_acl * (*get_acl)(struct inode *, int);
 
 	int (*readlink) (struct dentry *, char __user *,int);
 
-	int (*create) (struct inode *,struct dentry *, umode_t, bool);
+	int (*create) (struct user_namespace *, struct inode *,struct dentry *,
+		       umode_t, bool);
 	int (*link) (struct dentry *,struct inode *,struct dentry *);
 	int (*unlink) (struct inode *,struct dentry *);
-	int (*symlink) (struct inode *,struct dentry *,const char *);
-	int (*mkdir) (struct inode *,struct dentry *,umode_t);
+	int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,
+			const char *);
+	int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,
+		      umode_t);
 	int (*rmdir) (struct inode *,struct dentry *);
-	int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
-	int (*rename) (struct inode *, struct dentry *,
+	int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,
+		      umode_t,dev_t);
+	int (*rename) (struct user_namespace *, struct inode *, struct dentry *,
 			struct inode *, struct dentry *, unsigned int);
-	int (*setattr) (struct dentry *, struct iattr *);
-	int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+	int (*setattr) (struct user_namespace *, struct dentry *,
+			struct iattr *);
+	int (*getattr) (struct user_namespace *, const struct path *,
+			struct kstat *, u32, unsigned int);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
 		      u64 len);
@@ -1953,8 +1959,10 @@ struct inode_operations {
 	int (*atomic_open)(struct inode *, struct dentry *,
 			   struct file *, unsigned open_flag,
 			   umode_t create_mode);
-	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
-	int (*set_acl)(struct inode *, struct posix_acl *, int);
+	int (*tmpfile) (struct user_namespace *, struct inode *,
+			struct dentry *, umode_t);
+	int (*set_acl)(struct user_namespace *, struct inode *,
+		       struct posix_acl *, int);
 } ____cacheline_aligned;
 
 static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
@@ -3227,15 +3235,18 @@ extern int dcache_dir_open(struct inode *, struct file *);
 extern int dcache_dir_close(struct inode *, struct file *);
 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
 extern int dcache_readdir(struct file *, struct dir_context *);
-extern int simple_setattr(struct dentry *, struct iattr *);
-extern int simple_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int simple_setattr(struct user_namespace *, struct dentry *,
+			  struct iattr *);
+extern int simple_getattr(struct user_namespace *, const struct path *,
+			  struct kstat *, u32, unsigned int);
 extern int simple_statfs(struct dentry *, struct kstatfs *);
 extern int simple_open(struct inode *inode, struct file *file);
 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
 extern int simple_unlink(struct inode *, struct dentry *);
 extern int simple_rmdir(struct inode *, struct dentry *);
-extern int simple_rename(struct inode *, struct dentry *,
-			 struct inode *, struct dentry *, unsigned int);
+extern int simple_rename(struct user_namespace *, struct inode *,
+			 struct dentry *, struct inode *, struct dentry *,
+			 unsigned int);
 extern void simple_recursive_removal(struct dentry *,
                               void (*callback)(struct dentry *));
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 681ed98e4ba8..8c6c4e32fc2f 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -379,10 +379,11 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr);
 extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr);
-extern int nfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern int nfs_getattr(struct user_namespace *, const struct path *,
+		       struct kstat *, u32, unsigned int);
 extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
 extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
-extern int nfs_permission(struct inode *, int);
+extern int nfs_permission(struct user_namespace *, struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
@@ -390,7 +391,7 @@ extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern bool nfs_mapping_need_revalidate_inode(struct inode *inode);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_revalidate_mapping_rcu(struct inode *inode);
-extern int nfs_setattr(struct dentry *, struct iattr *);
+extern int nfs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *);
 extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
 				struct nfs4_label *label);
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 6dcd8b8f6ab5..307094ebb88c 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -79,7 +79,8 @@ extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
 int posix_acl_update_mode(struct user_namespace *, struct inode *, umode_t *,
 			  struct posix_acl **);
 
-extern int simple_set_acl(struct inode *, struct posix_acl *, int);
+extern int simple_set_acl(struct user_namespace *, struct inode *,
+			  struct posix_acl *, int);
 extern int simple_acl_create(struct inode *, struct inode *);
 
 struct posix_acl *get_cached_acl(struct inode *inode, int type);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index fcd56e077733..8031464ed4ae 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -594,8 +594,8 @@ out_unlock:
 	return error;
 }
 
-static int mqueue_create(struct inode *dir, struct dentry *dentry,
-				umode_t mode, bool excl)
+static int mqueue_create(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, umode_t mode, bool excl)
 {
 	return mqueue_create_attr(dentry, mode, NULL);
 }
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 05b1f51d15e0..1576ff331ee4 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -152,7 +152,8 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
 	dir->i_ctime = dir->i_mtime;
 }
 
-static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		     struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 
@@ -381,8 +382,8 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
 	return simple_lookup(dir, dentry, flags);
 }
 
-static int bpf_symlink(struct inode *dir, struct dentry *dentry,
-		       const char *target)
+static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, const char *target)
 {
 	char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
 	struct inode *inode;
diff --git a/mm/shmem.c b/mm/shmem.c
index 339d5530d3a9..facdd1a9c524 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1060,7 +1060,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
-static int shmem_getattr(const struct path *path, struct kstat *stat,
+static int shmem_getattr(struct user_namespace *mnt_userns,
+			 const struct path *path, struct kstat *stat,
 			 u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = path->dentry->d_inode;
@@ -1080,7 +1081,8 @@ static int shmem_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct user_namespace *mnt_userns,
+			 struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2917,7 +2919,8 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
  * File creation. Allocate an inode, and we're done..
  */
 static int
-shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+	    struct dentry *dentry, umode_t mode, dev_t dev)
 {
 	struct inode *inode;
 	int error = -ENOSPC;
@@ -2946,7 +2949,8 @@ out_iput:
 }
 
 static int
-shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+	      struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	int error = -ENOSPC;
@@ -2969,20 +2973,22 @@ out_iput:
 	return error;
 }
 
-static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	int error;
 
-	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
+	if ((error = shmem_mknod(&init_user_ns, dir, dentry,
+				 mode | S_IFDIR, 0)))
 		return error;
 	inc_nlink(dir);
 	return 0;
 }
 
-static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool excl)
+static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
+			struct dentry *dentry, umode_t mode, bool excl)
 {
-	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
+	return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
 }
 
 /*
@@ -3062,7 +3068,8 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru
 	return 0;
 }
 
-static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+static int shmem_whiteout(struct user_namespace *mnt_userns,
+			  struct inode *old_dir, struct dentry *old_dentry)
 {
 	struct dentry *whiteout;
 	int error;
@@ -3071,7 +3078,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
 	if (!whiteout)
 		return -ENOMEM;
 
-	error = shmem_mknod(old_dir, whiteout,
+	error = shmem_mknod(&init_user_ns, old_dir, whiteout,
 			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
 	dput(whiteout);
 	if (error)
@@ -3094,7 +3101,10 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
  * it exists so that the VFS layer correctly free's it when it
  * gets overwritten.
  */
-static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
+static int shmem_rename2(struct user_namespace *mnt_userns,
+			 struct inode *old_dir, struct dentry *old_dentry,
+			 struct inode *new_dir, struct dentry *new_dentry,
+			 unsigned int flags)
 {
 	struct inode *inode = d_inode(old_dentry);
 	int they_are_dirs = S_ISDIR(inode->i_mode);
@@ -3111,7 +3121,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
 	if (flags & RENAME_WHITEOUT) {
 		int error;
 
-		error = shmem_whiteout(old_dir, old_dentry);
+		error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
 		if (error)
 			return error;
 	}
@@ -3135,7 +3145,8 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
 	return 0;
 }
 
-static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+			 struct dentry *dentry, const char *symname)
 {
 	int error;
 	int len;
diff --git a/net/socket.c b/net/socket.c
index c76703c6f480..2826698ff97c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -538,9 +538,10 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
 	return used;
 }
 
-static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+static int sockfs_setattr(struct user_namespace *mnt_userns,
+			  struct dentry *dentry, struct iattr *iattr)
 {
-	int err = simple_setattr(dentry, iattr);
+	int err = simple_setattr(&init_user_ns, dentry, iattr);
 
 	if (!err && (iattr->ia_valid & ATTR_UID)) {
 		struct socket *sock = SOCKET_I(d_inode(dentry));
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index f95c6bfa8b8e..2ee3b3d29f10 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -1773,7 +1773,8 @@ fail2:
 	return error;
 }
 
-static int ns_mkdir_op(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int ns_mkdir_op(struct user_namespace *mnt_userns, struct inode *dir,
+		       struct dentry *dentry, umode_t mode)
 {
 	struct aa_ns *ns, *parent;
 	/* TODO: improve permission check */
diff --git a/security/integrity/evm/evm_secfs.c b/security/integrity/evm/evm_secfs.c
index cfc3075769bb..bbc85637e18b 100644
--- a/security/integrity/evm/evm_secfs.c
+++ b/security/integrity/evm/evm_secfs.c
@@ -219,7 +219,7 @@ static ssize_t evm_write_xattrs(struct file *file, const char __user *buf,
 		newattrs.ia_valid = ATTR_MODE;
 		inode = evm_xattrs->d_inode;
 		inode_lock(inode);
-		err = simple_setattr(evm_xattrs, &newattrs);
+		err = simple_setattr(&init_user_ns, evm_xattrs, &newattrs);
 		inode_unlock(inode);
 		if (!err)
 			err = count;
-- 
cgit v1.2.3


From a2d2329e30e224ea68d575d2525b866df9805ea0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:45 +0100
Subject: ima: handle idmapped mounts

IMA does sometimes access the inode's i_uid and compares it against the
rules' fowner. Enable IMA to handle idmapped mounts by passing down the
mount's user namespace. We simply make use of the helpers we introduced
before. If the initial user namespace is passed nothing changes so
non-idmapped mounts will see identical behavior as before.

Link: https://lore.kernel.org/r/20210121131959.646623-27-christian.brauner@ubuntu.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/attr.c                                    |  2 +-
 fs/namei.c                                   |  4 +--
 include/linux/ima.h                          | 18 +++++++++-----
 security/integrity/ima/ima.h                 | 19 ++++++++------
 security/integrity/ima/ima_api.c             | 10 +++++---
 security/integrity/ima/ima_appraise.c        | 15 ++++++-----
 security/integrity/ima/ima_asymmetric_keys.c |  3 ++-
 security/integrity/ima/ima_main.c            | 37 ++++++++++++++++++----------
 security/integrity/ima/ima_policy.c          | 20 +++++++++------
 security/integrity/ima/ima_queue_keys.c      |  4 ++-
 10 files changed, 83 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index 41abd0d973d8..87ef39db1c34 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -401,7 +401,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
 
 	if (!error) {
 		fsnotify_change(dentry, ia_valid);
-		ima_inode_post_setattr(dentry);
+		ima_inode_post_setattr(mnt_userns, dentry);
 		evm_inode_post_setattr(dentry, ia_valid);
 	}
 
diff --git a/fs/namei.c b/fs/namei.c
index d9ceb75ac169..dbf53b325ac9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3422,7 +3422,7 @@ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
 		inode->i_state |= I_LINKABLE;
 		spin_unlock(&inode->i_lock);
 	}
-	ima_post_create_tmpfile(inode);
+	ima_post_create_tmpfile(mnt_userns, inode);
 	return child;
 
 out_err:
@@ -3750,7 +3750,7 @@ retry:
 			error = vfs_create(mnt_userns, path.dentry->d_inode,
 					   dentry, mode, true);
 			if (!error)
-				ima_post_path_mknod(dentry);
+				ima_post_path_mknod(mnt_userns, dentry);
 			break;
 		case S_IFCHR: case S_IFBLK:
 			error = vfs_mknod(mnt_userns, path.dentry->d_inode,
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 7db9cca1af34..5486312d8ee6 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -16,7 +16,8 @@ struct linux_binprm;
 #ifdef CONFIG_IMA
 extern int ima_bprm_check(struct linux_binprm *bprm);
 extern int ima_file_check(struct file *file, int mask);
-extern void ima_post_create_tmpfile(struct inode *inode);
+extern void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+				    struct inode *inode);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
 extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot);
@@ -27,7 +28,8 @@ extern int ima_read_file(struct file *file, enum kernel_read_file_id id,
 			 bool contents);
 extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
 			      enum kernel_read_file_id id);
-extern void ima_post_path_mknod(struct dentry *dentry);
+extern void ima_post_path_mknod(struct user_namespace *mnt_userns,
+				struct dentry *dentry);
 extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
 extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size);
 extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
@@ -68,7 +70,8 @@ static inline int ima_file_check(struct file *file, int mask)
 	return 0;
 }
 
-static inline void ima_post_create_tmpfile(struct inode *inode)
+static inline void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+					   struct inode *inode)
 {
 }
 
@@ -112,7 +115,8 @@ static inline int ima_post_read_file(struct file *file, void *buf, loff_t size,
 	return 0;
 }
 
-static inline void ima_post_path_mknod(struct dentry *dentry)
+static inline void ima_post_path_mknod(struct user_namespace *mnt_userns,
+				       struct dentry *dentry)
 {
 	return;
 }
@@ -153,7 +157,8 @@ static inline void ima_post_key_create_or_update(struct key *keyring,
 
 #ifdef CONFIG_IMA_APPRAISE
 extern bool is_ima_appraise_enabled(void);
-extern void ima_inode_post_setattr(struct dentry *dentry);
+extern void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+				   struct dentry *dentry);
 extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
 		       const void *xattr_value, size_t xattr_value_len);
 extern int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name);
@@ -163,7 +168,8 @@ static inline bool is_ima_appraise_enabled(void)
 	return 0;
 }
 
-static inline void ima_inode_post_setattr(struct dentry *dentry)
+static inline void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+					  struct dentry *dentry)
 {
 	return;
 }
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 8e8b1e3cb847..b87c9006d577 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -253,8 +253,9 @@ static inline void ima_process_queued_keys(void) {}
 #endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */
 
 /* LIM API function definitions */
-int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
-		   int mask, enum ima_hooks func, int *pcr,
+int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode,
+		   const struct cred *cred, u32 secid, int mask,
+		   enum ima_hooks func, int *pcr,
 		   struct ima_template_desc **template_desc,
 		   const char *keyring);
 int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func);
@@ -266,7 +267,8 @@ void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file,
 			   struct evm_ima_xattr_data *xattr_value,
 			   int xattr_len, const struct modsig *modsig, int pcr,
 			   struct ima_template_desc *template_desc);
-void process_buffer_measurement(struct inode *inode, const void *buf, int size,
+void process_buffer_measurement(struct user_namespace *mnt_userns,
+				struct inode *inode, const void *buf, int size,
 				const char *eventname, enum ima_hooks func,
 				int pcr, const char *keyring);
 void ima_audit_measurement(struct integrity_iint_cache *iint,
@@ -281,8 +283,9 @@ void ima_free_template_entry(struct ima_template_entry *entry);
 const char *ima_d_path(const struct path *path, char **pathbuf, char *filename);
 
 /* IMA policy related functions */
-int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
-		     enum ima_hooks func, int mask, int flags, int *pcr,
+int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode,
+		     const struct cred *cred, u32 secid, enum ima_hooks func,
+		     int mask, int flags, int *pcr,
 		     struct ima_template_desc **template_desc,
 		     const char *keyring);
 void ima_init_policy(void);
@@ -313,7 +316,8 @@ int ima_appraise_measurement(enum ima_hooks func,
 			     struct file *file, const unsigned char *filename,
 			     struct evm_ima_xattr_data *xattr_value,
 			     int xattr_len, const struct modsig *modsig);
-int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func);
+int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode,
+		      int mask, enum ima_hooks func);
 void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file);
 enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint,
 					   enum ima_hooks func);
@@ -340,7 +344,8 @@ static inline int ima_appraise_measurement(enum ima_hooks func,
 	return INTEGRITY_UNKNOWN;
 }
 
-static inline int ima_must_appraise(struct inode *inode, int mask,
+static inline int ima_must_appraise(struct user_namespace *mnt_userns,
+				    struct inode *inode, int mask,
 				    enum ima_hooks func)
 {
 	return 0;
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index 4f39fb93f278..ed410efb3597 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -162,6 +162,7 @@ err_out:
 
 /**
  * ima_get_action - appraise & measure decision based on policy.
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: pointer to the inode associated with the object being validated
  * @cred: pointer to credentials structure to validate
  * @secid: secid of the task being validated
@@ -183,8 +184,9 @@ err_out:
  * Returns IMA_MEASURE, IMA_APPRAISE mask.
  *
  */
-int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
-		   int mask, enum ima_hooks func, int *pcr,
+int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode,
+		   const struct cred *cred, u32 secid, int mask,
+		   enum ima_hooks func, int *pcr,
 		   struct ima_template_desc **template_desc,
 		   const char *keyring)
 {
@@ -192,8 +194,8 @@ int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
 
 	flags &= ima_policy_flag;
 
-	return ima_match_policy(inode, cred, secid, func, mask, flags, pcr,
-				template_desc, keyring);
+	return ima_match_policy(mnt_userns, inode, cred, secid, func, mask,
+				flags, pcr, template_desc, keyring);
 }
 
 /*
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 70b643c41c6b..2e64b9f281cc 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -68,7 +68,8 @@ bool is_ima_appraise_enabled(void)
  *
  * Return 1 to appraise or hash
  */
-int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func)
+int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode,
+		      int mask, enum ima_hooks func)
 {
 	u32 secid;
 
@@ -76,8 +77,8 @@ int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func)
 		return 0;
 
 	security_task_getsecid(current, &secid);
-	return ima_match_policy(inode, current_cred(), secid, func, mask,
-				IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL);
+	return ima_match_policy(mnt_userns, inode, current_cred(), secid, func,
+				mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL);
 }
 
 static int ima_fix_xattr(struct dentry *dentry,
@@ -350,7 +351,7 @@ int ima_check_blacklist(struct integrity_iint_cache *iint,
 
 		rc = is_binary_blacklisted(digest, digestsize);
 		if ((rc == -EPERM) && (iint->flags & IMA_MEASURE))
-			process_buffer_measurement(NULL, digest, digestsize,
+			process_buffer_measurement(&init_user_ns, NULL, digest, digestsize,
 						   "blacklisted-hash", NONE,
 						   pcr, NULL);
 	}
@@ -501,6 +502,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file)
 
 /**
  * ima_inode_post_setattr - reflect file metadata changes
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dentry: pointer to the affected dentry
  *
  * Changes to a dentry's metadata might result in needing to appraise.
@@ -508,7 +510,8 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file)
  * This function is called from notify_change(), which expects the caller
  * to lock the inode's i_mutex.
  */
-void ima_inode_post_setattr(struct dentry *dentry)
+void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+			    struct dentry *dentry)
 {
 	struct inode *inode = d_backing_inode(dentry);
 	struct integrity_iint_cache *iint;
@@ -518,7 +521,7 @@ void ima_inode_post_setattr(struct dentry *dentry)
 	    || !(inode->i_opflags & IOP_XATTR))
 		return;
 
-	action = ima_must_appraise(inode, MAY_ACCESS, POST_SETATTR);
+	action = ima_must_appraise(mnt_userns, inode, MAY_ACCESS, POST_SETATTR);
 	if (!action)
 		__vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_IMA);
 	iint = integrity_iint_find(inode);
diff --git a/security/integrity/ima/ima_asymmetric_keys.c b/security/integrity/ima/ima_asymmetric_keys.c
index 1c68c500c26f..c4ef69100789 100644
--- a/security/integrity/ima/ima_asymmetric_keys.c
+++ b/security/integrity/ima/ima_asymmetric_keys.c
@@ -10,6 +10,7 @@
  */
 
 #include <keys/asymmetric-type.h>
+#include <linux/user_namespace.h>
 #include "ima.h"
 
 /**
@@ -58,7 +59,7 @@ void ima_post_key_create_or_update(struct key *keyring, struct key *key,
 	 * if the IMA policy is configured to measure a key linked
 	 * to the given keyring.
 	 */
-	process_buffer_measurement(NULL, payload, payload_len,
+	process_buffer_measurement(&init_user_ns, NULL, payload, payload_len,
 				   keyring->description, KEY_CHECK, 0,
 				   keyring->description);
 }
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index f87cb29329e9..cb1c56eccd6d 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -218,8 +218,8 @@ static int process_measurement(struct file *file, const struct cred *cred,
 	 * bitmask based on the appraise/audit/measurement policy.
 	 * Included is the appraise submask.
 	 */
-	action = ima_get_action(inode, cred, secid, mask, func, &pcr,
-				&template_desc, NULL);
+	action = ima_get_action(file_mnt_user_ns(file), inode, cred, secid,
+				mask, func, &pcr, &template_desc, NULL);
 	violation_check = ((func == FILE_CHECK || func == MMAP_CHECK) &&
 			   (ima_policy_flag & IMA_MEASURE));
 	if (!action && !violation_check)
@@ -431,8 +431,9 @@ int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot)
 
 	security_task_getsecid(current, &secid);
 	inode = file_inode(vma->vm_file);
-	action = ima_get_action(inode, current_cred(), secid, MAY_EXEC,
-				MMAP_CHECK, &pcr, &template, 0);
+	action = ima_get_action(file_mnt_user_ns(vma->vm_file), inode,
+				current_cred(), secid, MAY_EXEC, MMAP_CHECK,
+				&pcr, &template, 0);
 
 	/* Is the mmap'ed file in policy? */
 	if (!(action & (IMA_MEASURE | IMA_APPRAISE_SUBMASK)))
@@ -592,18 +593,21 @@ EXPORT_SYMBOL_GPL(ima_inode_hash);
 
 /**
  * ima_post_create_tmpfile - mark newly created tmpfile as new
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @file : newly created tmpfile
  *
  * No measuring, appraising or auditing of newly created tmpfiles is needed.
  * Skip calling process_measurement(), but indicate which newly, created
  * tmpfiles are in policy.
  */
-void ima_post_create_tmpfile(struct inode *inode)
+void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+			     struct inode *inode)
 {
 	struct integrity_iint_cache *iint;
 	int must_appraise;
 
-	must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK);
+	must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS,
+					  FILE_CHECK);
 	if (!must_appraise)
 		return;
 
@@ -619,18 +623,21 @@ void ima_post_create_tmpfile(struct inode *inode)
 
 /**
  * ima_post_path_mknod - mark as a new inode
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @dentry: newly created dentry
  *
  * Mark files created via the mknodat syscall as new, so that the
  * file data can be written later.
  */
-void ima_post_path_mknod(struct dentry *dentry)
+void ima_post_path_mknod(struct user_namespace *mnt_userns,
+			 struct dentry *dentry)
 {
 	struct integrity_iint_cache *iint;
 	struct inode *inode = dentry->d_inode;
 	int must_appraise;
 
-	must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK);
+	must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS,
+					  FILE_CHECK);
 	if (!must_appraise)
 		return;
 
@@ -810,6 +817,7 @@ int ima_post_load_data(char *buf, loff_t size,
 
 /*
  * process_buffer_measurement - Measure the buffer to ima log.
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: inode associated with the object being measured (NULL for KEY_CHECK)
  * @buf: pointer to the buffer that needs to be added to the log.
  * @size: size of buffer(in bytes).
@@ -820,7 +828,8 @@ int ima_post_load_data(char *buf, loff_t size,
  *
  * Based on policy, the buffer is measured into the ima log.
  */
-void process_buffer_measurement(struct inode *inode, const void *buf, int size,
+void process_buffer_measurement(struct user_namespace *mnt_userns,
+				struct inode *inode, const void *buf, int size,
 				const char *eventname, enum ima_hooks func,
 				int pcr, const char *keyring)
 {
@@ -860,8 +869,9 @@ void process_buffer_measurement(struct inode *inode, const void *buf, int size,
 	 */
 	if (func) {
 		security_task_getsecid(current, &secid);
-		action = ima_get_action(inode, current_cred(), secid, 0, func,
-					&pcr, &template, keyring);
+		action = ima_get_action(mnt_userns, inode, current_cred(),
+					secid, 0, func, &pcr, &template,
+					keyring);
 		if (!(action & IMA_MEASURE))
 			return;
 	}
@@ -919,8 +929,9 @@ void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
 	if (!f.file)
 		return;
 
-	process_buffer_measurement(file_inode(f.file), buf, size,
-				   "kexec-cmdline", KEXEC_CMDLINE, 0, NULL);
+	process_buffer_measurement(file_mnt_user_ns(f.file), file_inode(f.file),
+				   buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0,
+				   NULL);
 	fdput(f);
 }
 
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 823a0c1379cb..e14426c24a19 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -488,6 +488,7 @@ static bool ima_match_keyring(struct ima_rule_entry *rule,
 /**
  * ima_match_rules - determine whether an inode matches the policy rule.
  * @rule: a pointer to a rule
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: a pointer to an inode
  * @cred: a pointer to a credentials structure for user validation
  * @secid: the secid of the task to be validated
@@ -497,9 +498,10 @@ static bool ima_match_keyring(struct ima_rule_entry *rule,
  *
  * Returns true on rule match, false on failure.
  */
-static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
-			    const struct cred *cred, u32 secid,
-			    enum ima_hooks func, int mask,
+static bool ima_match_rules(struct ima_rule_entry *rule,
+			    struct user_namespace *mnt_userns,
+			    struct inode *inode, const struct cred *cred,
+			    u32 secid, enum ima_hooks func, int mask,
 			    const char *keyring)
 {
 	int i;
@@ -539,7 +541,7 @@ static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
 	}
 
 	if ((rule->flags & IMA_FOWNER) &&
-	    !rule->fowner_op(inode->i_uid, rule->fowner))
+	    !rule->fowner_op(i_uid_into_mnt(mnt_userns, inode), rule->fowner))
 		return false;
 	for (i = 0; i < MAX_LSM_RULES; i++) {
 		int rc = 0;
@@ -602,6 +604,7 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
 
 /**
  * ima_match_policy - decision based on LSM and other conditions
+ * @mnt_userns:	user namespace of the mount the inode was found from
  * @inode: pointer to an inode for which the policy decision is being made
  * @cred: pointer to a credentials structure for which the policy decision is
  *        being made
@@ -620,8 +623,9 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
  * list when walking it.  Reads are many orders of magnitude more numerous
  * than writes so ima_match_policy() is classical RCU candidate.
  */
-int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
-		     enum ima_hooks func, int mask, int flags, int *pcr,
+int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode,
+		     const struct cred *cred, u32 secid, enum ima_hooks func,
+		     int mask, int flags, int *pcr,
 		     struct ima_template_desc **template_desc,
 		     const char *keyring)
 {
@@ -637,8 +641,8 @@ int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
 		if (!(entry->action & actmask))
 			continue;
 
-		if (!ima_match_rules(entry, inode, cred, secid, func, mask,
-				     keyring))
+		if (!ima_match_rules(entry, mnt_userns, inode, cred, secid,
+				     func, mask, keyring))
 			continue;
 
 		action |= entry->flags & IMA_ACTION_FLAGS;
diff --git a/security/integrity/ima/ima_queue_keys.c b/security/integrity/ima/ima_queue_keys.c
index 69a8626a35c0..ca3dea19da18 100644
--- a/security/integrity/ima/ima_queue_keys.c
+++ b/security/integrity/ima/ima_queue_keys.c
@@ -8,6 +8,7 @@
  *       Enables deferred processing of keys
  */
 
+#include <linux/user_namespace.h>
 #include <linux/workqueue.h>
 #include <keys/asymmetric-type.h>
 #include "ima.h"
@@ -158,7 +159,8 @@ void ima_process_queued_keys(void)
 
 	list_for_each_entry_safe(entry, tmp, &ima_keys, list) {
 		if (!timer_expired)
-			process_buffer_measurement(NULL, entry->payload,
+			process_buffer_measurement(&init_user_ns, NULL,
+						   entry->payload,
 						   entry->payload_len,
 						   entry->keyring_name,
 						   KEY_CHECK, 0,
-- 
cgit v1.2.3


From 2a1867219c7b27f928e2545782b86daaf9ad50bd Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:53 +0100
Subject: fs: add mount_setattr()

This implements the missing mount_setattr() syscall. While the new mount
api allows to change the properties of a superblock there is currently
no way to change the properties of a mount or a mount tree using file
descriptors which the new mount api is based on. In addition the old
mount api has the restriction that mount options cannot be applied
recursively. This hasn't changed since changing mount options on a
per-mount basis was implemented in [1] and has been a frequent request
not just for convenience but also for security reasons. The legacy
mount syscall is unable to accommodate this behavior without introducing
a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND |
MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost
mount. Changing MS_REC to apply to the whole mount tree would mean
introducing a significant uapi change and would likely cause significant
regressions.

The new mount_setattr() syscall allows to recursively clear and set
mount options in one shot. Multiple calls to change mount options
requesting the same changes are idempotent:

int mount_setattr(int dfd, const char *path, unsigned flags,
                  struct mount_attr *uattr, size_t usize);

Flags to modify path resolution behavior are specified in the @flags
argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW,
and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to
restrict path resolution as introduced with openat2() might be supported
in the future.

The mount_setattr() syscall can be expected to grow over time and is
designed with extensibility in mind. It follows the extensible syscall
pattern we have used with other syscalls such as openat2(), clone3(),
sched_{set,get}attr(), and others.
The set of mount options is passed in the uapi struct mount_attr which
currently has the following layout:

struct mount_attr {
	__u64 attr_set;
	__u64 attr_clr;
	__u64 propagation;
	__u64 userns_fd;
};

The @attr_set and @attr_clr members are used to clear and set mount
options. This way a user can e.g. request that a set of flags is to be
raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in
@attr_set while at the same time requesting that another set of flags is
to be lowered such as removing noexec from a mount tree by specifying
MOUNT_ATTR_NOEXEC in @attr_clr.

Note, since the MOUNT_ATTR_<atime> values are an enum starting from 0,
not a bitmap, users wanting to transition to a different atime setting
cannot simply specify the atime setting in @attr_set, but must also
specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that
MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set
can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in
@attr_clr.

The @propagation field lets callers specify the propagation type of a
mount tree. Propagation is a single property that has four different
settings and as such is not really a flag argument but an enum.
Specifically, it would be unclear what setting and clearing propagation
settings in combination would amount to. The legacy mount() syscall thus
forbids the combination of multiple propagation settings too. The goal
is to keep the semantics of mount propagation somewhat simple as they
are overly complex as it is.

The @userns_fd field lets user specify a user namespace whose idmapping
becomes the idmapping of the mount. This is implemented and explained in
detail in the next patch.

[1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount")

Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com
Cc: David Howells <dhowells@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-api@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
 arch/arm/tools/syscall.tbl                  |   1 +
 arch/arm64/include/asm/unistd.h             |   2 +-
 arch/arm64/include/asm/unistd32.h           |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
 arch/s390/kernel/syscalls/syscall.tbl       |   1 +
 arch/sh/kernel/syscalls/syscall.tbl         |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
 fs/namespace.c                              | 263 ++++++++++++++++++++++++++++
 include/linux/syscalls.h                    |   4 +
 include/uapi/asm-generic/unistd.h           |   4 +-
 include/uapi/linux/mount.h                  |  15 ++
 tools/include/uapi/asm-generic/unistd.h     |   4 +-
 23 files changed, 307 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index a6617067dbe6..02f0244e005c 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -481,3 +481,4 @@
 549	common	faccessat2			sys_faccessat2
 550	common	process_madvise			sys_process_madvise
 551	common	epoll_pwait2			sys_epoll_pwait2
+552	common	mount_setattr			sys_mount_setattr
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 20e1170e2e0a..dcc1191291a2 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -455,3 +455,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 86a9d7b3eabe..949788f5ba40 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		442
+#define __NR_compat_syscalls		443
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index cccfbbefbf95..3d874f624056 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -891,6 +891,8 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
 #define __NR_epoll_pwait2 441
 __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2)
+#define __NR_mount_setattr 442
+__SYSCALL(__NR_mount_setattr, sys_mount_setattr)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index bfc00f2bd437..d89231166e19 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -362,3 +362,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 7fe4e45c864c..72bde6707dd3 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -441,3 +441,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index a522adf194ab..d603a5ec9338 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -447,3 +447,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 0f03ad223f33..8fd8c1790941 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -380,3 +380,4 @@
 439	n32	faccessat2			sys_faccessat2
 440	n32	process_madvise			sys_process_madvise
 441	n32	epoll_pwait2			compat_sys_epoll_pwait2
+442	n32	mount_setattr			sys_mount_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 91649690b52f..169f21438065 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -356,3 +356,4 @@
 439	n64	faccessat2			sys_faccessat2
 440	n64	process_madvise			sys_process_madvise
 441	n64	epoll_pwait2			sys_epoll_pwait2
+442	n64	mount_setattr			sys_mount_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 4bad0c40aed6..090d29ca80ff 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -429,3 +429,4 @@
 439	o32	faccessat2			sys_faccessat2
 440	o32	process_madvise			sys_process_madvise
 441	o32	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
+442	o32	mount_setattr			sys_mount_setattr
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 6bcc31966b44..271a92519683 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index f744eb5cba88..72e5aa67ab8a 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -531,3 +531,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index d443423495e5..3abef2144dac 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -444,3 +444,4 @@
 439  common	faccessat2		sys_faccessat2			sys_faccessat2
 440  common	process_madvise		sys_process_madvise		sys_process_madvise
 441  common	epoll_pwait2		sys_epoll_pwait2		compat_sys_epoll_pwait2
+442  common	mount_setattr		sys_mount_setattr		sys_mount_setattr
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 9df40ac0ebc0..d08eebad6b7f 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -444,3 +444,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 40d8c7cd8298..84403a99039c 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -487,3 +487,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 874aeacde2dd..a1c9f496fca6 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -446,3 +446,4 @@
 439	i386	faccessat2		sys_faccessat2
 440	i386	process_madvise		sys_process_madvise
 441	i386	epoll_pwait2		sys_epoll_pwait2		compat_sys_epoll_pwait2
+442	i386	mount_setattr		sys_mount_setattr
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 78672124d28b..7bf01cbe582f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -363,6 +363,7 @@
 439	common	faccessat2		sys_faccessat2
 440	common	process_madvise		sys_process_madvise
 441	common	epoll_pwait2		sys_epoll_pwait2
+442	common	mount_setattr		sys_mount_setattr
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 46116a28eeed..365a9b849224 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -412,3 +412,4 @@
 439	common	faccessat2			sys_faccessat2
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
+442	common	mount_setattr			sys_mount_setattr
diff --git a/fs/namespace.c b/fs/namespace.c
index 00ed0d6cb2ee..726a51c8c46e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -73,6 +73,14 @@ static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 
+struct mount_kattr {
+	unsigned int attr_set;
+	unsigned int attr_clr;
+	unsigned int propagation;
+	unsigned int lookup_flags;
+	bool recurse;
+};
+
 /* /sys/fs */
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
@@ -3469,6 +3477,11 @@ out_type:
 	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
 	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME)
 
+#define MOUNT_SETATTR_VALID_FLAGS FSMOUNT_VALID_FLAGS
+
+#define MOUNT_SETATTR_PROPAGATION_FLAGS \
+	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
+
 static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
 {
 	unsigned int mnt_flags = 0;
@@ -3820,6 +3833,256 @@ out0:
 	return error;
 }
 
+static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
+{
+	unsigned int flags = mnt->mnt.mnt_flags;
+
+	/*  flags to clear */
+	flags &= ~kattr->attr_clr;
+	/* flags to raise */
+	flags |= kattr->attr_set;
+
+	return flags;
+}
+
+static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
+					   struct mount *mnt, int *err)
+{
+	struct mount *m = mnt, *last = NULL;
+
+	if (!is_mounted(&m->mnt)) {
+		*err = -EINVAL;
+		goto out;
+	}
+
+	if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) {
+		*err = -EINVAL;
+		goto out;
+	}
+
+	do {
+		unsigned int flags;
+
+		flags = recalc_flags(kattr, m);
+		if (!can_change_locked_flags(m, flags)) {
+			*err = -EPERM;
+			goto out;
+		}
+
+		last = m;
+
+		if ((kattr->attr_set & MNT_READONLY) &&
+		    !(m->mnt.mnt_flags & MNT_READONLY)) {
+			*err = mnt_hold_writers(m);
+			if (*err)
+				goto out;
+		}
+	} while (kattr->recurse && (m = next_mnt(m, mnt)));
+
+out:
+	return last;
+}
+
+static void mount_setattr_commit(struct mount_kattr *kattr,
+				 struct mount *mnt, struct mount *last,
+				 int err)
+{
+	struct mount *m = mnt;
+
+	do {
+		if (!err) {
+			unsigned int flags;
+
+			flags = recalc_flags(kattr, m);
+			WRITE_ONCE(m->mnt.mnt_flags, flags);
+		}
+
+		/*
+		 * We either set MNT_READONLY above so make it visible
+		 * before ~MNT_WRITE_HOLD or we failed to recursively
+		 * apply mount options.
+		 */
+		if ((kattr->attr_set & MNT_READONLY) &&
+		    (m->mnt.mnt_flags & MNT_WRITE_HOLD))
+			mnt_unhold_writers(m);
+
+		if (!err && kattr->propagation)
+			change_mnt_propagation(m, kattr->propagation);
+
+		/*
+		 * On failure, only cleanup until we found the first mount
+		 * we failed to handle.
+		 */
+		if (err && m == last)
+			break;
+	} while (kattr->recurse && (m = next_mnt(m, mnt)));
+
+	if (!err)
+		touch_mnt_namespace(mnt->mnt_ns);
+}
+
+static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
+{
+	struct mount *mnt = real_mount(path->mnt), *last = NULL;
+	int err = 0;
+
+	if (path->dentry != mnt->mnt.mnt_root)
+		return -EINVAL;
+
+	if (kattr->propagation) {
+		/*
+		 * Only take namespace_lock() if we're actually changing
+		 * propagation.
+		 */
+		namespace_lock();
+		if (kattr->propagation == MS_SHARED) {
+			err = invent_group_ids(mnt, kattr->recurse);
+			if (err) {
+				namespace_unlock();
+				return err;
+			}
+		}
+	}
+
+	lock_mount_hash();
+
+	/*
+	 * Get the mount tree in a shape where we can change mount
+	 * properties without failure.
+	 */
+	last = mount_setattr_prepare(kattr, mnt, &err);
+	if (last) /* Commit all changes or revert to the old state. */
+		mount_setattr_commit(kattr, mnt, last, err);
+
+	unlock_mount_hash();
+
+	if (kattr->propagation) {
+		namespace_unlock();
+		if (err)
+			cleanup_group_ids(mnt, NULL);
+	}
+
+	return err;
+}
+
+static int build_mount_kattr(const struct mount_attr *attr,
+			     struct mount_kattr *kattr, unsigned int flags)
+{
+	unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+
+	if (flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+
+	*kattr = (struct mount_kattr) {
+		.lookup_flags	= lookup_flags,
+		.recurse	= !!(flags & AT_RECURSIVE),
+	};
+
+	if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
+		return -EINVAL;
+	if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
+		return -EINVAL;
+	kattr->propagation = attr->propagation;
+
+	if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
+		return -EINVAL;
+
+	if (attr->userns_fd)
+		return -EINVAL;
+
+	kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
+	kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
+
+	/*
+	 * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
+	 * users wanting to transition to a different atime setting cannot
+	 * simply specify the atime setting in @attr_set, but must also
+	 * specify MOUNT_ATTR__ATIME in the @attr_clr field.
+	 * So ensure that MOUNT_ATTR__ATIME can't be partially set in
+	 * @attr_clr and that @attr_set can't have any atime bits set if
+	 * MOUNT_ATTR__ATIME isn't set in @attr_clr.
+	 */
+	if (attr->attr_clr & MOUNT_ATTR__ATIME) {
+		if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
+			return -EINVAL;
+
+		/*
+		 * Clear all previous time settings as they are mutually
+		 * exclusive.
+		 */
+		kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
+		switch (attr->attr_set & MOUNT_ATTR__ATIME) {
+		case MOUNT_ATTR_RELATIME:
+			kattr->attr_set |= MNT_RELATIME;
+			break;
+		case MOUNT_ATTR_NOATIME:
+			kattr->attr_set |= MNT_NOATIME;
+			break;
+		case MOUNT_ATTR_STRICTATIME:
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
+		if (attr->attr_set & MOUNT_ATTR__ATIME)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
+		unsigned int, flags, struct mount_attr __user *, uattr,
+		size_t, usize)
+{
+	int err;
+	struct path target;
+	struct mount_attr attr;
+	struct mount_kattr kattr;
+
+	BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
+
+	if (flags & ~(AT_EMPTY_PATH |
+		      AT_RECURSIVE |
+		      AT_SYMLINK_NOFOLLOW |
+		      AT_NO_AUTOMOUNT))
+		return -EINVAL;
+
+	if (unlikely(usize > PAGE_SIZE))
+		return -E2BIG;
+	if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
+		return -EINVAL;
+
+	if (!may_mount())
+		return -EPERM;
+
+	err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
+	if (err)
+		return err;
+
+	/* Don't bother walking through the mounts if this is a nop. */
+	if (attr.attr_set == 0 &&
+	    attr.attr_clr == 0 &&
+	    attr.propagation == 0)
+		return 0;
+
+	err = build_mount_kattr(&attr, &kattr, flags);
+	if (err)
+		return err;
+
+	err = user_path_at(dfd, path, kattr.lookup_flags, &target);
+	if (err)
+		return err;
+
+	err = do_mount_setattr(&target, &kattr);
+	path_put(&target);
+	return err;
+}
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7688bc983de5..cd7b5c817ba2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -68,6 +68,7 @@ union bpf_attr;
 struct io_uring_params;
 struct clone_args;
 struct open_how;
+struct mount_attr;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -1028,6 +1029,9 @@ asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
 asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
 			       int to_dfd, const char __user *to_path,
 			       unsigned int ms_flags);
+asmlinkage long sys_mount_setattr(int dfd, const char __user *path,
+				  unsigned int flags,
+				  struct mount_attr __user *uattr, size_t usize);
 asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
 asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key,
 			     const void __user *value, int aux);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 728752917785..ce58cff99b66 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -861,9 +861,11 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
 #define __NR_epoll_pwait2 441
 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
+#define __NR_mount_setattr 442
+__SYSCALL(__NR_mount_setattr, sys_mount_setattr)
 
 #undef __NR_syscalls
-#define __NR_syscalls 442
+#define __NR_syscalls 443
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index dd8306ea336c..d3bcefdfa73d 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -1,6 +1,8 @@
 #ifndef _UAPI_LINUX_MOUNT_H
 #define _UAPI_LINUX_MOUNT_H
 
+#include <linux/types.h>
+
 /*
  * These are the fs-independent mount-flags: up to 32 flags are supported
  *
@@ -118,4 +120,17 @@ enum fsconfig_command {
 #define MOUNT_ATTR_STRICTATIME	0x00000020 /* - Always perform atime updates */
 #define MOUNT_ATTR_NODIRATIME	0x00000080 /* Do not update directory access times */
 
+/*
+ * mount_setattr()
+ */
+struct mount_attr {
+	__u64 attr_set;
+	__u64 attr_clr;
+	__u64 propagation;
+	__u64 userns_fd;
+};
+
+/* List of all mount_attr versions. */
+#define MOUNT_ATTR_SIZE_VER0	32 /* sizeof first published struct */
+
 #endif /* _UAPI_LINUX_MOUNT_H */
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index 728752917785..ce58cff99b66 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -861,9 +861,11 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
 #define __NR_epoll_pwait2 441
 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
+#define __NR_mount_setattr 442
+__SYSCALL(__NR_mount_setattr, sys_mount_setattr)
 
 #undef __NR_syscalls
-#define __NR_syscalls 442
+#define __NR_syscalls 443
 
 /*
  * 32 bit systems traditionally used different
-- 
cgit v1.2.3


From 9caccd41541a6f7d6279928d9f971f6642c361af Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 21 Jan 2021 14:19:54 +0100
Subject: fs: introduce MOUNT_ATTR_IDMAP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a new mount bind mount property to allow idmapping mounts. The
MOUNT_ATTR_IDMAP flag can be set via the new mount_setattr() syscall
together with a file descriptor referring to a user namespace.

The user namespace referenced by the namespace file descriptor will be
attached to the bind mount. All interactions with the filesystem going
through that mount will be mapped according to the mapping specified in
the user namespace attached to it.

Using user namespaces to mark mounts means we can reuse all the existing
infrastructure in the kernel that already exists to handle idmappings
and can also use this for permission checking to allow unprivileged user
to create idmapped mounts in the future.

Idmapping a mount is decoupled from the caller's user and mount
namespace. This means idmapped mounts can be created in the initial
user namespace which is an important use-case for systemd-homed,
portable usb-sticks between systems, sharing data between the initial
user namespace and unprivileged containers, and other use-cases that
have been brought up. For example, assume a home directory where all
files are owned by uid and gid 1000 and the home directory is brought to
a new laptop where the user has id 12345. The system administrator can
simply create a mount of this home directory with a mapping of
1000:12345:1 and other mappings to indicate the ids should be kept.
(With this it is e.g. also possible to create idmapped mounts on the
host with an identity mapping 1:1:100000 where the root user is not
mapped. A user with root access that e.g. has been pivot rooted into
such a mount on the host will be not be able to execute, read, write, or
create files as root.)

Given that mapping a mount is decoupled from the caller's user namespace
a sufficiently privileged process such as a container manager can set up
an idmapped mount for the container and the container can simply pivot
root to it. There's no need for the container to do anything. The mount
will appear correctly mapped independent of the user namespace the
container uses. This means we don't need to mark a mount as idmappable.

In order to create an idmapped mount the caller must currently be
privileged in the user namespace of the superblock the mount belongs to.
Once a mount has been idmapped we don't allow it to change its mapping.
This keeps permission checking and life-cycle management simple. Users
wanting to change the idmapped can always create a new detached mount
with a different idmapping.

Link: https://lore.kernel.org/r/20210121131959.646623-36-christian.brauner@ubuntu.com
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Mauricio Vásquez Bernal <mauricio@kinvolk.io>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/namespace.c             | 122 ++++++++++++++++++++++++++++++++++++++++++---
 fs/proc_namespace.c        |   3 ++
 include/linux/mount.h      |   3 +-
 include/uapi/linux/mount.h |   1 +
 4 files changed, 121 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 726a51c8c46e..062fe984a697 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -25,6 +25,7 @@
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
 #include <linux/memblock.h>
+#include <linux/proc_fs.h>
 #include <linux/task_work.h>
 #include <linux/sched/task.h>
 #include <uapi/linux/mount.h>
@@ -79,6 +80,7 @@ struct mount_kattr {
 	unsigned int propagation;
 	unsigned int lookup_flags;
 	bool recurse;
+	struct user_namespace *mnt_userns;
 };
 
 /* /sys/fs */
@@ -3477,7 +3479,7 @@ out_type:
 	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
 	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME)
 
-#define MOUNT_SETATTR_VALID_FLAGS FSMOUNT_VALID_FLAGS
+#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
 
 #define MOUNT_SETATTR_PROPAGATION_FLAGS \
 	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
@@ -3845,6 +3847,36 @@ static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
 	return flags;
 }
 
+static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
+{
+	struct vfsmount *m = &mnt->mnt;
+
+	if (!kattr->mnt_userns)
+		return 0;
+
+	/*
+	 * Once a mount has been idmapped we don't allow it to change its
+	 * mapping. It makes things simpler and callers can just create
+	 * another bind-mount they can idmap if they want to.
+	 */
+	if (mnt_user_ns(m) != &init_user_ns)
+		return -EPERM;
+
+	/* The underlying filesystem doesn't support idmapped mounts yet. */
+	if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
+		return -EINVAL;
+
+	/* We're not controlling the superblock. */
+	if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Mount has already been visible in the filesystem hierarchy. */
+	if (!is_anon_ns(mnt->mnt_ns))
+		return -EINVAL;
+
+	return 0;
+}
+
 static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
 					   struct mount *mnt, int *err)
 {
@@ -3869,6 +3901,10 @@ static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
 			goto out;
 		}
 
+		*err = can_idmap_mount(kattr, m);
+		if (*err)
+			goto out;
+
 		last = m;
 
 		if ((kattr->attr_set & MNT_READONLY) &&
@@ -3883,6 +3919,18 @@ out:
 	return last;
 }
 
+static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
+{
+	struct user_namespace *mnt_userns;
+
+	if (!kattr->mnt_userns)
+		return;
+
+	mnt_userns = get_user_ns(kattr->mnt_userns);
+	/* Pairs with smp_load_acquire() in mnt_user_ns(). */
+	smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
+}
+
 static void mount_setattr_commit(struct mount_kattr *kattr,
 				 struct mount *mnt, struct mount *last,
 				 int err)
@@ -3893,6 +3941,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr,
 		if (!err) {
 			unsigned int flags;
 
+			do_idmap_mount(kattr, m);
 			flags = recalc_flags(kattr, m);
 			WRITE_ONCE(m->mnt.mnt_flags, flags);
 		}
@@ -3965,7 +4014,62 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
 	return err;
 }
 
-static int build_mount_kattr(const struct mount_attr *attr,
+static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
+				struct mount_kattr *kattr, unsigned int flags)
+{
+	int err = 0;
+	struct ns_common *ns;
+	struct user_namespace *mnt_userns;
+	struct file *file;
+
+	if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
+		return 0;
+
+	/*
+	 * We currently do not support clearing an idmapped mount. If this ever
+	 * is a use-case we can revisit this but for now let's keep it simple
+	 * and not allow it.
+	 */
+	if (attr->attr_clr & MOUNT_ATTR_IDMAP)
+		return -EINVAL;
+
+	if (attr->userns_fd > INT_MAX)
+		return -EINVAL;
+
+	file = fget(attr->userns_fd);
+	if (!file)
+		return -EBADF;
+
+	if (!proc_ns_file(file)) {
+		err = -EINVAL;
+		goto out_fput;
+	}
+
+	ns = get_proc_ns(file_inode(file));
+	if (ns->ops->type != CLONE_NEWUSER) {
+		err = -EINVAL;
+		goto out_fput;
+	}
+
+	/*
+	 * The init_user_ns is used to indicate that a vfsmount is not idmapped.
+	 * This is simpler than just having to treat NULL as unmapped. Users
+	 * wanting to idmap a mount to init_user_ns can just use a namespace
+	 * with an identity mapping.
+	 */
+	mnt_userns = container_of(ns, struct user_namespace, ns);
+	if (mnt_userns == &init_user_ns) {
+		err = -EPERM;
+		goto out_fput;
+	}
+	kattr->mnt_userns = get_user_ns(mnt_userns);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
 			     struct mount_kattr *kattr, unsigned int flags)
 {
 	unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
@@ -3991,9 +4095,6 @@ static int build_mount_kattr(const struct mount_attr *attr,
 	if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
 		return -EINVAL;
 
-	if (attr->userns_fd)
-		return -EINVAL;
-
 	kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
 	kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
 
@@ -4032,7 +4133,13 @@ static int build_mount_kattr(const struct mount_attr *attr,
 			return -EINVAL;
 	}
 
-	return 0;
+	return build_mount_idmapped(attr, usize, kattr, flags);
+}
+
+static void finish_mount_kattr(struct mount_kattr *kattr)
+{
+	put_user_ns(kattr->mnt_userns);
+	kattr->mnt_userns = NULL;
 }
 
 SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
@@ -4070,7 +4177,7 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
 	    attr.propagation == 0)
 		return 0;
 
-	err = build_mount_kattr(&attr, &kattr, flags);
+	err = build_mount_kattr(&attr, usize, &kattr, flags);
 	if (err)
 		return err;
 
@@ -4079,6 +4186,7 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
 		return err;
 
 	err = do_mount_setattr(&target, &kattr);
+	finish_mount_kattr(&kattr);
 	path_put(&target);
 	return err;
 }
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index eafb75755fa3..392ef5162655 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -79,6 +79,9 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
 		if (mnt->mnt_flags & fs_infop->flag)
 			seq_puts(m, fs_infop->str);
 	}
+
+	if (mnt_user_ns(mnt) != &init_user_ns)
+		seq_puts(m, ",idmapped");
 }
 
 static inline void mangle(struct seq_file *m, const char *s)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 52de25e08319..161f4419db6c 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -77,7 +77,8 @@ struct vfsmount {
 
 static inline struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
 {
-	return mnt->mnt_userns;
+	/* Pairs with smp_store_release() in do_idmap_mount(). */
+	return smp_load_acquire(&mnt->mnt_userns);
 }
 
 struct file; /* forward dec */
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index d3bcefdfa73d..e6524ead2b7b 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -119,6 +119,7 @@ enum fsconfig_command {
 #define MOUNT_ATTR_NOATIME	0x00000010 /* - Do not update access times. */
 #define MOUNT_ATTR_STRICTATIME	0x00000020 /* - Always perform atime updates */
 #define MOUNT_ATTR_NODIRATIME	0x00000080 /* Do not update directory access times */
+#define MOUNT_ATTR_IDMAP	0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */
 
 /*
  * mount_setattr()
-- 
cgit v1.2.3


From 52f019d43c229afd65dc11c8c1b05b6436bf6765 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 9 Jan 2021 11:42:51 +0100
Subject: block: add a hard-readonly flag to struct gendisk

Commit 20bd1d026aac ("scsi: sd: Keep disk read-only when re-reading
partition") addressed a long-standing problem with user read-only
policy being overridden as a result of a device-initiated revalidate.
The commit has since been reverted due to a regression that left some
USB devices read-only indefinitely.

To fix the underlying problems with revalidate we need to keep track
of hardware state and user policy separately.

The gendisk has been updated to reflect the current hardware state set
by the device driver. This is done to allow returning the device to
the hardware state once the user clears the BLKROSET flag.

The resulting semantics are as follows:

 - If BLKROSET sets a given partition read-only, that partition will
   remain read-only even if the underlying storage stack initiates a
   revalidate. However, the BLKRRPART ioctl will cause the partition
   table to be dropped and any user policy on partitions will be lost.

 - If BLKROSET has not been set, both the whole disk device and any
   partitions will reflect the current write-protect state of the
   underlying device.

Based on a patch from Martin K. Petersen <martin.petersen@oracle.com>.

Reported-by: Oleksii Kurochko <olkuroch@cisco.com>
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201221
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c        |  4 +---
 block/genhd.c           | 33 +++++++++++++++++++--------------
 block/partitions/core.c |  3 +--
 include/linux/genhd.h   |  6 ++++--
 4 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 7663a9b94b80..08ff8ca32529 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -694,9 +694,7 @@ static inline bool should_fail_request(struct block_device *part,
 
 static inline bool bio_check_ro(struct bio *bio, struct block_device *part)
 {
-	const int op = bio_op(bio);
-
-	if (part->bd_read_only && op_is_write(op)) {
+	if (op_is_write(bio_op(bio)) && bdev_read_only(part)) {
 		char b[BDEVNAME_SIZE];
 
 		if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
diff --git a/block/genhd.c b/block/genhd.c
index 484a474648d5..1873e4571328 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1637,27 +1637,32 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro)
 	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
 }
 
-void set_disk_ro(struct gendisk *disk, int flag)
+/**
+ * set_disk_ro - set a gendisk read-only
+ * @disk:	gendisk to operate on
+ * @ready_only:	%true to set the disk read-only, %false set the disk read/write
+ *
+ * This function is used to indicate whether a given disk device should have its
+ * read-only flag set. set_disk_ro() is typically used by device drivers to
+ * indicate whether the underlying physical device is write-protected.
+ */
+void set_disk_ro(struct gendisk *disk, bool read_only)
 {
-	struct disk_part_iter piter;
-	struct block_device *part;
-
-	if (disk->part0->bd_read_only != flag) {
-		set_disk_ro_uevent(disk, flag);
-		disk->part0->bd_read_only = flag;
+	if (read_only) {
+		if (test_and_set_bit(GD_READ_ONLY, &disk->state))
+			return;
+	} else {
+		if (!test_and_clear_bit(GD_READ_ONLY, &disk->state))
+			return;
 	}
-
-	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
-	while ((part = disk_part_iter_next(&piter)))
-		part->bd_read_only = flag;
-	disk_part_iter_exit(&piter);
+	set_disk_ro_uevent(disk, read_only);
 }
-
 EXPORT_SYMBOL(set_disk_ro);
 
 int bdev_read_only(struct block_device *bdev)
 {
-	return bdev->bd_read_only;
+	return bdev->bd_read_only ||
+		test_bit(GD_READ_ONLY, &bdev->bd_disk->state);
 }
 EXPORT_SYMBOL(bdev_read_only);
 
diff --git a/block/partitions/core.c b/block/partitions/core.c
index e7d776db803b..168d5906077c 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -195,7 +195,7 @@ static ssize_t part_start_show(struct device *dev,
 static ssize_t part_ro_show(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_read_only);
+	return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev)));
 }
 
 static ssize_t part_alignment_offset_show(struct device *dev,
@@ -361,7 +361,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 
 	bdev->bd_start_sect = start;
 	bdev_set_nr_sectors(bdev, len);
-	bdev->bd_read_only = get_disk_ro(disk);
 
 	if (info) {
 		err = -ENOMEM;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 809aaa32d53c..a62ccbfac54b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -163,6 +163,7 @@ struct gendisk {
 	int flags;
 	unsigned long state;
 #define GD_NEED_PART_SCAN		0
+#define GD_READ_ONLY			1
 	struct kobject *slave_dir;
 
 	struct timer_rand_state *random;
@@ -249,11 +250,12 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk)
 extern void del_gendisk(struct gendisk *gp);
 extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
 
-extern void set_disk_ro(struct gendisk *disk, int flag);
+void set_disk_ro(struct gendisk *disk, bool read_only);
 
 static inline int get_disk_ro(struct gendisk *disk)
 {
-	return disk->part0->bd_read_only;
+	return disk->part0->bd_read_only ||
+		test_bit(GD_READ_ONLY, &disk->state);
 }
 
 extern void disk_block_events(struct gendisk *disk);
-- 
cgit v1.2.3


From 309dca309fc39a9e3c31b916393b74bd174fd74e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 24 Jan 2021 11:02:34 +0100
Subject: block: store a block_device pointer in struct bio

Replace the gendisk pointer in struct bio with a pointer to the newly
improved struct block device.  From that the gendisk can be trivially
accessed with an extra indirection, but it also allows to directly
look up all information related to partition remapping.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 arch/m68k/emu/nfblock.c             |  2 +-
 arch/xtensa/platforms/iss/simdisk.c |  2 +-
 block/bio-integrity.c               | 18 +++++++++---------
 block/bio.c                         | 31 +++++++++++--------------------
 block/blk-cgroup.c                  |  7 ++++---
 block/blk-core.c                    | 37 +++++++++++++++++--------------------
 block/blk-crypto-fallback.c         |  2 +-
 block/blk-crypto.c                  |  2 +-
 block/blk-merge.c                   | 17 ++++++++---------
 block/blk-mq.c                      |  2 +-
 block/blk-throttle.c                |  2 +-
 block/blk.h                         |  2 --
 block/bounce.c                      |  2 +-
 block/genhd.c                       |  2 +-
 drivers/block/brd.c                 |  2 +-
 drivers/block/drbd/drbd_int.h       |  4 ++--
 drivers/block/drbd/drbd_req.c       |  2 +-
 drivers/block/null_blk/main.c       |  2 +-
 drivers/block/pktcdvd.c             |  4 ++--
 drivers/block/ps3vram.c             |  2 +-
 drivers/block/rsxx/dev.c            |  2 +-
 drivers/block/umem.c                |  2 +-
 drivers/block/zram/zram_drv.c       |  2 +-
 drivers/lightnvm/pblk-init.c        |  2 +-
 drivers/md/bcache/debug.c           |  2 +-
 drivers/md/bcache/request.c         |  7 ++++---
 drivers/md/dm-bio-record.h          |  9 +++------
 drivers/md/dm-raid1.c               | 10 +++++-----
 drivers/md/dm.c                     | 14 +++++++-------
 drivers/md/md-linear.c              |  2 +-
 drivers/md/md.c                     |  2 +-
 drivers/md/md.h                     |  6 +++---
 drivers/md/raid1.c                  |  6 +++---
 drivers/md/raid10.c                 | 12 ++++++------
 drivers/md/raid5.c                  |  2 +-
 drivers/nvdimm/blk.c                |  4 ++--
 drivers/nvdimm/btt.c                |  4 ++--
 drivers/nvdimm/pmem.c               |  4 ++--
 drivers/nvme/host/core.c            |  6 +++---
 drivers/nvme/host/lightnvm.c        |  3 +--
 drivers/nvme/host/multipath.c       |  6 +++---
 drivers/nvme/host/rdma.c            |  2 +-
 drivers/s390/block/dcssblk.c        |  2 +-
 drivers/s390/block/xpram.c          |  2 +-
 fs/btrfs/check-integrity.c          | 10 +++++-----
 fs/btrfs/raid56.c                   |  7 ++-----
 fs/btrfs/scrub.c                    |  2 +-
 fs/direct-io.c                      |  2 +-
 fs/f2fs/data.c                      | 12 +-----------
 include/linux/bio.h                 | 18 ++++++++----------
 include/linux/blk-mq.h              |  4 ++--
 include/linux/blk_types.h           |  3 +--
 include/linux/blkdev.h              |  5 +++--
 kernel/trace/blktrace.c             | 16 +++++++++-------
 mm/page_io.c                        |  2 +-
 55 files changed, 154 insertions(+), 184 deletions(-)

(limited to 'include/linux')

diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 92d26c812441..ba808543161a 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -61,7 +61,7 @@ struct nfhd_device {
 
 static blk_qc_t nfhd_submit_bio(struct bio *bio)
 {
-	struct nfhd_device *dev = bio->bi_disk->private_data;
+	struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data;
 	struct bio_vec bvec;
 	struct bvec_iter iter;
 	int dir, len, shift;
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index 3447556d276d..fc09be7b1347 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -103,7 +103,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector,
 
 static blk_qc_t simdisk_submit_bio(struct bio *bio)
 {
-	struct simdisk *dev = bio->bi_disk->private_data;
+	struct simdisk *dev = bio->bi_bdev->bd_disk->private_data;
 	struct bio_vec bvec;
 	struct bvec_iter iter;
 	sector_t sector = bio->bi_iter.bi_sector;
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 9ffd7e289554..c3e5abcfdc98 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -140,7 +140,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	iv = bip->bip_vec + bip->bip_vcnt;
 
 	if (bip->bip_vcnt &&
-	    bvec_gap_to_prev(bio->bi_disk->queue,
+	    bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue,
 			     &bip->bip_vec[bip->bip_vcnt - 1], offset))
 		return 0;
 
@@ -162,7 +162,7 @@ EXPORT_SYMBOL(bio_integrity_add_page);
 static blk_status_t bio_integrity_process(struct bio *bio,
 		struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn)
 {
-	struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
 	struct blk_integrity_iter iter;
 	struct bvec_iter bviter;
 	struct bio_vec bv;
@@ -171,7 +171,7 @@ static blk_status_t bio_integrity_process(struct bio *bio,
 	void *prot_buf = page_address(bip->bip_vec->bv_page) +
 		bip->bip_vec->bv_offset;
 
-	iter.disk_name = bio->bi_disk->disk_name;
+	iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
 	iter.interval = 1 << bi->interval_exp;
 	iter.seed = proc_iter->bi_sector;
 	iter.prot_buf = prot_buf;
@@ -208,8 +208,8 @@ static blk_status_t bio_integrity_process(struct bio *bio,
 bool bio_integrity_prep(struct bio *bio)
 {
 	struct bio_integrity_payload *bip;
-	struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
-	struct request_queue *q = bio->bi_disk->queue;
+	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	void *buf;
 	unsigned long start, end;
 	unsigned int len, nr_pages;
@@ -329,7 +329,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 	struct bio_integrity_payload *bip =
 		container_of(work, struct bio_integrity_payload, bip_work);
 	struct bio *bio = bip->bip_bio;
-	struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
 
 	/*
 	 * At the moment verify is called bio's iterator was advanced
@@ -355,7 +355,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
  */
 bool __bio_integrity_endio(struct bio *bio)
 {
-	struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
 	struct bio_integrity_payload *bip = bio_integrity(bio);
 
 	if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
@@ -381,7 +381,7 @@ bool __bio_integrity_endio(struct bio *bio)
 void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
 {
 	struct bio_integrity_payload *bip = bio_integrity(bio);
-	struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
 	unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
 
 	bip->bip_iter.bi_sector += bytes_done >> 9;
@@ -397,7 +397,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
 void bio_integrity_trim(struct bio *bio)
 {
 	struct bio_integrity_payload *bip = bio_integrity(bio);
-	struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
 
 	bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
 }
diff --git a/block/bio.c b/block/bio.c
index 1f2cc1fbe283..0b70ade17da6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -607,16 +607,7 @@ void bio_truncate(struct bio *bio, unsigned new_size)
  */
 void guard_bio_eod(struct bio *bio)
 {
-	sector_t maxsector;
-	struct block_device *part;
-
-	rcu_read_lock();
-	part = __disk_get_part(bio->bi_disk, bio->bi_partno);
-	if (part)
-		maxsector = bdev_nr_sectors(part);
-	else	
-		maxsector = get_capacity(bio->bi_disk);
-	rcu_read_unlock();
+	sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
 
 	if (!maxsector)
 		return;
@@ -676,11 +667,10 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio));
 
 	/*
-	 * most users will be overriding ->bi_disk with a new target,
+	 * most users will be overriding ->bi_bdev with a new target,
 	 * so we don't set nor calculate new physical/hw segment counts here
 	 */
-	bio->bi_disk = bio_src->bi_disk;
-	bio->bi_partno = bio_src->bi_partno;
+	bio->bi_bdev = bio_src->bi_bdev;
 	bio_set_flag(bio, BIO_CLONED);
 	if (bio_flagged(bio_src, BIO_THROTTLED))
 		bio_set_flag(bio, BIO_THROTTLED);
@@ -730,7 +720,7 @@ EXPORT_SYMBOL(bio_clone_fast);
 
 const char *bio_devname(struct bio *bio, char *buf)
 {
-	return disk_name(bio->bi_disk, bio->bi_partno, buf);
+	return bdevname(bio->bi_bdev, buf);
 }
 EXPORT_SYMBOL(bio_devname);
 
@@ -1037,7 +1027,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
 {
 	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-	struct request_queue *q = bio->bi_disk->queue;
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
@@ -1145,7 +1135,8 @@ static void submit_bio_wait_endio(struct bio *bio)
  */
 int submit_bio_wait(struct bio *bio)
 {
-	DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
+	DECLARE_COMPLETION_ONSTACK_MAP(done,
+			bio->bi_bdev->bd_disk->lockdep_map);
 	unsigned long hang_check;
 
 	bio->bi_private = &done;
@@ -1422,8 +1413,8 @@ again:
 	if (!bio_integrity_endio(bio))
 		return;
 
-	if (bio->bi_disk)
-		rq_qos_done_bio(bio->bi_disk->queue, bio);
+	if (bio->bi_bdev)
+		rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
 
 	/*
 	 * Need to have a real endio function for chained bios, otherwise
@@ -1438,8 +1429,8 @@ again:
 		goto again;
 	}
 
-	if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
-		trace_block_bio_complete(bio->bi_disk->queue, bio);
+	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+		trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
 		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
 	}
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 031114d454a6..3465d6ee708e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1800,7 +1800,8 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
 	struct blkcg_gq *blkg, *ret_blkg = NULL;
 
 	rcu_read_lock();
-	blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
+	blkg = blkg_lookup_create(css_to_blkcg(css),
+				  bio->bi_bdev->bd_disk->queue);
 	while (blkg) {
 		if (blkg_tryget(blkg)) {
 			ret_blkg = blkg;
@@ -1836,8 +1837,8 @@ void bio_associate_blkg_from_css(struct bio *bio,
 	if (css && css->parent) {
 		bio->bi_blkg = blkg_tryget_closest(bio, css);
 	} else {
-		blkg_get(bio->bi_disk->queue->root_blkg);
-		bio->bi_blkg = bio->bi_disk->queue->root_blkg;
+		blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg);
+		bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg;
 	}
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
diff --git a/block/blk-core.c b/block/blk-core.c
index 08ff8ca32529..a3a54cd86c9c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -476,7 +476,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 
 static inline int bio_queue_enter(struct bio *bio)
 {
-	struct request_queue *q = bio->bi_disk->queue;
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	bool nowait = bio->bi_opf & REQ_NOWAIT;
 	int ret;
 
@@ -712,7 +712,7 @@ static inline bool bio_check_ro(struct bio *bio, struct block_device *part)
 
 static noinline int should_fail_bio(struct bio *bio)
 {
-	if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size))
+	if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
 		return -EIO;
 	return 0;
 }
@@ -741,13 +741,9 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
  */
 static inline int blk_partition_remap(struct bio *bio)
 {
-	struct block_device *p;
+	struct block_device *p = bio->bi_bdev;
 	int ret = -EIO;
 
-	rcu_read_lock();
-	p = __disk_get_part(bio->bi_disk, bio->bi_partno);
-	if (unlikely(!p))
-		goto out;
 	if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
 		goto out;
 	if (unlikely(bio_check_ro(bio, p)))
@@ -761,10 +757,9 @@ static inline int blk_partition_remap(struct bio *bio)
 				      bio->bi_iter.bi_sector -
 				      p->bd_start_sect);
 	}
-	bio->bi_partno = 0;
+	bio->bi_bdev = bdev_whole(p);
 	ret = 0;
 out:
-	rcu_read_unlock();
 	return ret;
 }
 
@@ -805,7 +800,8 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 
 static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 {
-	struct request_queue *q = bio->bi_disk->queue;
+	struct block_device *bdev = bio->bi_bdev;
+	struct request_queue *q = bdev->bd_disk->queue;
 	blk_status_t status = BLK_STS_IOERR;
 	struct blk_plug *plug;
 
@@ -825,13 +821,13 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 	if (should_fail_bio(bio))
 		goto end_io;
 
-	if (bio->bi_partno) {
+	if (bio->bi_bdev->bd_partno) {
 		if (unlikely(blk_partition_remap(bio)))
 			goto end_io;
 	} else {
-		if (unlikely(bio_check_ro(bio, bio->bi_disk->part0)))
+		if (unlikely(bio_check_ro(bio, bdev_whole(bdev))))
 			goto end_io;
-		if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
+		if (unlikely(bio_check_eod(bio, get_capacity(bdev->bd_disk))))
 			goto end_io;
 	}
 
@@ -924,7 +920,7 @@ end_io:
 
 static blk_qc_t __submit_bio(struct bio *bio)
 {
-	struct gendisk *disk = bio->bi_disk;
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
 	blk_qc_t ret = BLK_QC_T_NONE;
 
 	if (blk_crypto_bio_prep(&bio)) {
@@ -966,7 +962,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
 	current->bio_list = bio_list_on_stack;
 
 	do {
-		struct request_queue *q = bio->bi_disk->queue;
+		struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 		struct bio_list lower, same;
 
 		if (unlikely(bio_queue_enter(bio) != 0))
@@ -987,7 +983,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
 		bio_list_init(&lower);
 		bio_list_init(&same);
 		while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
-			if (q == bio->bi_disk->queue)
+			if (q == bio->bi_bdev->bd_disk->queue)
 				bio_list_add(&same, bio);
 			else
 				bio_list_add(&lower, bio);
@@ -1012,7 +1008,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
 	current->bio_list = bio_list;
 
 	do {
-		struct gendisk *disk = bio->bi_disk;
+		struct gendisk *disk = bio->bi_bdev->bd_disk;
 
 		if (unlikely(bio_queue_enter(bio) != 0))
 			continue;
@@ -1055,7 +1051,7 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	if (!bio->bi_disk->fops->submit_bio)
+	if (!bio->bi_bdev->bd_disk->fops->submit_bio)
 		return __submit_bio_noacct_mq(bio);
 	return __submit_bio_noacct(bio);
 }
@@ -1067,7 +1063,7 @@ EXPORT_SYMBOL(submit_bio_noacct);
  *
  * submit_bio() is used to submit I/O requests to block devices.  It is passed a
  * fully set up &struct bio that describes the I/O that needs to be done.  The
- * bio will be send to the device described by the bi_disk and bi_partno fields.
+ * bio will be send to the device described by the bi_bdev field.
  *
  * The success/failure status of the request, along with notification of
  * completion, is delivered asynchronously through the ->bi_end_io() callback
@@ -1087,7 +1083,8 @@ blk_qc_t submit_bio(struct bio *bio)
 		unsigned int count;
 
 		if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
-			count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
+			count = queue_logical_block_size(
+					bio->bi_bdev->bd_disk->queue) >> 9;
 		else
 			count = bio_sectors(bio);
 
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index c162b754efbd..8f1e18176731 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -167,7 +167,7 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
 	bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL);
 	if (!bio)
 		return NULL;
-	bio->bi_disk		= bio_src->bi_disk;
+	bio->bi_bdev		= bio_src->bi_bdev;
 	bio->bi_opf		= bio_src->bi_opf;
 	bio->bi_ioprio		= bio_src->bi_ioprio;
 	bio->bi_write_hint	= bio_src->bi_write_hint;
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 5da43f0973b4..09fcb18fa778 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -280,7 +280,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
 	 * Success if device supports the encryption context, or if we succeeded
 	 * in falling back to the crypto API.
 	 */
-	if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm,
+	if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm,
 					 &bc_key->crypto_cfg))
 		return true;
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 808768f6b174..ffb4aa0ea68b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -298,14 +298,13 @@ split:
  * Split a bio into two bios, chain the two bios, submit the second half and
  * store a pointer to the first half in *@bio. If the second bio is still too
  * big it will be split by a recursive call to this function. Since this
- * function may allocate a new bio from @bio->bi_disk->queue->bio_split, it is
- * the responsibility of the caller to ensure that
- * @bio->bi_disk->queue->bio_split is only released after processing of the
- * split bio has finished.
+ * function may allocate a new bio from q->bio_split, it is the responsibility
+ * of the caller to ensure that q->bio_split is only released after processing
+ * of the split bio has finished.
  */
 void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
 {
-	struct request_queue *q = (*bio)->bi_disk->queue;
+	struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
 	struct bio *split = NULL;
 
 	switch (bio_op(*bio)) {
@@ -358,9 +357,9 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
  *
  * Split a bio into two bios, chains the two bios, submit the second half and
  * store a pointer to the first half in *@bio. Since this function may allocate
- * a new bio from @bio->bi_disk->queue->bio_split, it is the responsibility of
- * the caller to ensure that @bio->bi_disk->queue->bio_split is only released
- * after processing of the split bio has finished.
+ * a new bio from q->bio_split, it is the responsibility of the caller to ensure
+ * that q->bio_split is only released after processing of the split bio has
+ * finished.
  */
 void blk_queue_split(struct bio **bio)
 {
@@ -866,7 +865,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 		return false;
 
 	/* must be same device */
-	if (rq->rq_disk != bio->bi_disk)
+	if (rq->rq_disk != bio->bi_bdev->bd_disk)
 		return false;
 
 	/* only merge integrity protected bio into ditto rq */
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f285a9123a8b..74b17b396f4c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2128,7 +2128,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
  */
 blk_qc_t blk_mq_submit_bio(struct bio *bio)
 {
-	struct request_queue *q = bio->bi_disk->queue;
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	const int is_sync = op_is_sync(bio->bi_opf);
 	const int is_flush_fua = op_is_flush(bio->bi_opf);
 	struct blk_mq_alloc_data data = {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d52cac9f3a7c..b1b22d863bdf 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2178,7 +2178,7 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 
 bool blk_throtl_bio(struct bio *bio)
 {
-	struct request_queue *q = bio->bi_disk->queue;
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	struct blkcg_gq *blkg = bio->bi_blkg;
 	struct throtl_qnode *qn = NULL;
 	struct throtl_grp *tg = blkg_to_tg(blkg);
diff --git a/block/blk.h b/block/blk.h
index 7550364c326c..10ab7c0d0766 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -202,8 +202,6 @@ static inline void elevator_exit(struct request_queue *q,
 	__elevator_exit(q, e);
 }
 
-struct block_device *__disk_get_part(struct gendisk *disk, int partno);
-
 ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
 		char *buf);
 ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
diff --git a/block/bounce.c b/block/bounce.c
index d3f51acd6e3b..a22a8a1942b2 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -246,7 +246,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
 	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
 	if (!bio)
 		return NULL;
-	bio->bi_disk		= bio_src->bi_disk;
+	bio->bi_bdev		= bio_src->bi_bdev;
 	bio->bi_opf		= bio_src->bi_opf;
 	bio->bi_ioprio		= bio_src->bi_ioprio;
 	bio->bi_write_hint	= bio_src->bi_write_hint;
diff --git a/block/genhd.c b/block/genhd.c
index ca5d880af512..e536d0b4bbae 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -161,7 +161,7 @@ static void part_in_flight_rw(struct block_device *part,
 		inflight[1] = 0;
 }
 
-struct block_device *__disk_get_part(struct gendisk *disk, int partno)
+static struct block_device *__disk_get_part(struct gendisk *disk, int partno)
 {
 	struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
 
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c7c821419079..18bf99906662 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -284,7 +284,7 @@ out:
 
 static blk_qc_t brd_submit_bio(struct bio *bio)
 {
-	struct brd_device *brd = bio->bi_disk->private_data;
+	struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
 	sector_t sector = bio->bi_iter.bi_sector;
 	struct bio_vec bvec;
 	struct bvec_iter iter;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 8f879e5c2f67..b2c93a29c251 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1579,8 +1579,8 @@ static inline void drbd_submit_bio_noacct(struct drbd_device *device,
 					     int fault_type, struct bio *bio)
 {
 	__release(local);
-	if (!bio->bi_disk) {
-		drbd_err(device, "drbd_submit_bio_noacct: bio->bi_disk == NULL\n");
+	if (!bio->bi_bdev) {
+		drbd_err(device, "drbd_submit_bio_noacct: bio->bi_bdev == NULL\n");
 		bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
 		return;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 330f851cb8f0..ea0f31ab3343 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1595,7 +1595,7 @@ void do_submit(struct work_struct *ws)
 
 blk_qc_t drbd_submit_bio(struct bio *bio)
 {
-	struct drbd_device *device = bio->bi_disk->private_data;
+	struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
 	unsigned long start_jif;
 
 	blk_queue_split(&bio);
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 5357c3a4a36f..d6c821d48090 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1420,7 +1420,7 @@ static blk_qc_t null_submit_bio(struct bio *bio)
 {
 	sector_t sector = bio->bi_iter.bi_sector;
 	sector_t nr_sectors = bio_sectors(bio);
-	struct nullb *nullb = bio->bi_disk->private_data;
+	struct nullb *nullb = bio->bi_bdev->bd_disk->private_data;
 	struct nullb_queue *nq = nullb_to_queue(nullb);
 	struct nullb_cmd *cmd;
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index b8bb8ec7538d..658a0981cb54 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2374,7 +2374,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio)
 
 	blk_queue_split(&bio);
 
-	pd = bio->bi_disk->queue->queuedata;
+	pd = bio->bi_bdev->bd_disk->queue->queuedata;
 	if (!pd) {
 		pr_err("%s incorrect request queue\n", bio_devname(bio, b));
 		goto end_io;
@@ -2418,7 +2418,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio)
 			split = bio;
 		}
 
-		pkt_make_request_write(bio->bi_disk->queue, split);
+		pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split);
 	} while (split != bio);
 
 	return BLK_QC_T_NONE;
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index b71d28372ef3..1d738999fb69 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -581,7 +581,7 @@ out:
 
 static blk_qc_t ps3vram_submit_bio(struct bio *bio)
 {
-	struct ps3_system_bus_device *dev = bio->bi_disk->private_data;
+	struct ps3_system_bus_device *dev = bio->bi_bdev->bd_disk->private_data;
 	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	int busy;
 
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index edacefff6e35..9a28322a8cd8 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -122,7 +122,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
 
 static blk_qc_t rsxx_submit_bio(struct bio *bio)
 {
-	struct rsxx_cardinfo *card = bio->bi_disk->private_data;
+	struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data;
 	struct rsxx_bio_meta *bio_meta;
 	blk_status_t st = BLK_STS_IOERR;
 
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 2b95d7b33b91..982732dbe82e 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -521,7 +521,7 @@ static int mm_check_plugged(struct cardinfo *card)
 
 static blk_qc_t mm_submit_bio(struct bio *bio)
 {
-	struct cardinfo *card = bio->bi_disk->private_data;
+	struct cardinfo *card = bio->bi_bdev->bd_disk->private_data;
 
 	pr_debug("mm_make_request %llu %u\n",
 		 (unsigned long long)bio->bi_iter.bi_sector,
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index e2933cb7a82a..d6243dbc53cc 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1596,7 +1596,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
  */
 static blk_qc_t zram_submit_bio(struct bio *bio)
 {
-	struct zram *zram = bio->bi_disk->private_data;
+	struct zram *zram = bio->bi_bdev->bd_disk->private_data;
 
 	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
 					bio->bi_iter.bi_size)) {
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index b6246f73895c..5924f09c217b 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -49,7 +49,7 @@ struct bio_set pblk_bio_set;
 
 static blk_qc_t pblk_submit_bio(struct bio *bio)
 {
-	struct pblk *pblk = bio->bi_disk->queue->queuedata;
+	struct pblk *pblk = bio->bi_bdev->bd_disk->queue->queuedata;
 
 	if (bio_op(bio) == REQ_OP_DISCARD) {
 		pblk_discard(pblk, bio);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index b00fd08d696b..058dd8014428 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -114,7 +114,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 	check = bio_kmalloc(GFP_NOIO, bio_segments(bio));
 	if (!check)
 		return;
-	check->bi_disk = bio->bi_disk;
+	check->bi_bdev = bio->bi_bdev;
 	check->bi_opf = REQ_OP_READ;
 	check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
 	check->bi_iter.bi_size = bio->bi_iter.bi_size;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 85b1f2a9b72d..dfc35d6d05ed 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -894,7 +894,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	    !(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
 	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
 		reada = min_t(sector_t, dc->readahead >> 9,
-			      get_capacity(bio->bi_disk) - bio_end_sector(bio));
+			      get_capacity(bio->bi_bdev->bd_disk) -
+			      bio_end_sector(bio));
 
 	s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
 
@@ -1167,7 +1168,7 @@ static void quit_max_writeback_rate(struct cache_set *c,
 blk_qc_t cached_dev_submit_bio(struct bio *bio)
 {
 	struct search *s;
-	struct bcache_device *d = bio->bi_disk->private_data;
+	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 	int rw = bio_data_dir(bio);
 
@@ -1274,7 +1275,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
 {
 	struct search *s;
 	struct closure *cl;
-	struct bcache_device *d = bio->bi_disk->private_data;
+	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
 
 	if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
 		bio->bi_status = BLK_STS_IOERR;
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index 2ea0360108e1..a3b71350eec8 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -18,8 +18,7 @@
  */
 
 struct dm_bio_details {
-	struct gendisk *bi_disk;
-	u8 bi_partno;
+	struct block_device *bi_bdev;
 	int __bi_remaining;
 	unsigned long bi_flags;
 	struct bvec_iter bi_iter;
@@ -31,8 +30,7 @@ struct dm_bio_details {
 
 static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
 {
-	bd->bi_disk = bio->bi_disk;
-	bd->bi_partno = bio->bi_partno;
+	bd->bi_bdev = bio->bi_bdev;
 	bd->bi_flags = bio->bi_flags;
 	bd->bi_iter = bio->bi_iter;
 	bd->__bi_remaining = atomic_read(&bio->__bi_remaining);
@@ -44,8 +42,7 @@ static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
 
 static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
 {
-	bio->bi_disk = bd->bi_disk;
-	bio->bi_partno = bd->bi_partno;
+	bio->bi_bdev = bd->bi_bdev;
 	bio->bi_flags = bd->bi_flags;
 	bio->bi_iter = bd->bi_iter;
 	atomic_set(&bio->__bi_remaining, bd->__bi_remaining);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index fa09bc4e4c54..b0a82f29a2e4 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -145,7 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list)
 
 struct dm_raid1_bio_record {
 	struct mirror *m;
-	/* if details->bi_disk == NULL, details were not saved */
+	/* if details->bi_bdev == NULL, details were not saved */
 	struct dm_bio_details details;
 	region_t write_region;
 };
@@ -1190,7 +1190,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
 	struct dm_raid1_bio_record *bio_record =
 	  dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
 
-	bio_record->details.bi_disk = NULL;
+	bio_record->details.bi_bdev = NULL;
 
 	if (rw == WRITE) {
 		/* Save region for mirror_end_io() handler */
@@ -1257,7 +1257,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 		goto out;
 
 	if (unlikely(*error)) {
-		if (!bio_record->details.bi_disk) {
+		if (!bio_record->details.bi_bdev) {
 			/*
 			 * There wasn't enough memory to record necessary
 			 * information for a retry or there was no other
@@ -1282,7 +1282,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 			bd = &bio_record->details;
 
 			dm_bio_restore(bd, bio);
-			bio_record->details.bi_disk = NULL;
+			bio_record->details.bi_bdev = NULL;
 			bio->bi_status = 0;
 
 			queue_bio(ms, bio, rw);
@@ -1292,7 +1292,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	}
 
 out:
-	bio_record->details.bi_disk = NULL;
+	bio_record->details.bi_bdev = NULL;
 
 	return DM_ENDIO_DONE;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7bac564f3faa..479ec5bea09e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -977,16 +977,17 @@ static void clone_endio(struct bio *bio)
 	struct mapped_device *md = tio->io->md;
 	dm_endio_fn endio = tio->ti->type->end_io;
 	struct bio *orig_bio = io->orig_bio;
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 
 	if (unlikely(error == BLK_STS_TARGET)) {
 		if (bio_op(bio) == REQ_OP_DISCARD &&
-		    !bio->bi_disk->queue->limits.max_discard_sectors)
+		    !q->limits.max_discard_sectors)
 			disable_discard(md);
 		else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
-			 !bio->bi_disk->queue->limits.max_write_same_sectors)
+			 !q->limits.max_write_same_sectors)
 			disable_write_same(md);
 		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
-			 !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
+			 !q->limits.max_write_zeroes_sectors)
 			disable_write_zeroes(md);
 	}
 
@@ -996,7 +997,7 @@ static void clone_endio(struct bio *bio)
 	 */
 	if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
 		sector_t written_sector = bio->bi_iter.bi_sector;
-		struct request_queue *q = orig_bio->bi_disk->queue;
+		struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue;
 		u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
 
 		orig_bio->bi_iter.bi_sector += written_sector & mask;
@@ -1422,8 +1423,7 @@ static int __send_empty_flush(struct clone_info *ci)
 	 */
 	bio_init(&flush_bio, NULL, 0);
 	flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
-	flush_bio.bi_disk = ci->io->md->disk;
-	bio_associate_blkg(&flush_bio);
+	bio_set_dev(&flush_bio, ci->io->md->disk->part0);
 
 	ci->bio = &flush_bio;
 	ci->sector_count = 0;
@@ -1626,7 +1626,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
 
 static blk_qc_t dm_submit_bio(struct bio *bio)
 {
-	struct mapped_device *md = bio->bi_disk->private_data;
+	struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
 	blk_qc_t ret = BLK_QC_T_NONE;
 	int srcu_idx;
 	struct dm_table *map;
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 68cac7d19278..63ed8329a98d 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -252,7 +252,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
 		start_sector + data_offset;
 
 	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-		     !blk_queue_discard(bio->bi_disk->queue))) {
+		     !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) {
 		/* Just ignore it */
 		bio_endio(bio);
 	} else {
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 04384452a7ab..cf06dbb1aa53 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -486,7 +486,7 @@ static void md_end_io(struct bio *bio)
 static blk_qc_t md_submit_bio(struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
-	struct mddev *mddev = bio->bi_disk->private_data;
+	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
 
 	if (mddev == NULL || mddev->pers == NULL) {
 		bio_io_error(bio);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 34070ab30a8a..f13290ccc1c2 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -556,7 +556,7 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect
 
 static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
 {
-	atomic_add(nr_sectors, &bio->bi_disk->sync_io);
+	md_sync_acct(bio->bi_bdev, nr_sectors);
 }
 
 struct md_personality
@@ -793,14 +793,14 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
 static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
 {
 	if (bio_op(bio) == REQ_OP_WRITE_SAME &&
-	    !bio->bi_disk->queue->limits.max_write_same_sectors)
+	    !bio->bi_bdev->bd_disk->queue->limits.max_write_same_sectors)
 		mddev->queue->limits.max_write_same_sectors = 0;
 }
 
 static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
 {
 	if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
-	    !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
+	    !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
 		mddev->queue->limits.max_write_zeroes_sectors = 0;
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c0347997f6ff..3b19141cdb4b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -794,13 +794,13 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio)
 
 	while (bio) { /* submit pending writes */
 		struct bio *next = bio->bi_next;
-		struct md_rdev *rdev = (void *)bio->bi_disk;
+		struct md_rdev *rdev = (void *)bio->bi_bdev;
 		bio->bi_next = NULL;
 		bio_set_dev(bio, rdev->bdev);
 		if (test_bit(Faulty, &rdev->flags)) {
 			bio_io_error(bio);
 		} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-				    !blk_queue_discard(bio->bi_disk->queue)))
+				    !blk_queue_discard(bio->bi_bdev->bd_disk->queue)))
 			/* Just ignore it */
 			bio_endio(bio);
 		else
@@ -1520,7 +1520,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
 					      r1_bio->sector);
 		/* flush_pending_writes() needs access to the rdev so...*/
-		mbio->bi_disk = (void *)conf->mirrors[i].rdev;
+		mbio->bi_bdev = (void *)conf->mirrors[i].rdev;
 
 		cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
 		if (cb)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c5d88ef6a45c..be8f14afb6d1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -882,13 +882,13 @@ static void flush_pending_writes(struct r10conf *conf)
 
 		while (bio) { /* submit pending writes */
 			struct bio *next = bio->bi_next;
-			struct md_rdev *rdev = (void*)bio->bi_disk;
+			struct md_rdev *rdev = (void*)bio->bi_bdev;
 			bio->bi_next = NULL;
 			bio_set_dev(bio, rdev->bdev);
 			if (test_bit(Faulty, &rdev->flags)) {
 				bio_io_error(bio);
 			} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
-					    !blk_queue_discard(bio->bi_disk->queue)))
+					    !blk_queue_discard(bio->bi_bdev->bd_disk->queue)))
 				/* Just ignore it */
 				bio_endio(bio);
 			else
@@ -1075,13 +1075,13 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 
 	while (bio) { /* submit pending writes */
 		struct bio *next = bio->bi_next;
-		struct md_rdev *rdev = (void*)bio->bi_disk;
+		struct md_rdev *rdev = (void*)bio->bi_bdev;
 		bio->bi_next = NULL;
 		bio_set_dev(bio, rdev->bdev);
 		if (test_bit(Faulty, &rdev->flags)) {
 			bio_io_error(bio);
 		} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
-				    !blk_queue_discard(bio->bi_disk->queue)))
+				    !blk_queue_discard(bio->bi_bdev->bd_disk->queue)))
 			/* Just ignore it */
 			bio_endio(bio);
 		else
@@ -1253,7 +1253,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 		trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
 				      r10_bio->sector);
 	/* flush_pending_writes() needs access to the rdev so...*/
-	mbio->bi_disk = (void *)rdev;
+	mbio->bi_bdev = (void *)rdev;
 
 	atomic_inc(&r10_bio->remaining);
 
@@ -3003,7 +3003,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
-	 * have bi_end_io, bi_sector, bi_disk set,
+	 * have bi_end_io, bi_sector, bi_bdev set,
 	 * and bi_private set to the r10bio.
 	 * For recovery, we may actually create several r10bios
 	 * with 2 bios in each, that correspond to the bios in the main one.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3a90cc0e43ca..f411b9e5c332 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5310,7 +5310,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
 	unsigned int chunk_sectors;
 	unsigned int bio_sectors = bio_sectors(bio);
 
-	WARN_ON_ONCE(bio->bi_partno);
+	WARN_ON_ONCE(bio->bi_bdev->bd_partno);
 
 	chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
 	return  chunk_sectors >=
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 22e5617b2cea..e03a1f38d750 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -165,7 +165,7 @@ static int nsblk_do_bvec(struct nd_namespace_blk *nsblk,
 static blk_qc_t nd_blk_submit_bio(struct bio *bio)
 {
 	struct bio_integrity_payload *bip;
-	struct nd_namespace_blk *nsblk = bio->bi_disk->private_data;
+	struct nd_namespace_blk *nsblk = bio->bi_bdev->bd_disk->private_data;
 	struct bvec_iter iter;
 	unsigned long start;
 	struct bio_vec bvec;
@@ -177,7 +177,7 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio)
 
 	bip = bio_integrity(bio);
 	rw = bio_data_dir(bio);
-	do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+	do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
 	if (do_acct)
 		start = bio_start_io_acct(bio);
 	bio_for_each_segment(bvec, bio, iter) {
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 12ff6f8784ac..41aa1f01fc07 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1442,7 +1442,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
 static blk_qc_t btt_submit_bio(struct bio *bio)
 {
 	struct bio_integrity_payload *bip = bio_integrity(bio);
-	struct btt *btt = bio->bi_disk->private_data;
+	struct btt *btt = bio->bi_bdev->bd_disk->private_data;
 	struct bvec_iter iter;
 	unsigned long start;
 	struct bio_vec bvec;
@@ -1452,7 +1452,7 @@ static blk_qc_t btt_submit_bio(struct bio *bio)
 	if (!bio_integrity_prep(bio))
 		return BLK_QC_T_NONE;
 
-	do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+	do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
 	if (do_acct)
 		start = bio_start_io_acct(bio);
 	bio_for_each_segment(bvec, bio, iter) {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 875076b0ea6c..72740835c85c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -197,13 +197,13 @@ static blk_qc_t pmem_submit_bio(struct bio *bio)
 	unsigned long start;
 	struct bio_vec bvec;
 	struct bvec_iter iter;
-	struct pmem_device *pmem = bio->bi_disk->private_data;
+	struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data;
 	struct nd_region *nd_region = to_region(pmem);
 
 	if (bio->bi_opf & REQ_PREFLUSH)
 		ret = nvdimm_flush(nd_region, bio);
 
-	do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+	do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
 	if (do_acct)
 		start = bio_start_io_acct(bio);
 	bio_for_each_segment(bvec, bio, iter) {
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 566788ba4e7d..a39befb4deba 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1113,7 +1113,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 {
 	bool write = nvme_is_write(cmd);
 	struct nvme_ns *ns = q->queuedata;
-	struct gendisk *disk = ns ? ns->disk : NULL;
+	struct block_device *bdev = ns ? ns->disk->part0 : NULL;
 	struct request *req;
 	struct bio *bio = NULL;
 	void *meta = NULL;
@@ -1133,8 +1133,8 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 		if (ret)
 			goto out;
 		bio = req->bio;
-		bio->bi_disk = disk;
-		if (disk && meta_buffer && meta_len) {
+		bio->bi_bdev = bdev;
+		if (bdev && meta_buffer && meta_len) {
 			meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
 					meta_seed, write);
 			if (IS_ERR(meta)) {
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 470cef3abec3..6c8eab8de288 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -757,7 +757,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
 {
 	bool write = nvme_is_write((struct nvme_command *)vcmd);
 	struct nvm_dev *dev = ns->ndev;
-	struct gendisk *disk = ns->disk;
 	struct request *rq;
 	struct bio *bio = NULL;
 	__le64 *ppa_list = NULL;
@@ -817,7 +816,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
 			vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma);
 		}
 
-		bio->bi_disk = disk;
+		bio->bi_bdev = ns->disk->part0;
 	}
 
 	blk_execute_rq(q, NULL, rq, 0);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 9ac762b28811..a6d44e7a775f 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -296,7 +296,7 @@ static bool nvme_available_path(struct nvme_ns_head *head)
 
 blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
 {
-	struct nvme_ns_head *head = bio->bi_disk->private_data;
+	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
 	struct device *dev = disk_to_dev(head->disk);
 	struct nvme_ns *ns;
 	blk_qc_t ret = BLK_QC_T_NONE;
@@ -312,7 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
 	if (likely(ns)) {
-		bio->bi_disk = ns->disk;
+		bio->bi_bdev = ns->disk->part0;
 		bio->bi_opf |= REQ_NVME_MPATH;
 		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
 				      bio->bi_iter.bi_sector);
@@ -352,7 +352,7 @@ static void nvme_requeue_work(struct work_struct *work)
 		 * Reset disk to the mpath node and resubmit to select a new
 		 * path.
 		 */
-		bio->bi_disk = head->disk;
+		bio->bi_bdev = head->disk->part0;
 		submit_bio_noacct(bio);
 	}
 }
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b7ce4f221d99..f5ef3edeb2fd 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1468,7 +1468,7 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
 	if (unlikely(nr))
 		goto mr_put;
 
-	nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c,
+	nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
 				req->mr->sig_attrs, ns->pi_type);
 	nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
 
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 5c5cff3f2374..da33cb4cba28 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -879,7 +879,7 @@ dcssblk_submit_bio(struct bio *bio)
 	blk_queue_split(&bio);
 
 	bytes_done = 0;
-	dev_info = bio->bi_disk->private_data;
+	dev_info = bio->bi_bdev->bd_disk->private_data;
 	if (dev_info == NULL)
 		goto fail;
 	if ((bio->bi_iter.bi_sector & 7) != 0 ||
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index c2536f7767b3..d1ed39162943 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -184,7 +184,7 @@ static unsigned long xpram_highest_page_index(void)
  */
 static blk_qc_t xpram_submit_bio(struct bio *bio)
 {
-	xpram_device_t *xdev = bio->bi_disk->private_data;
+	xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data;
 	struct bio_vec bvec;
 	struct bvec_iter iter;
 	unsigned int index;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 6ff44e53814c..113cb85c1fd4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2674,7 +2674,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
 	mutex_lock(&btrfsic_mutex);
 	/* since btrfsic_submit_bio() is also called before
 	 * btrfsic_mount(), this might return NULL */
-	dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno);
+	dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
 	if (NULL != dev_state &&
 	    (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
 		unsigned int i = 0;
@@ -2690,9 +2690,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
 		bio_is_patched = 0;
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
-			pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n",
+			pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
 			       bio_op(bio), bio->bi_opf, segs,
-			       bio->bi_iter.bi_sector, dev_bytenr, bio->bi_disk);
+			       bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
 
 		mapped_datav = kmalloc_array(segs,
 					     sizeof(*mapped_datav), GFP_NOFS);
@@ -2721,8 +2721,8 @@ static void __btrfsic_submit_bio(struct bio *bio)
 	} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
-			pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n",
-			       bio_op(bio), bio->bi_opf, bio->bi_disk);
+			pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+			       bio_op(bio), bio->bi_opf, bio->bi_bdev);
 		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
 			if ((dev_state->state->print_mask &
 			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 93fbf87bdc8d..b2204a2942cb 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1105,8 +1105,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 		 * devices or if they are not contiguous
 		 */
 		if (last_end == disk_start && !last->bi_status &&
-		    last->bi_disk == stripe->dev->bdev->bd_disk &&
-		    last->bi_partno == stripe->dev->bdev->bd_partno) {
+		    last->bi_bdev == stripe->dev->bdev) {
 			ret = bio_add_page(last, page, PAGE_SIZE, 0);
 			if (ret == PAGE_SIZE)
 				return 0;
@@ -1357,9 +1356,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
 	for (i = 0; i < rbio->bbio->num_stripes; i++) {
 		stripe = &rbio->bbio->stripes[i];
 		if (in_range(physical, stripe->physical, rbio->stripe_len) &&
-		    stripe->dev->bdev &&
-		    bio->bi_disk == stripe->dev->bdev->bd_disk &&
-		    bio->bi_partno == stripe->dev->bdev->bd_partno) {
+		    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
 			return i;
 		}
 	}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5f4f88a4d2c8..33f8f0f108bf 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1695,7 +1695,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
 
 	sbio = sctx->wr_curr_bio;
 	sctx->wr_curr_bio = NULL;
-	WARN_ON(!sbio->bio->bi_disk);
+	WARN_ON(!sbio->bio->bi_bdev);
 	scrub_pending_bio_inc(sctx);
 	/* process all writes in a single worker thread. Then the block layer
 	 * orders the requests before sending them to the driver which
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d53fa92a1ab6..2660e744da2d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -434,7 +434,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty)
 		bio_set_pages_dirty(bio);
 
-	dio->bio_disk = bio->bi_disk;
+	dio->bio_disk = bio->bi_bdev->bd_disk;
 
 	if (sdio->submit_io) {
 		sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index aa34d620bec9..8cbf03159752 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -427,16 +427,6 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
 	return 0;
 }
 
-/*
- * Return true, if pre_bio's bdev is same as its target device.
- */
-static bool __same_bdev(struct f2fs_sb_info *sbi,
-				block_t blk_addr, struct bio *bio)
-{
-	struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL);
-	return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno;
-}
-
 static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
 {
 	struct f2fs_sb_info *sbi = fio->sbi;
@@ -741,7 +731,7 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
 		return false;
 	if (last_blkaddr + 1 != cur_blkaddr)
 		return false;
-	return __same_bdev(sbi, cur_blkaddr, bio);
+	return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL);
 }
 
 static bool io_type_is_mergeable(struct f2fs_bio_info *io,
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1edda614f7ce..12af7aa5db37 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -483,24 +483,22 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 extern const char *bio_devname(struct bio *bio, char *buffer);
 
-#define bio_set_dev(bio, bdev) 			\
-do {						\
-	if ((bio)->bi_disk != (bdev)->bd_disk)	\
-		bio_clear_flag(bio, BIO_THROTTLED);\
-	(bio)->bi_disk = (bdev)->bd_disk;	\
-	(bio)->bi_partno = (bdev)->bd_partno;	\
-	bio_associate_blkg(bio);		\
+#define bio_set_dev(bio, bdev) 				\
+do {							\
+	if ((bio)->bi_bdev != (bdev))			\
+		bio_clear_flag(bio, BIO_THROTTLED);	\
+	(bio)->bi_bdev = (bdev);			\
+	bio_associate_blkg(bio);			\
 } while (0)
 
 #define bio_copy_dev(dst, src)			\
 do {						\
-	(dst)->bi_disk = (src)->bi_disk;	\
-	(dst)->bi_partno = (src)->bi_partno;	\
+	(dst)->bi_bdev = (src)->bi_bdev;	\
 	bio_clone_blkg_association(dst, src);	\
 } while (0)
 
 #define bio_dev(bio) \
-	disk_devt((bio)->bi_disk)
+	disk_devt((bio)->bi_bdev->bd_disk)
 
 #ifdef CONFIG_BLK_CGROUP
 void bio_associate_blkg(struct bio *bio);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d705b174d346..6b410dab48ee 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -602,8 +602,8 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
 	rq->bio = rq->biotail = bio;
 	rq->ioprio = bio_prio(bio);
 
-	if (bio->bi_disk)
-		rq->rq_disk = bio->bi_disk;
+	if (bio->bi_bdev)
+		rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
 blk_qc_t blk_mq_submit_bio(struct bio *bio);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 866f74261b3b..8ebd8be3e050 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -222,7 +222,7 @@ static inline void bio_issue_init(struct bio_issue *issue,
  */
 struct bio {
 	struct bio		*bi_next;	/* request queue link */
-	struct gendisk		*bi_disk;
+	struct block_device	*bi_bdev;
 	unsigned int		bi_opf;		/* bottom bits req flags,
 						 * top bits REQ_OP. Use
 						 * accessors.
@@ -231,7 +231,6 @@ struct bio {
 	unsigned short		bi_ioprio;
 	unsigned short		bi_write_hint;
 	blk_status_t		bi_status;
-	u8			bi_partno;
 	atomic_t		__bi_remaining;
 
 	struct bvec_iter	bi_iter;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f94ee3089e01..b55bd534b2e1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1967,7 +1967,8 @@ void part_end_io_acct(struct block_device *part, struct bio *bio,
  */
 static inline unsigned long bio_start_io_acct(struct bio *bio)
 {
-	return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio));
+	return disk_start_io_acct(bio->bi_bdev->bd_disk, bio_sectors(bio),
+				  bio_op(bio));
 }
 
 /**
@@ -1977,7 +1978,7 @@ static inline unsigned long bio_start_io_acct(struct bio *bio)
  */
 static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
 {
-	return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time);
+	return disk_end_io_acct(bio->bi_bdev->bd_disk, bio_op(bio), start_time);
 }
 
 int bdev_read_only(struct block_device *bdev);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb0fe4c66b84..9e9ee4945043 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -903,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 
 static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
 {
-	blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0);
+	blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
 }
 
 static void blk_add_trace_bio_complete(void *ignore,
@@ -915,22 +915,24 @@ static void blk_add_trace_bio_complete(void *ignore,
 
 static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
 {
-	blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0);
+	blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE,
+			0);
 }
 
 static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
 {
-	blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0);
+	blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE,
+			0);
 }
 
 static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
 {
-	blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0);
+	blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0);
 }
 
 static void blk_add_trace_getrq(void *ignore, struct bio *bio)
 {
-	blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0);
+	blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0);
 }
 
 static void blk_add_trace_plug(void *ignore, struct request_queue *q)
@@ -967,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
 
 static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
 {
-	struct request_queue *q = bio->bi_disk->queue;
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	struct blk_trace *bt;
 
 	rcu_read_lock();
@@ -997,7 +999,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
 static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
 				    sector_t from)
 {
-	struct request_queue *q = bio->bi_disk->queue;
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	struct blk_trace *bt;
 	struct blk_io_trace_remap r;
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 9bca17ecc4df..a75f35464a4e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -433,7 +433,7 @@ int swap_readpage(struct page *page, bool synchronous)
 		ret = -ENOMEM;
 		goto out;
 	}
-	disk = bio->bi_disk;
+	disk = bio->bi_bdev->bd_disk;
 	/*
 	 * Keep this task valid during swap readpage because the oom killer may
 	 * attempt to access it in the page fault retry time check.
-- 
cgit v1.2.3


From 30c5d3456c272f0de0d7e7eb9fc355fa64a5f649 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 24 Jan 2021 11:02:36 +0100
Subject: block: do not reassig ->bi_bdev when partition remapping

There is no good reason to reassign ->bi_bdev when remapping the
partition-relative block number to the device wide one, as all the
information required by the drivers comes from the gendisk anyway.

Keeping the original ->bi_bdev alive will allow to greatly simplify
the partition-away I/O accounting.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c          | 5 +++--
 include/linux/bio.h       | 2 ++
 include/linux/blk_types.h | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 64f69022de96..1c1b97a82caa 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -752,7 +752,7 @@ static int blk_partition_remap(struct bio *bio)
 				      bio->bi_iter.bi_sector -
 				      p->bd_start_sect);
 	}
-	bio->bi_bdev = bdev_whole(p);
+	bio_set_flag(bio, BIO_REMAPPED);
 	return 0;
 }
 
@@ -817,7 +817,8 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 		goto end_io;
 	if (unlikely(bio_check_eod(bio)))
 		goto end_io;
-	if (bio->bi_bdev->bd_partno && unlikely(blk_partition_remap(bio)))
+	if (bio->bi_bdev->bd_partno && !bio_flagged(bio, BIO_REMAPPED) &&
+	    unlikely(blk_partition_remap(bio)))
 		goto end_io;
 
 	/*
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 12af7aa5db37..2f1155eabaff 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -485,6 +485,7 @@ extern const char *bio_devname(struct bio *bio, char *buffer);
 
 #define bio_set_dev(bio, bdev) 				\
 do {							\
+	bio_clear_flag(bio, BIO_REMAPPED);		\
 	if ((bio)->bi_bdev != (bdev))			\
 		bio_clear_flag(bio, BIO_THROTTLED);	\
 	(bio)->bi_bdev = (bdev);			\
@@ -493,6 +494,7 @@ do {							\
 
 #define bio_copy_dev(dst, src)			\
 do {						\
+	bio_clear_flag(dst, BIO_REMAPPED);		\
 	(dst)->bi_bdev = (src)->bi_bdev;	\
 	bio_clone_blkg_association(dst, src);	\
 } while (0)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8ebd8be3e050..1bc6f6a01070 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -303,6 +303,7 @@ enum {
 				 * of this bio. */
 	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
 	BIO_TRACKED,		/* set if bio goes through the rq_qos path */
+	BIO_REMAPPED,
 	BIO_FLAG_LAST
 };
 
-- 
cgit v1.2.3


From 99dfc43ecbf67f12a06512918aaba61d55863efc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 24 Jan 2021 11:02:37 +0100
Subject: block: use ->bi_bdev for bio based I/O accounting

Rework the I/O accounting for bio based drivers to use ->bi_bdev.  This
means all drivers can now simply use bio_start_io_acct to start
accounting, and it will take partitions into account automatically.  To
end I/O account either bio_end_io_acct can be used if the driver never
remaps I/O to a different device, or bio_end_io_acct_remapped if the
driver did remap the I/O.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c            | 23 +++++++++++++----------
 drivers/md/bcache/request.c | 34 +++++++++++++++++++++-------------
 drivers/md/md.c             |  8 ++++----
 include/linux/blkdev.h      | 21 ++++-----------------
 4 files changed, 42 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 1c1b97a82caa..9315311c27a9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1321,14 +1321,17 @@ static unsigned long __part_start_io_acct(struct block_device *part,
 	return now;
 }
 
-unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part,
-				 struct bio *bio)
+/**
+ * bio_start_io_acct - start I/O accounting for bio based drivers
+ * @bio:	bio to start account for
+ *
+ * Returns the start time that should be passed back to bio_end_io_acct().
+ */
+unsigned long bio_start_io_acct(struct bio *bio)
 {
-	*part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
-
-	return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio));
+	return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio));
 }
-EXPORT_SYMBOL_GPL(part_start_io_acct);
+EXPORT_SYMBOL_GPL(bio_start_io_acct);
 
 unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
 				 unsigned int op)
@@ -1351,12 +1354,12 @@ static void __part_end_io_acct(struct block_device *part, unsigned int op,
 	part_stat_unlock();
 }
 
-void part_end_io_acct(struct block_device *part, struct bio *bio,
-		      unsigned long start_time)
+void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
+		struct block_device *orig_bdev)
 {
-	__part_end_io_acct(part, bio_op(bio), start_time);
+	__part_end_io_acct(orig_bdev, bio_op(bio), start_time);
 }
-EXPORT_SYMBOL_GPL(part_end_io_acct);
+EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
 
 void disk_end_io_acct(struct gendisk *disk, unsigned int op,
 		      unsigned long start_time)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index dfc35d6d05ed..29c231758293 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -475,7 +475,7 @@ struct search {
 	unsigned int		read_dirty_data:1;
 	unsigned int		cache_missed:1;
 
-	struct block_device	*part;
+	struct block_device	*orig_bdev;
 	unsigned long		start_time;
 
 	struct btree_op		op;
@@ -670,8 +670,8 @@ static void bio_complete(struct search *s)
 {
 	if (s->orig_bio) {
 		/* Count on bcache device */
-		part_end_io_acct(s->part, s->orig_bio, s->start_time);
-
+		bio_end_io_acct_remapped(s->orig_bio, s->start_time,
+					 s->orig_bdev);
 		trace_bcache_request_end(s->d, s->orig_bio);
 		s->orig_bio->bi_status = s->iop.status;
 		bio_endio(s->orig_bio);
@@ -714,7 +714,8 @@ static void search_free(struct closure *cl)
 }
 
 static inline struct search *search_alloc(struct bio *bio,
-					  struct bcache_device *d)
+		struct bcache_device *d, struct block_device *orig_bdev,
+		unsigned long start_time)
 {
 	struct search *s;
 
@@ -732,7 +733,8 @@ static inline struct search *search_alloc(struct bio *bio,
 	s->write		= op_is_write(bio_op(bio));
 	s->read_dirty_data	= 0;
 	/* Count on the bcache device */
-	s->start_time		= part_start_io_acct(d->disk, &s->part, bio);
+	s->orig_bdev		= orig_bdev;
+	s->start_time		= start_time;
 	s->iop.c		= d->c;
 	s->iop.bio		= NULL;
 	s->iop.inode		= d->id;
@@ -1074,7 +1076,7 @@ struct detached_dev_io_private {
 	unsigned long		start_time;
 	bio_end_io_t		*bi_end_io;
 	void			*bi_private;
-	struct block_device	*part;
+	struct block_device	*orig_bdev;
 };
 
 static void detached_dev_end_io(struct bio *bio)
@@ -1086,7 +1088,7 @@ static void detached_dev_end_io(struct bio *bio)
 	bio->bi_private = ddip->bi_private;
 
 	/* Count on the bcache device */
-	part_end_io_acct(ddip->part, bio, ddip->start_time);
+	bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev);
 
 	if (bio->bi_status) {
 		struct cached_dev *dc = container_of(ddip->d,
@@ -1099,7 +1101,8 @@ static void detached_dev_end_io(struct bio *bio)
 	bio->bi_end_io(bio);
 }
 
-static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
+static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
+		struct block_device *orig_bdev, unsigned long start_time)
 {
 	struct detached_dev_io_private *ddip;
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
@@ -1112,7 +1115,8 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
 	ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
 	ddip->d = d;
 	/* Count on the bcache device */
-	ddip->start_time = part_start_io_acct(d->disk, &ddip->part, bio);
+	ddip->orig_bdev = orig_bdev;
+	ddip->start_time = start_time;
 	ddip->bi_end_io = bio->bi_end_io;
 	ddip->bi_private = bio->bi_private;
 	bio->bi_end_io = detached_dev_end_io;
@@ -1168,8 +1172,10 @@ static void quit_max_writeback_rate(struct cache_set *c,
 blk_qc_t cached_dev_submit_bio(struct bio *bio)
 {
 	struct search *s;
-	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+	struct block_device *orig_bdev = bio->bi_bdev;
+	struct bcache_device *d = orig_bdev->bd_disk->private_data;
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+	unsigned long start_time;
 	int rw = bio_data_dir(bio);
 
 	if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) ||
@@ -1194,11 +1200,13 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
 		}
 	}
 
+	start_time = bio_start_io_acct(bio);
+
 	bio_set_dev(bio, dc->bdev);
 	bio->bi_iter.bi_sector += dc->sb.data_offset;
 
 	if (cached_dev_get(dc)) {
-		s = search_alloc(bio, d);
+		s = search_alloc(bio, d, orig_bdev, start_time);
 		trace_bcache_request_start(s->d, bio);
 
 		if (!bio->bi_iter.bi_size) {
@@ -1219,7 +1227,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio)
 		}
 	} else
 		/* I/O request sent to backing device */
-		detached_dev_do_request(d, bio);
+		detached_dev_do_request(d, bio, orig_bdev, start_time);
 
 	return BLK_QC_T_NONE;
 }
@@ -1283,7 +1291,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	s = search_alloc(bio, d);
+	s = search_alloc(bio, d, bio->bi_bdev, bio_start_io_acct(bio));
 	cl = &s->cl;
 	bio = &s->bio.bio;
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index cf06dbb1aa53..7d1bb24add31 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -463,8 +463,8 @@ struct md_io {
 	struct mddev *mddev;
 	bio_end_io_t *orig_bi_end_io;
 	void *orig_bi_private;
+	struct block_device *orig_bi_bdev;
 	unsigned long start_time;
-	struct block_device *part;
 };
 
 static void md_end_io(struct bio *bio)
@@ -472,7 +472,7 @@ static void md_end_io(struct bio *bio)
 	struct md_io *md_io = bio->bi_private;
 	struct mddev *mddev = md_io->mddev;
 
-	part_end_io_acct(md_io->part, bio, md_io->start_time);
+	bio_end_io_acct_remapped(bio, md_io->start_time, md_io->orig_bi_bdev);
 
 	bio->bi_end_io = md_io->orig_bi_end_io;
 	bio->bi_private = md_io->orig_bi_private;
@@ -514,12 +514,12 @@ static blk_qc_t md_submit_bio(struct bio *bio)
 		md_io->mddev = mddev;
 		md_io->orig_bi_end_io = bio->bi_end_io;
 		md_io->orig_bi_private = bio->bi_private;
+		md_io->orig_bi_bdev = bio->bi_bdev;
 
 		bio->bi_end_io = md_end_io;
 		bio->bi_private = md_io;
 
-		md_io->start_time = part_start_io_acct(mddev->gendisk,
-						       &md_io->part, bio);
+		md_io->start_time = bio_start_io_acct(bio);
 	}
 
 	/* bio could be mergeable after passing to underlayer */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b55bd534b2e1..4526b9ef8edb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1954,22 +1954,9 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
 void disk_end_io_acct(struct gendisk *disk, unsigned int op,
 		unsigned long start_time);
 
-unsigned long part_start_io_acct(struct gendisk *disk,
-		struct block_device **part, struct bio *bio);
-void part_end_io_acct(struct block_device *part, struct bio *bio,
-		      unsigned long start_time);
-
-/**
- * bio_start_io_acct - start I/O accounting for bio based drivers
- * @bio:	bio to start account for
- *
- * Returns the start time that should be passed back to bio_end_io_acct().
- */
-static inline unsigned long bio_start_io_acct(struct bio *bio)
-{
-	return disk_start_io_acct(bio->bi_bdev->bd_disk, bio_sectors(bio),
-				  bio_op(bio));
-}
+unsigned long bio_start_io_acct(struct bio *bio);
+void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
+		struct block_device *orig_bdev);
 
 /**
  * bio_end_io_acct - end I/O accounting for bio based drivers
@@ -1978,7 +1965,7 @@ static inline unsigned long bio_start_io_acct(struct bio *bio)
  */
 static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
 {
-	return disk_end_io_acct(bio->bi_bdev->bd_disk, bio_op(bio), start_time);
+	return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev);
 }
 
 int bdev_read_only(struct block_device *bdev);
-- 
cgit v1.2.3


From bc359d03c7ec1bf3b86d03bafaf6bbb21e6414fd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 24 Jan 2021 11:02:39 +0100
Subject: block: add a disk_uevent helper

Add a helper to call kobject_uevent for the disk and all partitions, and
unexport the disk_part_iter_* helpers that are now only used in the core
block code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c             | 27 ++++++++++++++-------------
 drivers/s390/block/dasd.c | 26 +++++---------------------
 include/linux/genhd.h     |  2 ++
 3 files changed, 21 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index e46de616a19e..7094612c7510 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -203,7 +203,6 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
 
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL_GPL(disk_part_iter_init);
 
 /**
  * disk_part_iter_next - proceed iterator to the next partition and return it
@@ -266,7 +265,6 @@ struct block_device *disk_part_iter_next(struct disk_part_iter *piter)
 
 	return piter->part;
 }
-EXPORT_SYMBOL_GPL(disk_part_iter_next);
 
 /**
  * disk_part_iter_exit - finish up partition iteration
@@ -283,7 +281,6 @@ void disk_part_iter_exit(struct disk_part_iter *piter)
 		bdput(piter->part);
 	piter->part = NULL;
 }
-EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 
 /**
  * disk_has_partitions
@@ -555,6 +552,18 @@ static char *bdevt_str(dev_t devt, char *buf)
 	return buf;
 }
 
+void disk_uevent(struct gendisk *disk, enum kobject_action action)
+{
+	struct disk_part_iter piter;
+	struct block_device *part;
+
+	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+	while ((part = disk_part_iter_next(&piter)))
+		kobject_uevent(bdev_kobj(part), action);
+	disk_part_iter_exit(&piter);
+}
+EXPORT_SYMBOL_GPL(disk_uevent);
+
 static void disk_scan_partitions(struct gendisk *disk)
 {
 	struct block_device *bdev;
@@ -572,8 +581,6 @@ static void register_disk(struct device *parent, struct gendisk *disk,
 			  const struct attribute_group **groups)
 {
 	struct device *ddev = disk_to_dev(disk);
-	struct disk_part_iter piter;
-	struct block_device *part;
 	int err;
 
 	ddev->parent = parent;
@@ -616,15 +623,9 @@ static void register_disk(struct device *parent, struct gendisk *disk,
 
 	disk_scan_partitions(disk);
 
-	/* announce disk after possible partitions are created */
+	/* announce the disk and partitions after all partitions are created */
 	dev_set_uevent_suppress(ddev, 0);
-	kobject_uevent(&ddev->kobj, KOBJ_ADD);
-
-	/* announce possible partitions */
-	disk_part_iter_init(&piter, disk, 0);
-	while ((part = disk_part_iter_next(&piter)))
-		kobject_uevent(bdev_kobj(part), KOBJ_ADD);
-	disk_part_iter_exit(&piter);
+	disk_uevent(disk, KOBJ_ADD);
 
 	if (disk->queue->backing_dev_info->dev) {
 		err = sysfs_create_link(&ddev->kobj,
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index c7eb9a10c680..28c04a4efa66 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -428,23 +428,15 @@ static int dasd_state_unfmt_to_basic(struct dasd_device *device)
 static int
 dasd_state_ready_to_online(struct dasd_device * device)
 {
-	struct gendisk *disk;
-	struct disk_part_iter piter;
-	struct block_device *part;
-
 	device->state = DASD_STATE_ONLINE;
 	if (device->block) {
 		dasd_schedule_block_bh(device->block);
 		if ((device->features & DASD_FEATURE_USERAW)) {
-			disk = device->block->gdp;
-			kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+			kobject_uevent(&disk_to_dev(device->block->gdp)->kobj,
+					KOBJ_CHANGE);
 			return 0;
 		}
-		disk = device->block->bdev->bd_disk;
-		disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
-		while ((part = disk_part_iter_next(&piter)))
-			kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
-		disk_part_iter_exit(&piter);
+		disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE);
 	}
 	return 0;
 }
@@ -455,9 +447,6 @@ dasd_state_ready_to_online(struct dasd_device * device)
 static int dasd_state_online_to_ready(struct dasd_device *device)
 {
 	int rc;
-	struct gendisk *disk;
-	struct disk_part_iter piter;
-	struct block_device *part;
 
 	if (device->discipline->online_to_ready) {
 		rc = device->discipline->online_to_ready(device);
@@ -466,13 +455,8 @@ static int dasd_state_online_to_ready(struct dasd_device *device)
 	}
 
 	device->state = DASD_STATE_READY;
-	if (device->block && !(device->features & DASD_FEATURE_USERAW)) {
-		disk = device->block->bdev->bd_disk;
-		disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
-		while ((part = disk_part_iter_next(&piter)))
-			kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
-		disk_part_iter_exit(&piter);
-	}
+	if (device->block && !(device->features & DASD_FEATURE_USERAW))
+		disk_uevent(device->block->bdev->bd_disk, KOBJ_CHANGE);
 	return 0;
 }
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index a62ccbfac54b..670eaef0e876 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -213,6 +213,8 @@ static inline dev_t disk_devt(struct gendisk *disk)
 	return MKDEV(disk->major, disk->first_minor);
 }
 
+void disk_uevent(struct gendisk *disk, enum kobject_action action);
+
 /*
  * Smarter partition iterator without context limits.
  */
-- 
cgit v1.2.3


From 0470dd9d5f103e7f1d5ba8f755f687c3106c7df1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 24 Jan 2021 11:02:40 +0100
Subject: block: remove DISK_PITER_REVERSE

There is good reason to iterate backwards when deleting all partitions in
del_gendisk, just like we don't in blk_drop_partitions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c         | 37 +++++++------------------------------
 include/linux/genhd.h |  1 -
 2 files changed, 7 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 7094612c7510..1832add5c738 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -184,24 +184,13 @@ static struct block_device *__disk_get_part(struct gendisk *disk, int partno)
 void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
 			  unsigned int flags)
 {
-	struct disk_part_tbl *ptbl;
-
-	rcu_read_lock();
-	ptbl = rcu_dereference(disk->part_tbl);
-
 	piter->disk = disk;
 	piter->part = NULL;
-
-	if (flags & DISK_PITER_REVERSE)
-		piter->idx = ptbl->len - 1;
-	else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
+	if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
 		piter->idx = 0;
 	else
 		piter->idx = 1;
-
 	piter->flags = flags;
-
-	rcu_read_unlock();
 }
 
 /**
@@ -216,7 +205,6 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
 struct block_device *disk_part_iter_next(struct disk_part_iter *piter)
 {
 	struct disk_part_tbl *ptbl;
-	int inc, end;
 
 	/* put the last partition */
 	disk_part_iter_exit(piter);
@@ -225,21 +213,8 @@ struct block_device *disk_part_iter_next(struct disk_part_iter *piter)
 	rcu_read_lock();
 	ptbl = rcu_dereference(piter->disk->part_tbl);
 
-	/* determine iteration parameters */
-	if (piter->flags & DISK_PITER_REVERSE) {
-		inc = -1;
-		if (piter->flags & (DISK_PITER_INCL_PART0 |
-				    DISK_PITER_INCL_EMPTY_PART0))
-			end = -1;
-		else
-			end = 0;
-	} else {
-		inc = 1;
-		end = ptbl->len;
-	}
-
 	/* iterate to the next partition */
-	for (; piter->idx != end; piter->idx += inc) {
+	for (; piter->idx != ptbl->len; piter->idx += 1) {
 		struct block_device *part;
 
 		part = rcu_dereference(ptbl->part[piter->idx]);
@@ -257,7 +232,10 @@ struct block_device *disk_part_iter_next(struct disk_part_iter *piter)
 			continue;
 		}
 
-		piter->idx += inc;
+		piter->part = bdgrab(part);
+		if (!piter->part)
+			continue;
+		piter->idx += 1;
 		break;
 	}
 
@@ -781,8 +759,7 @@ void del_gendisk(struct gendisk *disk)
 	down_write(&bdev_lookup_sem);
 
 	/* invalidate stuff */
-	disk_part_iter_init(&piter, disk,
-			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
+	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
 	while ((part = disk_part_iter_next(&piter))) {
 		invalidate_partition(part);
 		delete_partition(part);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 670eaef0e876..51609133c9a3 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -218,7 +218,6 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action);
 /*
  * Smarter partition iterator without context limits.
  */
-#define DISK_PITER_REVERSE	(1 << 0) /* iterate in the reverse direction */
 #define DISK_PITER_INCL_EMPTY	(1 << 1) /* include 0-sized parts */
 #define DISK_PITER_INCL_PART0	(1 << 2) /* include partition 0 */
 #define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */
-- 
cgit v1.2.3


From a33df75c6328bf40078b35f2040d8e54d574c357 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 24 Jan 2021 11:02:41 +0100
Subject: block: use an xarray for disk->part_tbl

Now that no fast path lookups in the partition table are left, there is
no point in micro-optimizing the data structure for it.  Just use a bog
standard xarray.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-settings.c    |   2 +-
 block/blk.h             |   1 -
 block/genhd.c           | 163 ++++--------------------------------------------
 block/partitions/core.c |  31 ++-------
 include/linux/genhd.h   |  18 +-----
 5 files changed, 22 insertions(+), 193 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 43990b1d148b..4c974340f1a9 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -865,7 +865,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 		 * we do nothing special as far as the block layer is concerned.
 		 */
 		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
-		    disk_has_partitions(disk))
+		    !xa_empty(&disk->part_tbl))
 			model = BLK_ZONED_NONE;
 		break;
 	case BLK_ZONED_NONE:
diff --git a/block/blk.h b/block/blk.h
index d965cacc5bda..ab0aaf958553 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -345,7 +345,6 @@ int bdev_add_partition(struct block_device *bdev, int partno,
 int bdev_del_partition(struct block_device *bdev, int partno);
 int bdev_resize_partition(struct block_device *bdev, int partno,
 		sector_t start, sector_t length);
-int disk_expand_part_tbl(struct gendisk *disk, int target);
 
 int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
diff --git a/block/genhd.c b/block/genhd.c
index 1832add5c738..d3ef29fbc536 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -161,15 +161,6 @@ static void part_in_flight_rw(struct block_device *part,
 		inflight[1] = 0;
 }
 
-static struct block_device *__disk_get_part(struct gendisk *disk, int partno)
-{
-	struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
-
-	if (unlikely(partno < 0 || partno >= ptbl->len))
-		return NULL;
-	return rcu_dereference(ptbl->part[partno]);
-}
-
 /**
  * disk_part_iter_init - initialize partition iterator
  * @piter: iterator to initialize
@@ -204,41 +195,26 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
  */
 struct block_device *disk_part_iter_next(struct disk_part_iter *piter)
 {
-	struct disk_part_tbl *ptbl;
+	struct block_device *part;
+	unsigned long idx;
 
 	/* put the last partition */
 	disk_part_iter_exit(piter);
 
-	/* get part_tbl */
 	rcu_read_lock();
-	ptbl = rcu_dereference(piter->disk->part_tbl);
-
-	/* iterate to the next partition */
-	for (; piter->idx != ptbl->len; piter->idx += 1) {
-		struct block_device *part;
-
-		part = rcu_dereference(ptbl->part[piter->idx]);
-		if (!part)
-			continue;
-		piter->part = bdgrab(part);
-		if (!piter->part)
-			continue;
+	xa_for_each_start(&piter->disk->part_tbl, idx, part, piter->idx) {
 		if (!bdev_nr_sectors(part) &&
 		    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
 		    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
-		      piter->idx == 0)) {
-			bdput(piter->part);
-			piter->part = NULL;
+		      piter->idx == 0))
 			continue;
-		}
 
 		piter->part = bdgrab(part);
 		if (!piter->part)
 			continue;
-		piter->idx += 1;
+		piter->idx = idx + 1;
 		break;
 	}
-
 	rcu_read_unlock();
 
 	return piter->part;
@@ -260,42 +236,6 @@ void disk_part_iter_exit(struct disk_part_iter *piter)
 	piter->part = NULL;
 }
 
-/**
- * disk_has_partitions
- * @disk: gendisk of interest
- *
- * Walk through the partition table and check if valid partition exists.
- *
- * CONTEXT:
- * Don't care.
- *
- * RETURNS:
- * True if the gendisk has at least one valid non-zero size partition.
- * Otherwise false.
- */
-bool disk_has_partitions(struct gendisk *disk)
-{
-	struct disk_part_tbl *ptbl;
-	int i;
-	bool ret = false;
-
-	rcu_read_lock();
-	ptbl = rcu_dereference(disk->part_tbl);
-
-	/* Iterate partitions skipping the whole device at index 0 */
-	for (i = 1; i < ptbl->len; i++) {
-		if (rcu_dereference(ptbl->part[i])) {
-			ret = true;
-			break;
-		}
-	}
-
-	rcu_read_unlock();
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(disk_has_partitions);
-
 /*
  * Can be deleted altogether. Later.
  *
@@ -858,7 +798,7 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno)
 	struct block_device *bdev = NULL;
 
 	rcu_read_lock();
-	bdev = __disk_get_part(disk, partno);
+	bdev = xa_load(&disk->part_tbl, partno);
 	if (bdev && !bdgrab(bdev))
 		bdev = NULL;
 	rcu_read_unlock();
@@ -1248,83 +1188,6 @@ static const struct attribute_group *disk_attr_groups[] = {
 	NULL
 };
 
-/**
- * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
- * @disk: disk to replace part_tbl for
- * @new_ptbl: new part_tbl to install
- *
- * Replace disk->part_tbl with @new_ptbl in RCU-safe way.  The
- * original ptbl is freed using RCU callback.
- *
- * LOCKING:
- * Matching bd_mutex locked or the caller is the only user of @disk.
- */
-static void disk_replace_part_tbl(struct gendisk *disk,
-				  struct disk_part_tbl *new_ptbl)
-{
-	struct disk_part_tbl *old_ptbl =
-		rcu_dereference_protected(disk->part_tbl, 1);
-
-	rcu_assign_pointer(disk->part_tbl, new_ptbl);
-
-	if (old_ptbl) {
-		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
-		kfree_rcu(old_ptbl, rcu_head);
-	}
-}
-
-/**
- * disk_expand_part_tbl - expand disk->part_tbl
- * @disk: disk to expand part_tbl for
- * @partno: expand such that this partno can fit in
- *
- * Expand disk->part_tbl such that @partno can fit in.  disk->part_tbl
- * uses RCU to allow unlocked dereferencing for stats and other stuff.
- *
- * LOCKING:
- * Matching bd_mutex locked or the caller is the only user of @disk.
- * Might sleep.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int disk_expand_part_tbl(struct gendisk *disk, int partno)
-{
-	struct disk_part_tbl *old_ptbl =
-		rcu_dereference_protected(disk->part_tbl, 1);
-	struct disk_part_tbl *new_ptbl;
-	int len = old_ptbl ? old_ptbl->len : 0;
-	int i, target;
-
-	/*
-	 * check for int overflow, since we can get here from blkpg_ioctl()
-	 * with a user passed 'partno'.
-	 */
-	target = partno + 1;
-	if (target < 0)
-		return -EINVAL;
-
-	/* disk_max_parts() is zero during initialization, ignore if so */
-	if (disk_max_parts(disk) && target > disk_max_parts(disk))
-		return -EINVAL;
-
-	if (target <= len)
-		return 0;
-
-	new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL,
-				disk->node_id);
-	if (!new_ptbl)
-		return -ENOMEM;
-
-	new_ptbl->len = target;
-
-	for (i = 0; i < len; i++)
-		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
-
-	disk_replace_part_tbl(disk, new_ptbl);
-	return 0;
-}
-
 /**
  * disk_release - releases all allocated resources of the gendisk
  * @dev: the device representing this disk
@@ -1348,7 +1211,7 @@ static void disk_release(struct device *dev)
 	blk_free_devt(dev->devt);
 	disk_release_events(disk);
 	kfree(disk->random);
-	disk_replace_part_tbl(disk, NULL);
+	xa_destroy(&disk->part_tbl);
 	bdput(disk->part0);
 	if (disk->queue)
 		blk_put_queue(disk->queue);
@@ -1501,7 +1364,6 @@ dev_t blk_lookup_devt(const char *name, int partno)
 struct gendisk *__alloc_disk_node(int minors, int node_id)
 {
 	struct gendisk *disk;
-	struct disk_part_tbl *ptbl;
 
 	if (minors > DISK_MAX_PARTS) {
 		printk(KERN_ERR
@@ -1519,11 +1381,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
 		goto out_free_disk;
 
 	disk->node_id = node_id;
-	if (disk_expand_part_tbl(disk, 0))
-		goto out_bdput;
-
-	ptbl = rcu_dereference_protected(disk->part_tbl, 1);
-	rcu_assign_pointer(ptbl->part[0], disk->part0);
+	xa_init(&disk->part_tbl);
+	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
+		goto out_destroy_part_tbl;
 
 	disk->minors = minors;
 	rand_initialize_disk(disk);
@@ -1532,7 +1392,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
 	device_initialize(disk_to_dev(disk));
 	return disk;
 
-out_bdput:
+out_destroy_part_tbl:
+	xa_destroy(&disk->part_tbl);
 	bdput(disk->part0);
 out_free_disk:
 	kfree(disk);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 168d5906077c..b1cdf88f96e2 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -287,13 +287,7 @@ struct device_type part_type = {
  */
 void delete_partition(struct block_device *part)
 {
-	struct gendisk *disk = part->bd_disk;
-	struct disk_part_tbl *ptbl =
-		rcu_dereference_protected(disk->part_tbl, 1);
-
-	rcu_assign_pointer(ptbl->part[part->bd_partno], NULL);
-	rcu_assign_pointer(ptbl->last_lookup, NULL);
-
+	xa_erase(&part->bd_disk->part_tbl, part->bd_partno);
 	kobject_put(part->bd_holder_dir);
 	device_del(&part->bd_device);
 
@@ -325,7 +319,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 	struct device *ddev = disk_to_dev(disk);
 	struct device *pdev;
 	struct block_device *bdev;
-	struct disk_part_tbl *ptbl;
 	const char *dname;
 	int err;
 
@@ -347,12 +340,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 		break;
 	}
 
-	err = disk_expand_part_tbl(disk, partno);
-	if (err)
-		return ERR_PTR(err);
-	ptbl = rcu_dereference_protected(disk->part_tbl, 1);
-
-	if (ptbl->part[partno])
+	if (xa_load(&disk->part_tbl, partno))
 		return ERR_PTR(-EBUSY);
 
 	bdev = bdev_alloc(disk, partno);
@@ -405,8 +393,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 	}
 
 	/* everything is up and running, commence */
+	err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL);
+	if (err)
+		goto out_del;
 	bdev_add(bdev, devt);
-	rcu_assign_pointer(ptbl->part[partno], bdev);
 
 	/* suppress uevent if the disk suppresses it */
 	if (!dev_get_uevent_suppress(ddev))
@@ -612,7 +602,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
 int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
 {
 	struct parsed_partitions *state;
-	int ret = -EAGAIN, p, highest;
+	int ret = -EAGAIN, p;
 
 	if (!disk_part_scan_enabled(disk))
 		return 0;
@@ -660,15 +650,6 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
 
-	/*
-	 * Detect the highest partition number and preallocate disk->part_tbl.
-	 * This is an optimization and not strictly necessary.
-	 */
-	for (p = 1, highest = 0; p < state->limit; p++)
-		if (state->parts[p].size)
-			highest = p;
-	disk_expand_part_tbl(disk, highest);
-
 	for (p = 1; p < state->limit; p++)
 		if (!blk_add_partition(disk, bdev, state, p))
 			goto out_free_state;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 51609133c9a3..f364619092cc 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -32,6 +32,7 @@ extern struct class block_class;
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/workqueue.h>
+#include <linux/xarray.h>
 
 #define PARTITION_META_INFO_VOLNAMELTH	64
 /*
@@ -116,13 +117,6 @@ enum {
 	DISK_EVENT_FLAG_UEVENT			= 1 << 1,
 };
 
-struct disk_part_tbl {
-	struct rcu_head rcu_head;
-	int len;
-	struct block_device __rcu *last_lookup;
-	struct block_device __rcu *part[];
-};
-
 struct disk_events;
 struct badblocks;
 
@@ -148,12 +142,7 @@ struct gendisk {
 	unsigned short events;		/* supported events */
 	unsigned short event_flags;	/* flags related to event processing */
 
-	/* Array of pointers to partitions indexed by partno.
-	 * Protected with matching bdev lock but stat and other
-	 * non-critical accesses use RCU.  Always access through
-	 * helpers.
-	 */
-	struct disk_part_tbl __rcu *part_tbl;
+	struct xarray part_tbl;
 	struct block_device *part0;
 
 	const struct block_device_operations *fops;
@@ -225,7 +214,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action);
 struct disk_part_iter {
 	struct gendisk		*disk;
 	struct block_device	*part;
-	int			idx;
+	unsigned long		idx;
 	unsigned int		flags;
 };
 
@@ -233,7 +222,6 @@ extern void disk_part_iter_init(struct disk_part_iter *piter,
 				 struct gendisk *disk, unsigned int flags);
 struct block_device *disk_part_iter_next(struct disk_part_iter *piter);
 extern void disk_part_iter_exit(struct disk_part_iter *piter);
-extern bool disk_has_partitions(struct gendisk *disk);
 
 /* block/genhd.c */
 extern void device_add_disk(struct device *parent, struct gendisk *disk,
-- 
cgit v1.2.3


From 5ac83c644f5fb924f0b2c09102ab82fc788f8411 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 11 Jan 2021 17:47:16 +0100
Subject: Revert "blk-mq, elevator: Count requests per hctx to improve
 performance"

This reverts commit b445547ec1bbd3e7bf4b1c142550942f70527d95.

Since both mq-deadline and BFQ completely ignore hctx they are passed to
their dispatch function and dispatch whatever request they deem fit
checking whether any request for a particular hctx is queued is just
pointless since we'll very likely get a request from a different hctx
anyway. In the following commit we'll deal with lock contention in these
IO schedulers in presence of multiple HW queues in a different way.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c    | 5 -----
 block/blk-mq.c         | 1 -
 block/mq-deadline.c    | 6 ------
 include/linux/blk-mq.h | 4 ----
 4 files changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index c045613ce927..b12a416b51d7 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4677,9 +4677,6 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
 
-	if (!atomic_read(&hctx->elevator_queued))
-		return false;
-
 	/*
 	 * Avoiding lock: a race on bfqd->busy_queues should cause at
 	 * most a call to dispatch for nothing
@@ -5597,7 +5594,6 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 		bfq_insert_request(hctx, rq, at_head);
-		atomic_inc(&hctx->elevator_queued);
 	}
 }
 
@@ -5965,7 +5961,6 @@ static void bfq_finish_requeue_request(struct request *rq)
 
 		bfq_completed_request(bfqq, bfqd);
 		bfq_finish_requeue_request_body(bfqq);
-		atomic_dec(&rq->mq_hctx->elevator_queued);
 
 		spin_unlock_irqrestore(&bfqd->lock, flags);
 	} else {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 74b17b396f4c..1af6b8a9da5a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2653,7 +2653,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 		goto free_hctx;
 
 	atomic_set(&hctx->nr_active, 0);
-	atomic_set(&hctx->elevator_queued, 0);
 	if (node == NUMA_NO_NODE)
 		node = set->numa_node;
 	hctx->numa_node = node;
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 800ac902809b..b57470e154c8 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -386,8 +386,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	spin_lock(&dd->lock);
 	rq = __dd_dispatch_request(dd);
 	spin_unlock(&dd->lock);
-	if (rq)
-		atomic_dec(&rq->mq_hctx->elevator_queued);
 
 	return rq;
 }
@@ -535,7 +533,6 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 		dd_insert_request(hctx, rq, at_head);
-		atomic_inc(&hctx->elevator_queued);
 	}
 	spin_unlock(&dd->lock);
 }
@@ -582,9 +579,6 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
 
-	if (!atomic_read(&hctx->elevator_queued))
-		return false;
-
 	return !list_empty_careful(&dd->dispatch) ||
 		!list_empty_careful(&dd->fifo_list[0]) ||
 		!list_empty_careful(&dd->fifo_list[1]);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 6b410dab48ee..aabbf6830ffc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,10 +140,6 @@ struct blk_mq_hw_ctx {
 	 * shared across request queues.
 	 */
 	atomic_t		nr_active;
-	/**
-	 * @elevator_queued: Number of queued requests on hctx.
-	 */
-	atomic_t                elevator_queued;
 
 	/** @cpuhp_online: List to store request if CPU is going to die */
 	struct hlist_node	cpuhp_online;
-- 
cgit v1.2.3


From b6e68ee82585f2ee890b0a897a6aacbf49a467bb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 11 Jan 2021 17:47:17 +0100
Subject: blk-mq: Improve performance of non-mq IO schedulers with multiple HW
 queues

Currently when non-mq aware IO scheduler (BFQ, mq-deadline) is used for
a queue with multiple HW queues, the performance it rather bad. The
problem is that these IO schedulers use queue-wide locking and their
dispatch function does not respect the hctx it is passed in and returns
any request it finds appropriate. Thus locality of request access is
broken and dispatch from multiple CPUs just contends on IO scheduler
locks. For these IO schedulers there's little point in dispatching from
multiple CPUs. Instead dispatch always only from a single CPU to limit
contention.

Below is a comparison of dbench runs on XFS filesystem where the storage
is a raid card with 64 HW queues and to it attached a single rotating
disk. BFQ is used as IO scheduler:

      clients           MQ                     SQ             MQ-Patched
Amean 1      39.12 (0.00%)       43.29 * -10.67%*       36.09 *   7.74%*
Amean 2     128.58 (0.00%)      101.30 *  21.22%*       96.14 *  25.23%*
Amean 4     577.42 (0.00%)      494.47 *  14.37%*      508.49 *  11.94%*
Amean 8     610.95 (0.00%)      363.86 *  40.44%*      362.12 *  40.73%*
Amean 16    391.78 (0.00%)      261.49 *  33.25%*      282.94 *  27.78%*
Amean 32    324.64 (0.00%)      267.71 *  17.54%*      233.00 *  28.23%*
Amean 64    295.04 (0.00%)      253.02 *  14.24%*      242.37 *  17.85%*
Amean 512 10281.61 (0.00%)    10211.16 *   0.69%*    10447.53 *  -1.61%*

Numbers are times so lower is better. MQ is stock 5.10-rc6 kernel. SQ is
the same kernel with megaraid_sas.host_tagset_enable=0 so that the card
advertises just a single HW queue. MQ-Patched is a kernel with this
patch applied.

You can see multiple hardware queues heavily hurt performance in
combination with BFQ. The patch restores the performance.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c           | 66 +++++++++++++++++++++++++++++++++++++++++++-----
 block/kyber-iosched.c    |  1 +
 include/linux/elevator.h |  2 ++
 3 files changed, 63 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1af6b8a9da5a..f21d922ecfaf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1646,6 +1646,42 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queue);
 
+/*
+ * Is the request queue handled by an IO scheduler that does not respect
+ * hardware queues when dispatching?
+ */
+static bool blk_mq_has_sqsched(struct request_queue *q)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.dispatch_request &&
+	    !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
+		return true;
+	return false;
+}
+
+/*
+ * Return prefered queue to dispatch from (if any) for non-mq aware IO
+ * scheduler.
+ */
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	/*
+	 * If the IO scheduler does not respect hardware queues when
+	 * dispatching, we just don't bother with multiple HW queues and
+	 * dispatch from hctx for the current CPU since running multiple queues
+	 * just causes lock contention inside the scheduler and pointless cache
+	 * bouncing.
+	 */
+	hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
+				     raw_smp_processor_id());
+	if (!blk_mq_hctx_stopped(hctx))
+		return hctx;
+	return NULL;
+}
+
 /**
  * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
  * @q: Pointer to the request queue to run.
@@ -1653,14 +1689,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queue);
  */
 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx, *sq_hctx;
 	int i;
 
+	sq_hctx = NULL;
+	if (blk_mq_has_sqsched(q))
+		sq_hctx = blk_mq_get_sq_hctx(q);
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (blk_mq_hctx_stopped(hctx))
 			continue;
-
-		blk_mq_run_hw_queue(hctx, async);
+		/*
+		 * Dispatch from this hctx either if there's no hctx preferred
+		 * by IO scheduler or if it has requests that bypass the
+		 * scheduler.
+		 */
+		if (!sq_hctx || sq_hctx == hctx ||
+		    !list_empty_careful(&hctx->dispatch))
+			blk_mq_run_hw_queue(hctx, async);
 	}
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queues);
@@ -1672,14 +1717,23 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
  */
 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx, *sq_hctx;
 	int i;
 
+	sq_hctx = NULL;
+	if (blk_mq_has_sqsched(q))
+		sq_hctx = blk_mq_get_sq_hctx(q);
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (blk_mq_hctx_stopped(hctx))
 			continue;
-
-		blk_mq_delay_run_hw_queue(hctx, msecs);
+		/*
+		 * Dispatch from this hctx either if there's no hctx preferred
+		 * by IO scheduler or if it has requests that bypass the
+		 * scheduler.
+		 */
+		if (!sq_hctx || sq_hctx == hctx ||
+		    !list_empty_careful(&hctx->dispatch))
+			blk_mq_delay_run_hw_queue(hctx, msecs);
 	}
 }
 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index dc89199bc8c6..c25c41d0d061 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -1029,6 +1029,7 @@ static struct elevator_type kyber_sched = {
 #endif
 	.elevator_attrs = kyber_sched_attrs,
 	.elevator_name = "kyber",
+	.elevator_features = ELEVATOR_F_MQ_AWARE,
 	.elevator_owner = THIS_MODULE,
 };
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index bacc40a0bdf3..1fe8e105b83b 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -172,6 +172,8 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 
 /* Supports zoned block devices sequential write constraint */
 #define ELEVATOR_F_ZBD_SEQ_WRITE	(1U << 0)
+/* Supports scheduling on multiple hardware queues */
+#define ELEVATOR_F_MQ_AWARE		(1U << 1)
 
 #endif /* CONFIG_BLOCK */
 #endif
-- 
cgit v1.2.3


From 9f180e315a93cde559ac1c9c4c5ce980aa681c1c Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 11 Jan 2021 11:05:54 +0800
Subject: block: don't allocate inline bvecs if this bioset needn't bvecs

The inline bvecs won't be used if user needn't bvecs by not passing
BIOSET_NEED_BVECS, so don't allocate bvecs in this situation.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Tested-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 7 +++++--
 include/linux/bio.h | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 87bf16460e0e..8ccda51dd831 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -89,8 +89,7 @@ fail_alloc_slab:
 
 static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
 {
-	return bs->front_pad + sizeof(struct bio) +
-		BIO_INLINE_VECS * sizeof(struct bio_vec);
+	return bs->front_pad + sizeof(struct bio) + bs->back_pad;
 }
 
 static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
@@ -1563,6 +1562,10 @@ int bioset_init(struct bio_set *bs,
 		int flags)
 {
 	bs->front_pad = front_pad;
+	if (flags & BIOSET_NEED_BVECS)
+		bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
+	else
+		bs->back_pad = 0;
 
 	spin_lock_init(&bs->rescue_lock);
 	bio_list_init(&bs->rescue_list);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2f1155eabaff..5d8977aafa19 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -703,6 +703,7 @@ struct bio_set {
 	mempool_t bvec_integrity_pool;
 #endif
 
+	unsigned int back_pad;
 	/*
 	 * Deadlock avoidance for stacking block drivers: see comments in
 	 * bio_alloc_bioset() for details
-- 
cgit v1.2.3


From eec716a1c18c796a69db0be5e2a6f282ba5bccd6 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 11 Jan 2021 11:05:56 +0800
Subject: block: move three bvec helpers declaration into private helper

bvec_alloc(), bvec_free() and bvec_nr_vecs() are only used inside block
layer core functions, no need to declare them in public header.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk.h         | 4 ++++
 include/linux/bio.h | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk.h b/block/blk.h
index ab0aaf958553..0198335c5838 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -55,6 +55,10 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
 
 void blk_freeze_queue(struct request_queue *q);
 
+struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
+void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
+unsigned int bvec_nr_vecs(unsigned short idx);
+
 static inline bool biovec_phys_mergeable(struct request_queue *q,
 		struct bio_vec *vec1, struct bio_vec *vec2)
 {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5d8977aafa19..e135b500df5d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -478,9 +478,6 @@ static inline void zero_fill_bio(struct bio *bio)
 	zero_fill_bio_iter(bio, bio->bi_iter);
 }
 
-extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
-extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
-extern unsigned int bvec_nr_vecs(unsigned short idx);
 extern const char *bio_devname(struct bio *bio, char *buffer);
 
 #define bio_set_dev(bio, bdev) 				\
-- 
cgit v1.2.3


From 8eeed0b554b9fda61be05b17cbb0b89ea2cbbb65 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Date: Mon, 25 Jan 2021 05:49:57 +0100
Subject: block: remove unnecessary argument from blk_execute_rq_nowait

The 'q' is not used since commit a1ce35fa4985 ("block: remove dead
elevator code"), also update the comment of the function.

And more importantly it never really was needed to start with given
that we can trivial derive it from struct request.

Cc: target-devel@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Cc: linux-ide@vger.kernel.org
Cc: linux-mmc@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Cc: linux-nfs@vger.kernel.org
Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-exec.c                   | 10 ++++------
 drivers/block/sx8.c                |  4 ++--
 drivers/nvme/host/core.c           |  4 ++--
 drivers/nvme/host/lightnvm.c       |  2 +-
 drivers/nvme/host/pci.c            |  4 ++--
 drivers/nvme/target/passthru.c     |  2 +-
 drivers/scsi/scsi_error.c          |  2 +-
 drivers/scsi/sg.c                  |  3 +--
 drivers/scsi/st.c                  |  2 +-
 drivers/target/target_core_pscsi.c |  3 +--
 include/linux/blkdev.h             |  2 +-
 11 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-exec.c b/block/blk-exec.c
index 85324d53d072..2e37e85456fb 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -31,8 +31,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
 }
 
 /**
- * blk_execute_rq_nowait - insert a request into queue for execution
- * @q:		queue to insert the request in
+ * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
  * @bd_disk:	matching gendisk
  * @rq:		request to insert
  * @at_head:    insert request at head or tail of queue
@@ -45,9 +44,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
  * Note:
  *    This function will invoke @done directly if the queue is dead.
  */
-void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
-			   struct request *rq, int at_head,
-			   rq_end_io_fn *done)
+void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq,
+			   int at_head, rq_end_io_fn *done)
 {
 	WARN_ON(irqs_disabled());
 	WARN_ON(!blk_rq_is_passthrough(rq));
@@ -83,7 +81,7 @@ void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
 	unsigned long hang_check;
 
 	rq->end_io_data = &wait;
-	blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
+	blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq);
 
 	/* Prevent hang_check timer from firing at us during very long I/O */
 	hang_check = sysctl_hung_task_timeout_secs;
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 4478eb7efee0..2cdf2771f8e8 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -539,7 +539,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	spin_unlock_irq(&host->lock);
 
 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
-	blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
+	blk_execute_rq_nowait(NULL, rq, true, NULL);
 
 	return 0;
 
@@ -578,7 +578,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	crq->msg_bucket = (u32) rc;
 
 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
-	blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
+	blk_execute_rq_nowait(NULL, rq, true, NULL);
 
 	return 0;
 }
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a39befb4deba..0bea9ae03092 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -925,7 +925,7 @@ static void nvme_execute_rq_polled(struct request_queue *q,
 
 	rq->cmd_flags |= REQ_HIPRI;
 	rq->end_io_data = &wait;
-	blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
+	blk_execute_rq_nowait(bd_disk, rq, at_head, nvme_end_sync_rq);
 
 	while (!completion_done(&wait)) {
 		blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
@@ -1202,7 +1202,7 @@ static int nvme_keep_alive(struct nvme_ctrl *ctrl)
 	rq->timeout = ctrl->kato * HZ;
 	rq->end_io_data = ctrl;
 
-	blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
+	blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io);
 
 	return 0;
 }
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 6c8eab8de288..0e5a55075e35 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -695,7 +695,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd,
 
 	rq->end_io_data = rqd;
 
-	blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
+	blk_execute_rq_nowait(NULL, rq, 0, nvme_nvm_end_io);
 
 	return 0;
 
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 856aa31931c1..5b78e68be9a1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1357,7 +1357,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	}
 
 	abort_req->end_io_data = NULL;
-	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);
+	blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio);
 
 	/*
 	 * The aborted req will be completed on receiving the abort req.
@@ -2281,7 +2281,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
 	req->end_io_data = nvmeq;
 
 	init_completion(&nvmeq->delete_done);
-	blk_execute_rq_nowait(q, NULL, req, false,
+	blk_execute_rq_nowait(NULL, req, false,
 			opcode == nvme_admin_delete_cq ?
 				nvme_del_cq_end : nvme_del_queue_end);
 	return 0;
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index b9776fc8f08f..cbc88acdd233 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -275,7 +275,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
 		schedule_work(&req->p.work);
 	} else {
 		rq->end_io_data = req;
-		blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0,
+		blk_execute_rq_nowait(ns ? ns->disk : NULL, rq, 0,
 				      nvmet_passthru_req_done);
 	}
 
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index f11f51e2465f..c00f06e9ecb0 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -2007,7 +2007,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
 	req->timeout = 10 * HZ;
 	rq->retries = 5;
 
-	blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done);
+	blk_execute_rq_nowait(NULL, req, 1, eh_lock_door_done);
 }
 
 /**
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index bfa8d77322d7..4383d93110f8 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -829,8 +829,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
 
 	srp->rq->timeout = timeout;
 	kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */
-	blk_execute_rq_nowait(sdp->device->request_queue, sdp->disk,
-			      srp->rq, at_head, sg_rq_end_io);
+	blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io);
 	return 0;
 }
 
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 43f7624508a9..841ad2fc369a 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -585,7 +585,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
 	rq->retries = retries;
 	req->end_io_data = SRpnt;
 
-	blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end);
+	blk_execute_rq_nowait(NULL, req, 1, st_scsi_execute_end);
 	return 0;
 }
 
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 7994f27e4527..33770e5808ce 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1000,8 +1000,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
 		req->timeout = PS_TIMEOUT_OTHER;
 	scsi_req(req)->retries = PS_RETRY;
 
-	blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req,
-			(cmd->sam_task_attr == TCM_HEAD_TAG),
+	blk_execute_rq_nowait(NULL, req, (cmd->sam_task_attr == TCM_HEAD_TAG),
 			pscsi_req_done);
 
 	return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4526b9ef8edb..623a61239429 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -950,7 +950,7 @@ extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
 			       gfp_t);
 extern void blk_execute_rq(struct request_queue *, struct gendisk *,
 			  struct request *, int);
-extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
+extern void blk_execute_rq_nowait(struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
 
 /* Helper to convert REQ_OP_XXX to its string format XXX */
-- 
cgit v1.2.3


From 684da7628d93bbdcfba9081b917d99f29ad04c23 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Date: Mon, 25 Jan 2021 05:49:58 +0100
Subject: block: remove unnecessary argument from blk_execute_rq

We can remove 'q' from blk_execute_rq as well after the previous change
in blk_execute_rq_nowait.

And more importantly it never really was needed to start with given
that we can trivial derive it from struct request.

Cc: linux-scsi@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Cc: linux-ide@vger.kernel.org
Cc: linux-mmc@vger.kernel.org
Cc: linux-nvme@lists.infradead.org
Cc: linux-nfs@vger.kernel.org
Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # for mmc
Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-exec.c                  |  3 +--
 block/bsg.c                       |  2 +-
 block/scsi_ioctl.c                |  6 +++---
 drivers/block/mtip32xx/mtip32xx.c |  2 +-
 drivers/block/paride/pd.c         |  2 +-
 drivers/block/pktcdvd.c           |  2 +-
 drivers/block/virtio_blk.c        |  2 +-
 drivers/cdrom/cdrom.c             |  2 +-
 drivers/ide/ide-atapi.c           |  2 +-
 drivers/ide/ide-cd.c              |  2 +-
 drivers/ide/ide-cd_ioctl.c        |  2 +-
 drivers/ide/ide-devsets.c         |  2 +-
 drivers/ide/ide-disk.c            |  2 +-
 drivers/ide/ide-ioctls.c          |  4 ++--
 drivers/ide/ide-park.c            |  2 +-
 drivers/ide/ide-pm.c              |  4 ++--
 drivers/ide/ide-tape.c            |  2 +-
 drivers/ide/ide-taskfile.c        |  2 +-
 drivers/mmc/core/block.c          | 10 +++++-----
 drivers/nvme/host/core.c          |  4 ++--
 drivers/nvme/host/lightnvm.c      |  2 +-
 drivers/scsi/scsi_lib.c           |  2 +-
 fs/nfsd/blocklayout.c             |  2 +-
 include/linux/blkdev.h            |  3 +--
 24 files changed, 33 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-exec.c b/block/blk-exec.c
index 2e37e85456fb..0ab873f10133 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -74,8 +74,7 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
  *    Insert a fully prepared request at the back of the I/O scheduler queue
  *    for execution and wait for completion.
  */
-void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
-		   struct request *rq, int at_head)
+void blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	unsigned long hang_check;
diff --git a/block/bsg.c b/block/bsg.c
index 3d78e843a83f..bd10922d5cbb 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -183,7 +183,7 @@ static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
 
 	bio = rq->bio;
 
-	blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL));
+	blk_execute_rq(NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL));
 	ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr);
 	blk_rq_unmap_user(bio);
 
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index c9f009cc0446..6599bac0a78c 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -357,7 +357,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	 * (if he doesn't check that is his problem).
 	 * N.B. a non-zero SCSI status is _not_ necessarily an error.
 	 */
-	blk_execute_rq(q, bd_disk, rq, at_head);
+	blk_execute_rq(bd_disk, rq, at_head);
 
 	hdr->duration = jiffies_to_msecs(jiffies - start_time);
 
@@ -493,7 +493,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		goto error;
 	}
 
-	blk_execute_rq(q, disk, rq, 0);
+	blk_execute_rq(disk, rq, 0);
 
 	err = req->result & 0xff;	/* only 8 bit SCSI status */
 	if (err) {
@@ -532,7 +532,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	scsi_req(rq)->cmd[0] = cmd;
 	scsi_req(rq)->cmd[4] = data;
 	scsi_req(rq)->cmd_len = 6;
-	blk_execute_rq(q, bd_disk, rq, 0);
+	blk_execute_rq(bd_disk, rq, 0);
 	err = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 53ac59d19ae5..3fd99836bb1c 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -1015,7 +1015,7 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 	rq->timeout = timeout;
 
 	/* insert request and run queue */
-	blk_execute_rq(rq->q, NULL, rq, true);
+	blk_execute_rq(NULL, rq, true);
 
 	if (int_cmd->status) {
 		dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n",
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index a7af4f27b7c3..897acda20ac8 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -781,7 +781,7 @@ static int pd_special_command(struct pd_unit *disk,
 	req = blk_mq_rq_to_pdu(rq);
 
 	req->func = func;
-	blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
+	blk_execute_rq(disk->gd, rq, 0);
 	blk_put_request(rq);
 	return 0;
 }
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 658a0981cb54..fc4b0f1aa86d 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -722,7 +722,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 	if (cgc->quiet)
 		rq->rq_flags |= RQF_QUIET;
 
-	blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
+	blk_execute_rq(pd->bdev->bd_disk, rq, 0);
 	if (scsi_req(rq)->result)
 		ret = -EIO;
 out:
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 145606dc52db..b0285db7cf4f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -320,7 +320,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 	if (err)
 		goto out;
 
-	blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+	blk_execute_rq(vblk->disk, req, false);
 	err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
 out:
 	blk_put_request(req);
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 8f0e52a71493..90ad34c6ef8e 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2214,7 +2214,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 		rq->timeout = 60 * HZ;
 		bio = rq->bio;
 
-		blk_execute_rq(q, cdi->disk, rq, 0);
+		blk_execute_rq(cdi->disk, rq, 0);
 		if (scsi_req(rq)->result) {
 			struct scsi_sense_hdr sshdr;
 
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 013ad33fbbc8..a1ce9f5ac3aa 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -107,7 +107,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	memcpy(scsi_req(rq)->cmd, pc->c, 12);
 	if (drive->media == ide_tape)
 		scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
-	blk_execute_rq(drive->queue, disk, rq, 0);
+	blk_execute_rq(disk, rq, 0);
 	error = scsi_req(rq)->result ? -EIO : 0;
 put_req:
 	blk_put_request(rq);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 25d2d88e82ad..cffbcc27a34c 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -467,7 +467,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 			}
 		}
 
-		blk_execute_rq(drive->queue, info->disk, rq, 0);
+		blk_execute_rq(info->disk, rq, 0);
 		error = scsi_req(rq)->result ? -EIO : 0;
 
 		if (buffer)
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 46f2df288c6a..011eab9c69b7 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -299,7 +299,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->rq_flags = RQF_QUIET;
-	blk_execute_rq(drive->queue, cd->disk, rq, 0);
+	blk_execute_rq(cd->disk, rq, 0);
 	ret = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 	/*
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index f2f93ed40356..ca1d4b3d3878 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -173,7 +173,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 	*(int *)&scsi_req(rq)->cmd[1] = arg;
 	ide_req(rq)->special = setting->set;
 
-	blk_execute_rq(q, NULL, rq, 0);
+	blk_execute_rq(NULL, rq, 0);
 	ret = scsi_req(rq)->result;
 	blk_put_request(rq);
 
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 34b9441084f8..8413731c6259 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -482,7 +482,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
 
 	drive->mult_req = arg;
 	drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
-	blk_execute_rq(drive->queue, NULL, rq, 0);
+	blk_execute_rq(NULL, rq, 0);
 	blk_put_request(rq);
 
 	return (drive->mult_count == arg) ? 0 : -EIO;
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 58994da10c06..43fbc37d85c3 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -137,7 +137,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, void __user *argp)
 
 		rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 		ide_req(rq)->type = ATA_PRIV_TASKFILE;
-		blk_execute_rq(drive->queue, NULL, rq, 0);
+		blk_execute_rq(NULL, rq, 0);
 		err = scsi_req(rq)->result ? -EIO : 0;
 		blk_put_request(rq);
 
@@ -235,7 +235,7 @@ static int generic_drive_reset(ide_drive_t *drive)
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 1;
 	scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
-	blk_execute_rq(drive->queue, NULL, rq, 1);
+	blk_execute_rq(NULL, rq, 1);
 	ret = scsi_req(rq)->result;
 	blk_put_request(rq);
 	return ret;
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 8af7af6001eb..a80a0f28f7b9 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -37,7 +37,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	scsi_req(rq)->cmd_len = 1;
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	ide_req(rq)->special = &timeout;
-	blk_execute_rq(q, NULL, rq, 1);
+	blk_execute_rq(NULL, rq, 1);
 	rc = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 	if (rc)
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 82ab308f1aaf..d680b3e3295f 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -27,7 +27,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 		mesg.event = PM_EVENT_FREEZE;
 	rqpm.pm_state = mesg.event;
 
-	blk_execute_rq(drive->queue, NULL, rq, 0);
+	blk_execute_rq(NULL, rq, 0);
 	ret = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 
@@ -50,7 +50,7 @@ static int ide_pm_execute_rq(struct request *rq)
 		blk_mq_end_request(rq, BLK_STS_OK);
 		return -ENXIO;
 	}
-	blk_execute_rq(q, NULL, rq, true);
+	blk_execute_rq(NULL, rq, true);
 
 	return scsi_req(rq)->result ? -EIO : 0;
 }
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 88b96437b22e..fa05e7e7d609 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -868,7 +868,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 			goto out_put;
 	}
 
-	blk_execute_rq(drive->queue, tape->disk, rq, 0);
+	blk_execute_rq(tape->disk, rq, 0);
 
 	/* calculate the number of transferred bytes and update buffer state */
 	size -= scsi_req(rq)->resid_len;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index d016cbe68cba..6665fc4724b9 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -443,7 +443,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	ide_req(rq)->special = cmd;
 	cmd->rq = rq;
 
-	blk_execute_rq(drive->queue, NULL, rq, 0);
+	blk_execute_rq(NULL, rq, 0);
 	error = scsi_req(rq)->result ? -EIO : 0;
 put_req:
 	blk_put_request(rq);
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 42e27a298218..a1d6b68320ae 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -253,7 +253,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
 		goto out_put;
 	}
 	req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP;
-	blk_execute_rq(mq->queue, NULL, req, 0);
+	blk_execute_rq(NULL, req, 0);
 	ret = req_to_mmc_queue_req(req)->drv_op_result;
 	blk_put_request(req);
 
@@ -629,7 +629,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
 		rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL;
 	req_to_mmc_queue_req(req)->drv_op_data = idatas;
 	req_to_mmc_queue_req(req)->ioc_count = 1;
-	blk_execute_rq(mq->queue, NULL, req, 0);
+	blk_execute_rq(NULL, req, 0);
 	ioc_err = req_to_mmc_queue_req(req)->drv_op_result;
 	err = mmc_blk_ioctl_copy_to_user(ic_ptr, idata);
 	blk_put_request(req);
@@ -698,7 +698,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
 		rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL;
 	req_to_mmc_queue_req(req)->drv_op_data = idata;
 	req_to_mmc_queue_req(req)->ioc_count = num_of_cmds;
-	blk_execute_rq(mq->queue, NULL, req, 0);
+	blk_execute_rq(NULL, req, 0);
 	ioc_err = req_to_mmc_queue_req(req)->drv_op_result;
 
 	/* copy to user if data and response */
@@ -2722,7 +2722,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 	req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
-	blk_execute_rq(mq->queue, NULL, req, 0);
+	blk_execute_rq(NULL, req, 0);
 	ret = req_to_mmc_queue_req(req)->drv_op_result;
 	if (ret >= 0) {
 		*val = ret;
@@ -2761,7 +2761,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
 	}
 	req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_EXT_CSD;
 	req_to_mmc_queue_req(req)->drv_op_data = &ext_csd;
-	blk_execute_rq(mq->queue, NULL, req, 0);
+	blk_execute_rq(NULL, req, 0);
 	err = req_to_mmc_queue_req(req)->drv_op_result;
 	blk_put_request(req);
 	if (err) {
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0bea9ae03092..eb7963fb167b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -964,7 +964,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 	if (poll)
 		nvme_execute_rq_polled(req->q, NULL, req, at_head);
 	else
-		blk_execute_rq(req->q, NULL, req, at_head);
+		blk_execute_rq(NULL, req, at_head);
 	if (result)
 		*result = nvme_req(req)->result;
 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -1101,7 +1101,7 @@ void nvme_execute_passthru_rq(struct request *rq)
 	u32 effects;
 
 	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
-	blk_execute_rq(rq->q, disk, rq, 0);
+	blk_execute_rq(disk, rq, 0);
 	nvme_passthru_end(ctrl, effects);
 }
 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 0e5a55075e35..ec38128f51e9 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -819,7 +819,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
 		bio->bi_bdev = ns->disk->part0;
 	}
 
-	blk_execute_rq(q, NULL, rq, 0);
+	blk_execute_rq(NULL, rq, 0);
 
 	if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
 		ret = -EINTR;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index b3f14f05340a..4d2280658559 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -269,7 +269,7 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 	/*
 	 * head injection *required* here otherwise quiesce won't work
 	 */
-	blk_execute_rq(req->q, NULL, req, 1);
+	blk_execute_rq(NULL, req, 1);
 
 	/*
 	 * Some devices (USB mass-storage in particular) may transfer
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index a07c39c94bbd..1058659a8d31 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -254,7 +254,7 @@ again:
 	req->cmd[4] = bufflen & 0xff;
 	req->cmd_len = COMMAND_SIZE(INQUIRY);
 
-	blk_execute_rq(rq->q, NULL, rq, 1);
+	blk_execute_rq(NULL, rq, 1);
 	if (req->result) {
 		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
 			req->result);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 623a61239429..20f3706b6b2e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -948,8 +948,7 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns
 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
 			       struct rq_map_data *, const struct iov_iter *,
 			       gfp_t);
-extern void blk_execute_rq(struct request_queue *, struct gendisk *,
-			  struct request *, int);
+extern void blk_execute_rq(struct gendisk *, struct request *, int);
 extern void blk_execute_rq_nowait(struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
 
-- 
cgit v1.2.3


From f75a1025c0b94d58de56c9421104ed6e987ddcd2 Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso@google.com>
Date: Sat, 16 Jan 2021 00:33:11 +0800
Subject: usb: typec: tcpm: Create legacy PDOs for PD2 connection

If the port partner is PD2, the PDOs of the local port should follow the
format defined in PD2 Spec. Dynamically modify the pre-defined PD3 PDOs
and transform them into PD2 format before sending them to the PD2 port
partner.

Reviewed-by: Guenter Roeck <linux@roeckus.net>
Signed-off-by: Kyle Tso <kyletso@google.com>
Link: https://lore.kernel.org/r/20210115163311.391332-1-kyletso@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpm.c | 62 ++++++++++++++++++++++++++++++++++++-------
 include/linux/usb/pd.h        |  1 +
 2 files changed, 53 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 0dd932fe08d0..0afd8ef692e8 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -1023,13 +1023,47 @@ static int tcpm_set_pwr_role(struct tcpm_port *port, enum typec_role role)
 	return 0;
 }
 
+/*
+ * Transform the PDO to be compliant to PD rev2.0.
+ * Return 0 if the PDO type is not defined in PD rev2.0.
+ * Otherwise, return the converted PDO.
+ */
+static u32 tcpm_forge_legacy_pdo(struct tcpm_port *port, u32 pdo, enum typec_role role)
+{
+	switch (pdo_type(pdo)) {
+	case PDO_TYPE_FIXED:
+		if (role == TYPEC_SINK)
+			return pdo & ~PDO_FIXED_FRS_CURR_MASK;
+		else
+			return pdo & ~PDO_FIXED_UNCHUNK_EXT;
+	case PDO_TYPE_VAR:
+	case PDO_TYPE_BATT:
+		return pdo;
+	case PDO_TYPE_APDO:
+	default:
+		return 0;
+	}
+}
+
 static int tcpm_pd_send_source_caps(struct tcpm_port *port)
 {
 	struct pd_message msg;
-	int i;
+	u32 pdo;
+	unsigned int i, nr_pdo = 0;
 
 	memset(&msg, 0, sizeof(msg));
-	if (!port->nr_src_pdo) {
+
+	for (i = 0; i < port->nr_src_pdo; i++) {
+		if (port->negotiated_rev >= PD_REV30) {
+			msg.payload[nr_pdo++] =	cpu_to_le32(port->src_pdo[i]);
+		} else {
+			pdo = tcpm_forge_legacy_pdo(port, port->src_pdo[i], TYPEC_SOURCE);
+			if (pdo)
+				msg.payload[nr_pdo++] = cpu_to_le32(pdo);
+		}
+	}
+
+	if (!nr_pdo) {
 		/* No source capabilities defined, sink only */
 		msg.header = PD_HEADER_LE(PD_CTRL_REJECT,
 					  port->pwr_role,
@@ -1042,10 +1076,8 @@ static int tcpm_pd_send_source_caps(struct tcpm_port *port)
 					  port->data_role,
 					  port->negotiated_rev,
 					  port->message_id,
-					  port->nr_src_pdo);
+					  nr_pdo);
 	}
-	for (i = 0; i < port->nr_src_pdo; i++)
-		msg.payload[i] = cpu_to_le32(port->src_pdo[i]);
 
 	return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg);
 }
@@ -1053,10 +1085,22 @@ static int tcpm_pd_send_source_caps(struct tcpm_port *port)
 static int tcpm_pd_send_sink_caps(struct tcpm_port *port)
 {
 	struct pd_message msg;
-	int i;
+	u32 pdo;
+	unsigned int i, nr_pdo = 0;
 
 	memset(&msg, 0, sizeof(msg));
-	if (!port->nr_snk_pdo) {
+
+	for (i = 0; i < port->nr_snk_pdo; i++) {
+		if (port->negotiated_rev >= PD_REV30) {
+			msg.payload[nr_pdo++] =	cpu_to_le32(port->snk_pdo[i]);
+		} else {
+			pdo = tcpm_forge_legacy_pdo(port, port->snk_pdo[i], TYPEC_SINK);
+			if (pdo)
+				msg.payload[nr_pdo++] = cpu_to_le32(pdo);
+		}
+	}
+
+	if (!nr_pdo) {
 		/* No sink capabilities defined, source only */
 		msg.header = PD_HEADER_LE(PD_CTRL_REJECT,
 					  port->pwr_role,
@@ -1069,10 +1113,8 @@ static int tcpm_pd_send_sink_caps(struct tcpm_port *port)
 					  port->data_role,
 					  port->negotiated_rev,
 					  port->message_id,
-					  port->nr_snk_pdo);
+					  nr_pdo);
 	}
-	for (i = 0; i < port->nr_snk_pdo; i++)
-		msg.payload[i] = cpu_to_le32(port->snk_pdo[i]);
 
 	return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg);
 }
diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h
index 272454f9cd67..70d681918d01 100644
--- a/include/linux/usb/pd.h
+++ b/include/linux/usb/pd.h
@@ -225,6 +225,7 @@ enum pd_pdo_type {
 #define PDO_FIXED_EXTPOWER		BIT(27) /* Externally powered */
 #define PDO_FIXED_USB_COMM		BIT(26) /* USB communications capable */
 #define PDO_FIXED_DATA_SWAP		BIT(25) /* Data role swap supported */
+#define PDO_FIXED_UNCHUNK_EXT		BIT(24) /* Unchunked Extended Message supported (Source) */
 #define PDO_FIXED_FRS_CURR_MASK		(BIT(24) | BIT(23)) /* FR_Swap Current (Sink) */
 #define PDO_FIXED_FRS_CURR_SHIFT	23
 #define PDO_FIXED_VOLT_SHIFT		10	/* 50mV units */
-- 
cgit v1.2.3


From 2289e87b5951f97783f07fc895e6c5e804b53668 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 17 Sep 2020 17:22:49 -0400
Subject: SUNRPC: Make trace_svc_process() display the RPC procedure
 symbolically

The next few patches will employ these strings to help make server-
side trace logs more human-readable. A similar technique is already
in use in kernel RPC client code.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/svc4proc.c        | 24 ++++++++++++++++++++++++
 fs/lockd/svcproc.c         | 24 ++++++++++++++++++++++++
 fs/nfs/callback_xdr.c      |  2 ++
 fs/nfsd/nfs2acl.c          |  5 +++++
 fs/nfsd/nfs3acl.c          |  3 +++
 fs/nfsd/nfs3proc.c         | 22 ++++++++++++++++++++++
 fs/nfsd/nfs4proc.c         |  2 ++
 fs/nfsd/nfsproc.c          | 18 ++++++++++++++++++
 include/linux/sunrpc/svc.h |  1 +
 9 files changed, 101 insertions(+)

(limited to 'include/linux')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index fa41dda39925..4c10fb5138f1 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -512,6 +512,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "NULL",
 	},
 	[NLMPROC_TEST] = {
 		.pc_func = nlm4svc_proc_test,
@@ -520,6 +521,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+2+No+Rg,
+		.pc_name = "TEST",
 	},
 	[NLMPROC_LOCK] = {
 		.pc_func = nlm4svc_proc_lock,
@@ -528,6 +530,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
+		.pc_name = "LOCK",
 	},
 	[NLMPROC_CANCEL] = {
 		.pc_func = nlm4svc_proc_cancel,
@@ -536,6 +539,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
+		.pc_name = "CANCEL",
 	},
 	[NLMPROC_UNLOCK] = {
 		.pc_func = nlm4svc_proc_unlock,
@@ -544,6 +548,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
+		.pc_name = "UNLOCK",
 	},
 	[NLMPROC_GRANTED] = {
 		.pc_func = nlm4svc_proc_granted,
@@ -552,6 +557,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
+		.pc_name = "GRANTED",
 	},
 	[NLMPROC_TEST_MSG] = {
 		.pc_func = nlm4svc_proc_test_msg,
@@ -560,6 +566,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "TEST_MSG",
 	},
 	[NLMPROC_LOCK_MSG] = {
 		.pc_func = nlm4svc_proc_lock_msg,
@@ -568,6 +575,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "LOCK_MSG",
 	},
 	[NLMPROC_CANCEL_MSG] = {
 		.pc_func = nlm4svc_proc_cancel_msg,
@@ -576,6 +584,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "CANCEL_MSG",
 	},
 	[NLMPROC_UNLOCK_MSG] = {
 		.pc_func = nlm4svc_proc_unlock_msg,
@@ -584,6 +593,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "UNLOCK_MSG",
 	},
 	[NLMPROC_GRANTED_MSG] = {
 		.pc_func = nlm4svc_proc_granted_msg,
@@ -592,6 +602,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "GRANTED_MSG",
 	},
 	[NLMPROC_TEST_RES] = {
 		.pc_func = nlm4svc_proc_null,
@@ -600,6 +611,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "TEST_RES",
 	},
 	[NLMPROC_LOCK_RES] = {
 		.pc_func = nlm4svc_proc_null,
@@ -608,6 +620,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "LOCK_RES",
 	},
 	[NLMPROC_CANCEL_RES] = {
 		.pc_func = nlm4svc_proc_null,
@@ -616,6 +629,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "CANCEL_RES",
 	},
 	[NLMPROC_UNLOCK_RES] = {
 		.pc_func = nlm4svc_proc_null,
@@ -624,6 +638,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "UNLOCK_RES",
 	},
 	[NLMPROC_GRANTED_RES] = {
 		.pc_func = nlm4svc_proc_granted_res,
@@ -632,6 +647,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "GRANTED_RES",
 	},
 	[NLMPROC_NSM_NOTIFY] = {
 		.pc_func = nlm4svc_proc_sm_notify,
@@ -640,6 +656,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_reboot),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "SM_NOTIFY",
 	},
 	[17] = {
 		.pc_func = nlm4svc_proc_unused,
@@ -648,6 +665,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = 0,
+		.pc_name = "UNUSED",
 	},
 	[18] = {
 		.pc_func = nlm4svc_proc_unused,
@@ -656,6 +674,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = 0,
+		.pc_name = "UNUSED",
 	},
 	[19] = {
 		.pc_func = nlm4svc_proc_unused,
@@ -664,6 +683,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = 0,
+		.pc_name = "UNUSED",
 	},
 	[NLMPROC_SHARE] = {
 		.pc_func = nlm4svc_proc_share,
@@ -672,6 +692,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+1,
+		.pc_name = "SHARE",
 	},
 	[NLMPROC_UNSHARE] = {
 		.pc_func = nlm4svc_proc_unshare,
@@ -680,6 +701,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+1,
+		.pc_name = "UNSHARE",
 	},
 	[NLMPROC_NM_LOCK] = {
 		.pc_func = nlm4svc_proc_nm_lock,
@@ -688,6 +710,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
+		.pc_name = "NM_LOCK",
 	},
 	[NLMPROC_FREE_ALL] = {
 		.pc_func = nlm4svc_proc_free_all,
@@ -696,5 +719,6 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "FREE_ALL",
 	},
 };
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 50855f2c1f4b..4ae4b63b5392 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -554,6 +554,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "NULL",
 	},
 	[NLMPROC_TEST] = {
 		.pc_func = nlmsvc_proc_test,
@@ -562,6 +563,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+2+No+Rg,
+		.pc_name = "TEST",
 	},
 	[NLMPROC_LOCK] = {
 		.pc_func = nlmsvc_proc_lock,
@@ -570,6 +572,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
+		.pc_name = "LOCK",
 	},
 	[NLMPROC_CANCEL] = {
 		.pc_func = nlmsvc_proc_cancel,
@@ -578,6 +581,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
+		.pc_name = "CANCEL",
 	},
 	[NLMPROC_UNLOCK] = {
 		.pc_func = nlmsvc_proc_unlock,
@@ -586,6 +590,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
+		.pc_name = "UNLOCK",
 	},
 	[NLMPROC_GRANTED] = {
 		.pc_func = nlmsvc_proc_granted,
@@ -594,6 +599,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
+		.pc_name = "GRANTED",
 	},
 	[NLMPROC_TEST_MSG] = {
 		.pc_func = nlmsvc_proc_test_msg,
@@ -602,6 +608,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "TEST_MSG",
 	},
 	[NLMPROC_LOCK_MSG] = {
 		.pc_func = nlmsvc_proc_lock_msg,
@@ -610,6 +617,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "LOCK_MSG",
 	},
 	[NLMPROC_CANCEL_MSG] = {
 		.pc_func = nlmsvc_proc_cancel_msg,
@@ -618,6 +626,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "CANCEL_MSG",
 	},
 	[NLMPROC_UNLOCK_MSG] = {
 		.pc_func = nlmsvc_proc_unlock_msg,
@@ -626,6 +635,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "UNLOCK_MSG",
 	},
 	[NLMPROC_GRANTED_MSG] = {
 		.pc_func = nlmsvc_proc_granted_msg,
@@ -634,6 +644,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "GRANTED_MSG",
 	},
 	[NLMPROC_TEST_RES] = {
 		.pc_func = nlmsvc_proc_null,
@@ -642,6 +653,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "TEST_RES",
 	},
 	[NLMPROC_LOCK_RES] = {
 		.pc_func = nlmsvc_proc_null,
@@ -650,6 +662,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "LOCK_RES",
 	},
 	[NLMPROC_CANCEL_RES] = {
 		.pc_func = nlmsvc_proc_null,
@@ -658,6 +671,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "CANCEL_RES",
 	},
 	[NLMPROC_UNLOCK_RES] = {
 		.pc_func = nlmsvc_proc_null,
@@ -666,6 +680,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "UNLOCK_RES",
 	},
 	[NLMPROC_GRANTED_RES] = {
 		.pc_func = nlmsvc_proc_granted_res,
@@ -674,6 +689,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "GRANTED_RES",
 	},
 	[NLMPROC_NSM_NOTIFY] = {
 		.pc_func = nlmsvc_proc_sm_notify,
@@ -682,6 +698,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_reboot),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "SM_NOTIFY",
 	},
 	[17] = {
 		.pc_func = nlmsvc_proc_unused,
@@ -690,6 +707,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "UNUSED",
 	},
 	[18] = {
 		.pc_func = nlmsvc_proc_unused,
@@ -698,6 +716,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "UNUSED",
 	},
 	[19] = {
 		.pc_func = nlmsvc_proc_unused,
@@ -706,6 +725,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
+		.pc_name = "UNUSED",
 	},
 	[NLMPROC_SHARE] = {
 		.pc_func = nlmsvc_proc_share,
@@ -714,6 +734,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+1,
+		.pc_name = "SHARE",
 	},
 	[NLMPROC_UNSHARE] = {
 		.pc_func = nlmsvc_proc_unshare,
@@ -722,6 +743,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+1,
+		.pc_name = "UNSHARE",
 	},
 	[NLMPROC_NM_LOCK] = {
 		.pc_func = nlmsvc_proc_nm_lock,
@@ -730,6 +752,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
+		.pc_name = "NM_LOCK",
 	},
 	[NLMPROC_FREE_ALL] = {
 		.pc_func = nlmsvc_proc_free_all,
@@ -738,5 +761,6 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_argsize = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = 0,
+		.pc_name = "FREE_ALL",
 	},
 };
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 79ff172eb1c8..c5348ba81129 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -1060,6 +1060,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = {
 		.pc_decode = nfs4_decode_void,
 		.pc_encode = nfs4_encode_void,
 		.pc_xdrressize = 1,
+		.pc_name = "NULL",
 	},
 	[CB_COMPOUND] = {
 		.pc_func = nfs4_callback_compound,
@@ -1067,6 +1068,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = {
 		.pc_argsize = 256,
 		.pc_ressize = 256,
 		.pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
+		.pc_name = "COMPOUND",
 	}
 };
 
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b0f66604532a..899762da23c9 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -371,6 +371,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST,
+		.pc_name = "NULL",
 	},
 	[ACLPROC2_GETACL] = {
 		.pc_func = nfsacld_proc_getacl,
@@ -381,6 +382,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
 		.pc_ressize = sizeof(struct nfsd3_getaclres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+1+2*(1+ACL),
+		.pc_name = "GETACL",
 	},
 	[ACLPROC2_SETACL] = {
 		.pc_func = nfsacld_proc_setacl,
@@ -391,6 +393,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
 		.pc_ressize = sizeof(struct nfsd_attrstat),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT,
+		.pc_name = "SETACL",
 	},
 	[ACLPROC2_GETATTR] = {
 		.pc_func = nfsacld_proc_getattr,
@@ -401,6 +404,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
 		.pc_ressize = sizeof(struct nfsd_attrstat),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT,
+		.pc_name = "GETATTR",
 	},
 	[ACLPROC2_ACCESS] = {
 		.pc_func = nfsacld_proc_access,
@@ -411,6 +415,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
 		.pc_ressize = sizeof(struct nfsd3_accessres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT+1,
+		.pc_name = "SETATTR",
 	},
 };
 
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 7c30876a31a1..9e1a92fb9771 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -251,6 +251,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST,
+		.pc_name = "NULL",
 	},
 	[ACLPROC3_GETACL] = {
 		.pc_func = nfsd3_proc_getacl,
@@ -261,6 +262,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
 		.pc_ressize = sizeof(struct nfsd3_getaclres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+1+2*(1+ACL),
+		.pc_name = "GETACL",
 	},
 	[ACLPROC3_SETACL] = {
 		.pc_func = nfsd3_proc_setacl,
@@ -271,6 +273,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
 		.pc_ressize = sizeof(struct nfsd3_attrstat),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT,
+		.pc_name = "SETACL",
 	},
 };
 
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 76931f4f57c3..c9c64471c568 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -708,6 +708,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST,
+		.pc_name = "NULL",
 	},
 	[NFS3PROC_GETATTR] = {
 		.pc_func = nfsd3_proc_getattr,
@@ -718,6 +719,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_attrstatres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT,
+		.pc_name = "GETATTR",
 	},
 	[NFS3PROC_SETATTR] = {
 		.pc_func = nfsd3_proc_setattr,
@@ -728,6 +730,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_wccstatres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+WC,
+		.pc_name = "SETATTR",
 	},
 	[NFS3PROC_LOOKUP] = {
 		.pc_func = nfsd3_proc_lookup,
@@ -738,6 +741,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_diropres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+FH+pAT+pAT,
+		.pc_name = "LOOKUP",
 	},
 	[NFS3PROC_ACCESS] = {
 		.pc_func = nfsd3_proc_access,
@@ -748,6 +752,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_accessres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+1,
+		.pc_name = "ACCESS",
 	},
 	[NFS3PROC_READLINK] = {
 		.pc_func = nfsd3_proc_readlink,
@@ -758,6 +763,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_readlinkres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
+		.pc_name = "READLINK",
 	},
 	[NFS3PROC_READ] = {
 		.pc_func = nfsd3_proc_read,
@@ -768,6 +774,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_readres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
+		.pc_name = "READ",
 	},
 	[NFS3PROC_WRITE] = {
 		.pc_func = nfsd3_proc_write,
@@ -778,6 +785,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_writeres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+WC+4,
+		.pc_name = "WRITE",
 	},
 	[NFS3PROC_CREATE] = {
 		.pc_func = nfsd3_proc_create,
@@ -788,6 +796,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_createres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+(1+FH+pAT)+WC,
+		.pc_name = "CREATE",
 	},
 	[NFS3PROC_MKDIR] = {
 		.pc_func = nfsd3_proc_mkdir,
@@ -798,6 +807,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_createres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+(1+FH+pAT)+WC,
+		.pc_name = "MKDIR",
 	},
 	[NFS3PROC_SYMLINK] = {
 		.pc_func = nfsd3_proc_symlink,
@@ -808,6 +818,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_createres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+(1+FH+pAT)+WC,
+		.pc_name = "SYMLINK",
 	},
 	[NFS3PROC_MKNOD] = {
 		.pc_func = nfsd3_proc_mknod,
@@ -818,6 +829,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_createres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+(1+FH+pAT)+WC,
+		.pc_name = "MKNOD",
 	},
 	[NFS3PROC_REMOVE] = {
 		.pc_func = nfsd3_proc_remove,
@@ -828,6 +840,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_wccstatres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+WC,
+		.pc_name = "REMOVE",
 	},
 	[NFS3PROC_RMDIR] = {
 		.pc_func = nfsd3_proc_rmdir,
@@ -838,6 +851,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_wccstatres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+WC,
+		.pc_name = "RMDIR",
 	},
 	[NFS3PROC_RENAME] = {
 		.pc_func = nfsd3_proc_rename,
@@ -848,6 +862,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_renameres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+WC+WC,
+		.pc_name = "RENAME",
 	},
 	[NFS3PROC_LINK] = {
 		.pc_func = nfsd3_proc_link,
@@ -858,6 +873,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_linkres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+pAT+WC,
+		.pc_name = "LINK",
 	},
 	[NFS3PROC_READDIR] = {
 		.pc_func = nfsd3_proc_readdir,
@@ -867,6 +883,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_argsize = sizeof(struct nfsd3_readdirargs),
 		.pc_ressize = sizeof(struct nfsd3_readdirres),
 		.pc_cachetype = RC_NOCACHE,
+		.pc_name = "READDIR",
 	},
 	[NFS3PROC_READDIRPLUS] = {
 		.pc_func = nfsd3_proc_readdirplus,
@@ -876,6 +893,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_argsize = sizeof(struct nfsd3_readdirplusargs),
 		.pc_ressize = sizeof(struct nfsd3_readdirres),
 		.pc_cachetype = RC_NOCACHE,
+		.pc_name = "READDIRPLUS",
 	},
 	[NFS3PROC_FSSTAT] = {
 		.pc_func = nfsd3_proc_fsstat,
@@ -885,6 +903,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_fsstatres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+2*6+1,
+		.pc_name = "FSSTAT",
 	},
 	[NFS3PROC_FSINFO] = {
 		.pc_func = nfsd3_proc_fsinfo,
@@ -894,6 +913,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_fsinfores),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+12,
+		.pc_name = "FSINFO",
 	},
 	[NFS3PROC_PATHCONF] = {
 		.pc_func = nfsd3_proc_pathconf,
@@ -903,6 +923,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_pathconfres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+6,
+		.pc_name = "PATHCONF",
 	},
 	[NFS3PROC_COMMIT] = {
 		.pc_func = nfsd3_proc_commit,
@@ -913,6 +934,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_ressize = sizeof(struct nfsd3_commitres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+WC+2,
+		.pc_name = "COMMIT",
 	},
 };
 
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8d6d2678abad..f567592692ee 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -3305,6 +3305,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = 1,
+		.pc_name = "NULL",
 	},
 	[NFSPROC4_COMPOUND] = {
 		.pc_func = nfsd4_proc_compound,
@@ -3315,6 +3316,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
 		.pc_release = nfsd4_release_compoundargs,
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = NFSD_BUFSIZE/4,
+		.pc_name = "COMPOUND",
 	},
 };
 
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 9473d048efec..1f85a4dc9d1b 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -623,6 +623,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = 0,
+		.pc_name = "NULL",
 	},
 	[NFSPROC_GETATTR] = {
 		.pc_func = nfsd_proc_getattr,
@@ -633,6 +634,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_attrstat),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT,
+		.pc_name = "GETATTR",
 	},
 	[NFSPROC_SETATTR] = {
 		.pc_func = nfsd_proc_setattr,
@@ -643,6 +645,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_attrstat),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+AT,
+		.pc_name = "SETATTR",
 	},
 	[NFSPROC_ROOT] = {
 		.pc_func = nfsd_proc_root,
@@ -652,6 +655,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = 0,
+		.pc_name = "ROOT",
 	},
 	[NFSPROC_LOOKUP] = {
 		.pc_func = nfsd_proc_lookup,
@@ -662,6 +666,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_diropres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+FH+AT,
+		.pc_name = "LOOKUP",
 	},
 	[NFSPROC_READLINK] = {
 		.pc_func = nfsd_proc_readlink,
@@ -671,6 +676,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_readlinkres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
+		.pc_name = "READLINK",
 	},
 	[NFSPROC_READ] = {
 		.pc_func = nfsd_proc_read,
@@ -681,6 +687,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_readres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
+		.pc_name = "READ",
 	},
 	[NFSPROC_WRITECACHE] = {
 		.pc_func = nfsd_proc_writecache,
@@ -690,6 +697,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = 0,
+		.pc_name = "WRITECACHE",
 	},
 	[NFSPROC_WRITE] = {
 		.pc_func = nfsd_proc_write,
@@ -700,6 +708,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_attrstat),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+AT,
+		.pc_name = "WRITE",
 	},
 	[NFSPROC_CREATE] = {
 		.pc_func = nfsd_proc_create,
@@ -710,6 +719,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_diropres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+FH+AT,
+		.pc_name = "CREATE",
 	},
 	[NFSPROC_REMOVE] = {
 		.pc_func = nfsd_proc_remove,
@@ -719,6 +729,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_stat),
 		.pc_cachetype = RC_REPLSTAT,
 		.pc_xdrressize = ST,
+		.pc_name = "REMOVE",
 	},
 	[NFSPROC_RENAME] = {
 		.pc_func = nfsd_proc_rename,
@@ -728,6 +739,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_stat),
 		.pc_cachetype = RC_REPLSTAT,
 		.pc_xdrressize = ST,
+		.pc_name = "RENAME",
 	},
 	[NFSPROC_LINK] = {
 		.pc_func = nfsd_proc_link,
@@ -737,6 +749,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_stat),
 		.pc_cachetype = RC_REPLSTAT,
 		.pc_xdrressize = ST,
+		.pc_name = "LINK",
 	},
 	[NFSPROC_SYMLINK] = {
 		.pc_func = nfsd_proc_symlink,
@@ -746,6 +759,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_stat),
 		.pc_cachetype = RC_REPLSTAT,
 		.pc_xdrressize = ST,
+		.pc_name = "SYMLINK",
 	},
 	[NFSPROC_MKDIR] = {
 		.pc_func = nfsd_proc_mkdir,
@@ -756,6 +770,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_diropres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+FH+AT,
+		.pc_name = "MKDIR",
 	},
 	[NFSPROC_RMDIR] = {
 		.pc_func = nfsd_proc_rmdir,
@@ -765,6 +780,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_stat),
 		.pc_cachetype = RC_REPLSTAT,
 		.pc_xdrressize = ST,
+		.pc_name = "RMDIR",
 	},
 	[NFSPROC_READDIR] = {
 		.pc_func = nfsd_proc_readdir,
@@ -773,6 +789,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_argsize = sizeof(struct nfsd_readdirargs),
 		.pc_ressize = sizeof(struct nfsd_readdirres),
 		.pc_cachetype = RC_NOCACHE,
+		.pc_name = "READDIR",
 	},
 	[NFSPROC_STATFS] = {
 		.pc_func = nfsd_proc_statfs,
@@ -782,6 +799,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_ressize = sizeof(struct nfsd_statfsres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+5,
+		.pc_name = "STATFS",
 	},
 };
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 34c2a69820e9..31ee3b6047c3 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -463,6 +463,7 @@ struct svc_procedure {
 	unsigned int		pc_ressize;	/* result struct size */
 	unsigned int		pc_cachetype;	/* cache info (NFS) */
 	unsigned int		pc_xdrressize;	/* maximum size of XDR reply */
+	const char *		pc_name;	/* for display */
 };
 
 /*
-- 
cgit v1.2.3


From 81d217474326b25d7f14274b02fe3da1e85ad934 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 27 Nov 2020 17:37:02 -0500
Subject: SUNRPC: Move definition of XDR_UNIT

Clean up: The unit of XDR alignment is defined by RFC 4506,
not as part of the RPC message header. Thus it belongs in
include/linux/sunrpc/xdr.h.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/msg_prot.h |  3 ---
 include/linux/sunrpc/xdr.h      | 13 ++++++++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
index 43f854487539..938c2bf29db8 100644
--- a/include/linux/sunrpc/msg_prot.h
+++ b/include/linux/sunrpc/msg_prot.h
@@ -10,9 +10,6 @@
 
 #define RPC_VERSION 2
 
-/* size of an XDR encoding unit in bytes, i.e. 32bit */
-#define XDR_UNIT	(4)
-
 /* spec defines authentication flavor as an unsigned 32 bit integer */
 typedef u32	rpc_authflavor_t;
 
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 19b6dea27367..dbba537caab6 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -19,6 +19,13 @@
 struct bio_vec;
 struct rpc_rqst;
 
+/*
+ * Size of an XDR encoding unit in bytes, i.e. 32 bits,
+ * as defined in Section 3 of RFC 4506. All encoded
+ * XDR data items are aligned on a boundary of 32 bits.
+ */
+#define XDR_UNIT		sizeof(__be32)
+
 /*
  * Buffer adjustment
  */
@@ -330,7 +337,7 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
 static inline size_t
 xdr_align_size(size_t n)
 {
-	const size_t mask = sizeof(__u32) - 1;
+	const size_t mask = XDR_UNIT - 1;
 
 	return (n + mask) & ~mask;
 }
@@ -360,7 +367,7 @@ static inline size_t xdr_pad_size(size_t n)
  */
 static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr)
 {
-	const size_t len = sizeof(__be32);
+	const size_t len = XDR_UNIT;
 	__be32 *p = xdr_reserve_space(xdr, len);
 
 	if (unlikely(!p))
@@ -379,7 +386,7 @@ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr)
  */
 static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr)
 {
-	const size_t len = sizeof(__be32);
+	const size_t len = XDR_UNIT;
 	__be32 *p = xdr_reserve_space(xdr, len);
 
 	if (unlikely(!p))
-- 
cgit v1.2.3


From 6bb844b4eb6e3b109a2fdaffb60e6da722dc4356 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 17 Nov 2020 10:38:46 -0500
Subject: NFSD: Add an xdr_stream-based decoder for NFSv2/3 ACLs

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs_common/nfsacl.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfsacl.h |  3 +++
 2 files changed, 55 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index d056ad2fdefd..79c563c1a5e8 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -295,3 +295,55 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
 		   nfsacl_desc.desc.array_len;
 }
 EXPORT_SYMBOL_GPL(nfsacl_decode);
+
+/**
+ * nfs_stream_decode_acl - Decode an NFSv3 ACL
+ *
+ * @xdr: an xdr_stream positioned at an encoded ACL
+ * @aclcnt: OUT: count of ACEs in decoded posix_acl
+ * @pacl: OUT: a dynamically-allocated buffer containing the decoded posix_acl
+ *
+ * Return values:
+ *   %false: The encoded ACL is not valid
+ *   %true: @pacl contains a decoded ACL, and @xdr is advanced
+ *
+ * On a successful return, caller must release *pacl using posix_acl_release().
+ */
+bool nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt,
+			   struct posix_acl **pacl)
+{
+	const size_t elem_size = XDR_UNIT * 3;
+	struct nfsacl_decode_desc nfsacl_desc = {
+		.desc = {
+			.elem_size = elem_size,
+			.xcode = pacl ? xdr_nfsace_decode : NULL,
+		},
+	};
+	unsigned int base;
+	u32 entries;
+
+	if (xdr_stream_decode_u32(xdr, &entries) < 0)
+		return false;
+	if (entries > NFS_ACL_MAX_ENTRIES)
+		return false;
+
+	base = xdr_stream_pos(xdr);
+	if (!xdr_inline_decode(xdr, XDR_UNIT + elem_size * entries))
+		return false;
+	nfsacl_desc.desc.array_maxlen = entries;
+	if (xdr_decode_array2(xdr->buf, base, &nfsacl_desc.desc))
+		return false;
+
+	if (pacl) {
+		if (entries != nfsacl_desc.desc.array_len ||
+		    posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) {
+			posix_acl_release(nfsacl_desc.acl);
+			return false;
+		}
+		*pacl = nfsacl_desc.acl;
+	}
+	if (aclcnt)
+		*aclcnt = entries;
+	return true;
+}
+EXPORT_SYMBOL_GPL(nfs_stream_decode_acl);
diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h
index 103d44695323..0ba99c513649 100644
--- a/include/linux/nfsacl.h
+++ b/include/linux/nfsacl.h
@@ -38,5 +38,8 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
 extern int
 nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
 	      struct posix_acl **pacl);
+extern bool
+nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt,
+		      struct posix_acl **pacl);
 
 #endif  /* __LINUX_NFSACL_H */
-- 
cgit v1.2.3


From df971cd853c05778ae1175e8aeb80a04bb9d4be5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 29 Dec 2020 15:47:44 -0500
Subject: svcrdma: Convert rdma_stat_recv to a per-CPU counter

Receives are frequent events. Avoid the overhead of a memory bus
lock cycle for counting a value that is hardly every used.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h         |  3 +-
 net/sunrpc/xprtrdma/svc_rdma.c          | 55 ++++++++++++++++++++++++++++++---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |  3 +-
 3 files changed, 54 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 294b56e61522..ff32c59a27e7 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -49,6 +49,7 @@
 #include <linux/sunrpc/rpc_rdma_cid.h>
 #include <linux/sunrpc/svc_rdma_pcl.h>
 
+#include <linux/percpu_counter.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
 
@@ -65,7 +66,7 @@ extern unsigned int svcrdma_max_requests;
 extern unsigned int svcrdma_max_bc_requests;
 extern unsigned int svcrdma_max_req_size;
 
-extern atomic_t rdma_stat_recv;
+extern struct percpu_counter svcrdma_stat_recv;
 extern atomic_t rdma_stat_read;
 extern atomic_t rdma_stat_write;
 extern atomic_t rdma_stat_sq_starve;
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 1fc1d5cbeb9b..3e5e622bad81 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -63,7 +63,7 @@ unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
 static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
 static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
 
-atomic_t rdma_stat_recv;
+struct percpu_counter svcrdma_stat_recv;
 atomic_t rdma_stat_read;
 atomic_t rdma_stat_write;
 atomic_t rdma_stat_sq_starve;
@@ -110,6 +110,42 @@ static int read_reset_stat(struct ctl_table *table, int write,
 	return 0;
 }
 
+enum {
+	SVCRDMA_COUNTER_BUFSIZ	= sizeof(unsigned long long),
+};
+
+static int svcrdma_counter_handler(struct ctl_table *table, int write,
+				   void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct percpu_counter *stat = (struct percpu_counter *)table->data;
+	char tmp[SVCRDMA_COUNTER_BUFSIZ + 1];
+	int len;
+
+	if (write) {
+		percpu_counter_set(stat, 0);
+		return 0;
+	}
+
+	len = snprintf(tmp, SVCRDMA_COUNTER_BUFSIZ, "%lld\n",
+		       percpu_counter_sum_positive(stat));
+	if (len >= SVCRDMA_COUNTER_BUFSIZ)
+		return -EFAULT;
+	len = strlen(tmp);
+	if (*ppos > len) {
+		*lenp = 0;
+		return 0;
+	}
+	len -= *ppos;
+	if (len > *lenp)
+		len = *lenp;
+	if (len)
+		memcpy(buffer, tmp, len);
+	*lenp = len;
+	*ppos += len;
+
+	return 0;
+}
+
 static struct ctl_table_header *svcrdma_table_header;
 static struct ctl_table svcrdma_parm_table[] = {
 	{
@@ -149,10 +185,10 @@ static struct ctl_table svcrdma_parm_table[] = {
 	},
 	{
 		.procname	= "rdma_stat_recv",
-		.data		= &rdma_stat_recv,
-		.maxlen		= sizeof(atomic_t),
+		.data		= &svcrdma_stat_recv,
+		.maxlen		= SVCRDMA_COUNTER_BUFSIZ,
 		.mode		= 0644,
-		.proc_handler	= read_reset_stat,
+		.proc_handler	= svcrdma_counter_handler,
 	},
 	{
 		.procname	= "rdma_stat_write",
@@ -230,15 +266,26 @@ static void svc_rdma_proc_cleanup(void)
 		return;
 	unregister_sysctl_table(svcrdma_table_header);
 	svcrdma_table_header = NULL;
+
+	percpu_counter_destroy(&svcrdma_stat_recv);
 }
 
 static int svc_rdma_proc_init(void)
 {
+	int rc;
+
 	if (svcrdma_table_header)
 		return 0;
 
+	rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL);
+	if (rc)
+		goto out_err;
+
 	svcrdma_table_header = register_sysctl_table(svcrdma_root_table);
 	return 0;
+
+out_err:
+	return rc;
 }
 
 void svc_rdma_cleanup(void)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index cbdb71247755..7d14a74df716 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -845,8 +845,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	}
 	list_del(&ctxt->rc_list);
 	spin_unlock(&rdma_xprt->sc_rq_dto_lock);
-
-	atomic_inc(&rdma_stat_recv);
+	percpu_counter_inc(&svcrdma_stat_recv);
 
 	svc_rdma_build_arg_xdr(rqstp, ctxt);
 
-- 
cgit v1.2.3


From 22df5a22462e836ccb30634c3a52602091179a73 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 29 Dec 2020 15:55:17 -0500
Subject: svcrdma: Convert rdma_stat_sq_starve to a per-CPU counter

Avoid the overhead of a memory bus lock cycle for counting a value
that is hardly every used.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h       |  2 +-
 net/sunrpc/xprtrdma/svc_rdma.c        | 13 +++++++++----
 net/sunrpc/xprtrdma/svc_rdma_rw.c     |  1 +
 net/sunrpc/xprtrdma/svc_rdma_sendto.c |  2 +-
 4 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index ff32c59a27e7..c06b16ccf83e 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -69,7 +69,7 @@ extern unsigned int svcrdma_max_req_size;
 extern struct percpu_counter svcrdma_stat_recv;
 extern atomic_t rdma_stat_read;
 extern atomic_t rdma_stat_write;
-extern atomic_t rdma_stat_sq_starve;
+extern struct percpu_counter svcrdma_stat_sq_starve;
 extern atomic_t rdma_stat_rq_starve;
 extern atomic_t rdma_stat_rq_poll;
 extern atomic_t rdma_stat_rq_prod;
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 3e5e622bad81..ee768d4c3c90 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -66,7 +66,7 @@ static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
 struct percpu_counter svcrdma_stat_recv;
 atomic_t rdma_stat_read;
 atomic_t rdma_stat_write;
-atomic_t rdma_stat_sq_starve;
+struct percpu_counter svcrdma_stat_sq_starve;
 atomic_t rdma_stat_rq_starve;
 atomic_t rdma_stat_rq_poll;
 atomic_t rdma_stat_rq_prod;
@@ -199,10 +199,10 @@ static struct ctl_table svcrdma_parm_table[] = {
 	},
 	{
 		.procname	= "rdma_stat_sq_starve",
-		.data		= &rdma_stat_sq_starve,
-		.maxlen		= sizeof(atomic_t),
+		.data		= &svcrdma_stat_sq_starve,
+		.maxlen		= SVCRDMA_COUNTER_BUFSIZ,
 		.mode		= 0644,
-		.proc_handler	= read_reset_stat,
+		.proc_handler	= svcrdma_counter_handler,
 	},
 	{
 		.procname	= "rdma_stat_rq_starve",
@@ -267,6 +267,7 @@ static void svc_rdma_proc_cleanup(void)
 	unregister_sysctl_table(svcrdma_table_header);
 	svcrdma_table_header = NULL;
 
+	percpu_counter_destroy(&svcrdma_stat_sq_starve);
 	percpu_counter_destroy(&svcrdma_stat_recv);
 }
 
@@ -278,6 +279,9 @@ static int svc_rdma_proc_init(void)
 		return 0;
 
 	rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL);
+	if (rc)
+		goto out_err;
+	rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL);
 	if (rc)
 		goto out_err;
 
@@ -285,6 +289,7 @@ static int svc_rdma_proc_init(void)
 	return 0;
 
 out_err:
+	percpu_counter_destroy(&svcrdma_stat_recv);
 	return rc;
 }
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 0b63e1321d74..d7d98b2df00b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -364,6 +364,7 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
 			return 0;
 		}
 
+		percpu_counter_inc(&svcrdma_stat_sq_starve);
 		trace_svcrdma_sq_full(rdma);
 		atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
 		wait_event(rdma->sc_send_wait,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 68af79d4f04f..52c759a8543e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -317,7 +317,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
 	/* If the SQ is full, wait until an SQ entry is available */
 	while (1) {
 		if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) {
-			atomic_inc(&rdma_stat_sq_starve);
+			percpu_counter_inc(&svcrdma_stat_sq_starve);
 			trace_svcrdma_sq_full(rdma);
 			atomic_inc(&rdma->sc_sq_avail);
 			wait_event(rdma->sc_send_wait,
-- 
cgit v1.2.3


From 1e7e55731628c90d8c701c45f9c3a3b8718840d6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 29 Dec 2020 16:09:36 -0500
Subject: svcrdma: Restore read and write stats

Now that we have an efficient mechanism to update these two stats,
let's start maintaining them again.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h   |  4 ++--
 net/sunrpc/xprtrdma/svc_rdma.c    | 26 ++++++++++++++++++--------
 net/sunrpc/xprtrdma/svc_rdma_rw.c |  2 ++
 3 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index c06b16ccf83e..c882816cba8e 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -66,10 +66,10 @@ extern unsigned int svcrdma_max_requests;
 extern unsigned int svcrdma_max_bc_requests;
 extern unsigned int svcrdma_max_req_size;
 
+extern struct percpu_counter svcrdma_stat_read;
 extern struct percpu_counter svcrdma_stat_recv;
-extern atomic_t rdma_stat_read;
-extern atomic_t rdma_stat_write;
 extern struct percpu_counter svcrdma_stat_sq_starve;
+extern struct percpu_counter svcrdma_stat_write;
 extern atomic_t rdma_stat_rq_starve;
 extern atomic_t rdma_stat_rq_poll;
 extern atomic_t rdma_stat_rq_prod;
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index ee768d4c3c90..c8336df7a142 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -63,10 +63,10 @@ unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
 static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
 static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
 
+struct percpu_counter svcrdma_stat_read;
 struct percpu_counter svcrdma_stat_recv;
-atomic_t rdma_stat_read;
-atomic_t rdma_stat_write;
 struct percpu_counter svcrdma_stat_sq_starve;
+struct percpu_counter svcrdma_stat_write;
 atomic_t rdma_stat_rq_starve;
 atomic_t rdma_stat_rq_poll;
 atomic_t rdma_stat_rq_prod;
@@ -178,10 +178,10 @@ static struct ctl_table svcrdma_parm_table[] = {
 
 	{
 		.procname	= "rdma_stat_read",
-		.data		= &rdma_stat_read,
-		.maxlen		= sizeof(atomic_t),
+		.data		= &svcrdma_stat_read,
+		.maxlen		= SVCRDMA_COUNTER_BUFSIZ,
 		.mode		= 0644,
-		.proc_handler	= read_reset_stat,
+		.proc_handler	= svcrdma_counter_handler,
 	},
 	{
 		.procname	= "rdma_stat_recv",
@@ -192,10 +192,10 @@ static struct ctl_table svcrdma_parm_table[] = {
 	},
 	{
 		.procname	= "rdma_stat_write",
-		.data		= &rdma_stat_write,
-		.maxlen		= sizeof(atomic_t),
+		.data		= &svcrdma_stat_write,
+		.maxlen		= SVCRDMA_COUNTER_BUFSIZ,
 		.mode		= 0644,
-		.proc_handler	= read_reset_stat,
+		.proc_handler	= svcrdma_counter_handler,
 	},
 	{
 		.procname	= "rdma_stat_sq_starve",
@@ -267,8 +267,10 @@ static void svc_rdma_proc_cleanup(void)
 	unregister_sysctl_table(svcrdma_table_header);
 	svcrdma_table_header = NULL;
 
+	percpu_counter_destroy(&svcrdma_stat_write);
 	percpu_counter_destroy(&svcrdma_stat_sq_starve);
 	percpu_counter_destroy(&svcrdma_stat_recv);
+	percpu_counter_destroy(&svcrdma_stat_read);
 }
 
 static int svc_rdma_proc_init(void)
@@ -278,10 +280,16 @@ static int svc_rdma_proc_init(void)
 	if (svcrdma_table_header)
 		return 0;
 
+	rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL);
+	if (rc)
+		goto out_err;
 	rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL);
 	if (rc)
 		goto out_err;
 	rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL);
+	if (rc)
+		goto out_err;
+	rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL);
 	if (rc)
 		goto out_err;
 
@@ -289,7 +297,9 @@ static int svc_rdma_proc_init(void)
 	return 0;
 
 out_err:
+	percpu_counter_destroy(&svcrdma_stat_sq_starve);
 	percpu_counter_destroy(&svcrdma_stat_recv);
+	percpu_counter_destroy(&svcrdma_stat_read);
 	return rc;
 }
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index d7d98b2df00b..693d139a8633 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -469,6 +469,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
 					   DMA_TO_DEVICE);
 		if (ret < 0)
 			return -EIO;
+		percpu_counter_inc(&svcrdma_stat_write);
 
 		list_add(&ctxt->rw_list, &cc->cc_rwctxts);
 		cc->cc_sqecount += ret;
@@ -719,6 +720,7 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
 				   segment->rs_handle, DMA_FROM_DEVICE);
 	if (ret < 0)
 		return -EIO;
+	percpu_counter_inc(&svcrdma_stat_read);
 
 	list_add(&ctxt->rw_list, &cc->cc_rwctxts);
 	cc->cc_sqecount += ret;
-- 
cgit v1.2.3


From c6226ff9a62a17182b8092883ca201df5cd47f59 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 29 Dec 2020 14:53:09 -0500
Subject: svcrdma: Deprecate stat variables that are no longer used

Clean up. We are not permitted to remove old proc files. Instead,
convert these variables to stubs that are only ever allowed to
display a value of zero.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h |  5 ---
 net/sunrpc/xprtrdma/svc_rdma.c  | 84 +++++++++++++----------------------------
 2 files changed, 27 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index c882816cba8e..1e76ed688044 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -70,11 +70,6 @@ extern struct percpu_counter svcrdma_stat_read;
 extern struct percpu_counter svcrdma_stat_recv;
 extern struct percpu_counter svcrdma_stat_sq_starve;
 extern struct percpu_counter svcrdma_stat_write;
-extern atomic_t rdma_stat_rq_starve;
-extern atomic_t rdma_stat_rq_poll;
-extern atomic_t rdma_stat_rq_prod;
-extern atomic_t rdma_stat_sq_poll;
-extern atomic_t rdma_stat_sq_prod;
 
 struct svcxprt_rdma {
 	struct svc_xprt      sc_xprt;		/* SVC transport structure */
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index c8336df7a142..5bc20e9d09cd 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -62,53 +62,13 @@ static unsigned int max_max_requests = 16384;
 unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
 static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
 static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
+static unsigned int svcrdma_stat_unused;
+static unsigned int zero;
 
 struct percpu_counter svcrdma_stat_read;
 struct percpu_counter svcrdma_stat_recv;
 struct percpu_counter svcrdma_stat_sq_starve;
 struct percpu_counter svcrdma_stat_write;
-atomic_t rdma_stat_rq_starve;
-atomic_t rdma_stat_rq_poll;
-atomic_t rdma_stat_rq_prod;
-atomic_t rdma_stat_sq_poll;
-atomic_t rdma_stat_sq_prod;
-
-/*
- * This function implements reading and resetting an atomic_t stat
- * variable through read/write to a proc file. Any write to the file
- * resets the associated statistic to zero. Any read returns it's
- * current value.
- */
-static int read_reset_stat(struct ctl_table *table, int write,
-			   void *buffer, size_t *lenp, loff_t *ppos)
-{
-	atomic_t *stat = (atomic_t *)table->data;
-
-	if (!stat)
-		return -EINVAL;
-
-	if (write)
-		atomic_set(stat, 0);
-	else {
-		char str_buf[32];
-		int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
-		if (len >= 32)
-			return -EFAULT;
-		len = strlen(str_buf);
-		if (*ppos > len) {
-			*lenp = 0;
-			return 0;
-		}
-		len -= *ppos;
-		if (len > *lenp)
-			len = *lenp;
-		if (len)
-			memcpy(buffer, str_buf, len);
-		*lenp = len;
-		*ppos += len;
-	}
-	return 0;
-}
 
 enum {
 	SVCRDMA_COUNTER_BUFSIZ	= sizeof(unsigned long long),
@@ -206,38 +166,48 @@ static struct ctl_table svcrdma_parm_table[] = {
 	},
 	{
 		.procname	= "rdma_stat_rq_starve",
-		.data		= &rdma_stat_rq_starve,
-		.maxlen		= sizeof(atomic_t),
+		.data		= &svcrdma_stat_unused,
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= read_reset_stat,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &zero,
 	},
 	{
 		.procname	= "rdma_stat_rq_poll",
-		.data		= &rdma_stat_rq_poll,
-		.maxlen		= sizeof(atomic_t),
+		.data		= &svcrdma_stat_unused,
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= read_reset_stat,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &zero,
 	},
 	{
 		.procname	= "rdma_stat_rq_prod",
-		.data		= &rdma_stat_rq_prod,
-		.maxlen		= sizeof(atomic_t),
+		.data		= &svcrdma_stat_unused,
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= read_reset_stat,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &zero,
 	},
 	{
 		.procname	= "rdma_stat_sq_poll",
-		.data		= &rdma_stat_sq_poll,
-		.maxlen		= sizeof(atomic_t),
+		.data		= &svcrdma_stat_unused,
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= read_reset_stat,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &zero,
 	},
 	{
 		.procname	= "rdma_stat_sq_prod",
-		.data		= &rdma_stat_sq_prod,
-		.maxlen		= sizeof(atomic_t),
+		.data		= &svcrdma_stat_unused,
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= read_reset_stat,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &zero,
 	},
 	{ },
 };
-- 
cgit v1.2.3


From 43042b90cae11cc2d9827c91df6d6b5fe498d5ce Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 8 Dec 2020 13:14:15 -0500
Subject: svcrdma: Reduce Receive doorbell rate

This is similar to commit e340c2d6ef2a ("xprtrdma: Reduce the
doorbell rate (Receive)") which added Receive batching to the
client.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h         |  1 +
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 82 +++++++++++++++++----------------
 2 files changed, 44 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 1e76ed688044..7c693b31965e 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -104,6 +104,7 @@ struct svcxprt_rdma {
 
 	wait_queue_head_t    sc_send_wait;	/* SQ exhaustion waitlist */
 	unsigned long	     sc_flags;
+	u32		     sc_pending_recvs;
 	struct list_head     sc_read_complete_q;
 	struct work_struct   sc_work;
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 7d14a74df716..ab0b7e9777bc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -266,33 +266,46 @@ void svc_rdma_release_rqst(struct svc_rqst *rqstp)
 		svc_rdma_recv_ctxt_put(rdma, ctxt);
 }
 
-static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma,
-				struct svc_rdma_recv_ctxt *ctxt)
+static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
+				   unsigned int wanted, bool temp)
 {
+	const struct ib_recv_wr *bad_wr = NULL;
+	struct svc_rdma_recv_ctxt *ctxt;
+	struct ib_recv_wr *recv_chain;
 	int ret;
 
-	trace_svcrdma_post_recv(ctxt);
-	ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL);
+	recv_chain = NULL;
+	while (wanted--) {
+		ctxt = svc_rdma_recv_ctxt_get(rdma);
+		if (!ctxt)
+			break;
+
+		trace_svcrdma_post_recv(ctxt);
+		ctxt->rc_temp = temp;
+		ctxt->rc_recv_wr.next = recv_chain;
+		recv_chain = &ctxt->rc_recv_wr;
+		rdma->sc_pending_recvs++;
+	}
+	if (!recv_chain)
+		return false;
+
+	ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr);
 	if (ret)
 		goto err_post;
-	return 0;
+	return true;
 
 err_post:
-	trace_svcrdma_rq_post_err(rdma, ret);
-	svc_rdma_recv_ctxt_put(rdma, ctxt);
-	return ret;
-}
-
-static int svc_rdma_post_recv(struct svcxprt_rdma *rdma)
-{
-	struct svc_rdma_recv_ctxt *ctxt;
+	while (bad_wr) {
+		ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt,
+				    rc_recv_wr);
+		bad_wr = bad_wr->next;
+		svc_rdma_recv_ctxt_put(rdma, ctxt);
+	}
 
-	if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
-		return 0;
-	ctxt = svc_rdma_recv_ctxt_get(rdma);
-	if (!ctxt)
-		return -ENOMEM;
-	return __svc_rdma_post_recv(rdma, ctxt);
+	trace_svcrdma_rq_post_err(rdma, ret);
+	/* Since we're destroying the xprt, no need to reset
+	 * sc_pending_recvs. */
+	return false;
 }
 
 /**
@@ -303,20 +316,7 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma)
  */
 bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
 {
-	struct svc_rdma_recv_ctxt *ctxt;
-	unsigned int i;
-	int ret;
-
-	for (i = 0; i < rdma->sc_max_requests; i++) {
-		ctxt = svc_rdma_recv_ctxt_get(rdma);
-		if (!ctxt)
-			return false;
-		ctxt->rc_temp = true;
-		ret = __svc_rdma_post_recv(rdma, ctxt);
-		if (ret)
-			return false;
-	}
-	return true;
+	return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
 }
 
 /**
@@ -324,8 +324,6 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
  * @cq: Completion Queue context
  * @wc: Work Completion object
  *
- * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that
- * the Receive completion handler could be running.
  */
 static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 {
@@ -333,6 +331,8 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	struct ib_cqe *cqe = wc->wr_cqe;
 	struct svc_rdma_recv_ctxt *ctxt;
 
+	rdma->sc_pending_recvs--;
+
 	/* WARNING: Only wc->wr_cqe and wc->status are reliable */
 	ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe);
 
@@ -340,9 +340,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	if (wc->status != IB_WC_SUCCESS)
 		goto flushed;
 
-	if (svc_rdma_post_recv(rdma))
-		goto post_err;
-
 	/* All wc fields are now known to be valid */
 	ctxt->rc_byte_len = wc->byte_len;
 	ib_dma_sync_single_for_cpu(rdma->sc_pd->device,
@@ -356,11 +353,18 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	spin_unlock(&rdma->sc_rq_dto_lock);
 	if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
 		svc_xprt_enqueue(&rdma->sc_xprt);
+
+	if (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) &&
+	    rdma->sc_pending_recvs < rdma->sc_max_requests)
+		if (!svc_rdma_refresh_recvs(rdma, RPCRDMA_MAX_RECV_BATCH,
+					    false))
+			goto post_err;
+
 	return;
 
 flushed:
-post_err:
 	svc_rdma_recv_ctxt_put(rdma, ctxt);
+post_err:
 	set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
 	svc_xprt_enqueue(&rdma->sc_xprt);
 }
-- 
cgit v1.2.3


From 3e1a88ec96259282b9a8b45c3f1fda7a3ff4f6ea Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 9 Jan 2021 16:03:02 +0000
Subject: bio: add a helper calculating nr segments to alloc

Add a helper function calculating the number of bvec segments we need to
allocate to construct a bio. It doesn't change anything functionally,
but will be used to not duplicate special cases in the future.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c       |  7 ++++---
 fs/iomap/direct-io.c |  9 ++++-----
 include/linux/bio.h  | 10 ++++++++++
 3 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3b8963e228a1..6f5bd9950baf 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -416,7 +416,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		dio->size += bio->bi_iter.bi_size;
 		pos += bio->bi_iter.bi_size;
 
-		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES);
 		if (!nr_pages) {
 			bool polled = false;
 
@@ -481,9 +481,10 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	int nr_pages;
 
-	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
-	if (!nr_pages)
+	if (!iov_iter_count(iter))
 		return 0;
+
+	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1);
 	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
 		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
 
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 933f234d5bec..ea1e8f696076 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -250,11 +250,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 	orig_count = iov_iter_count(dio->submit.iter);
 	iov_iter_truncate(dio->submit.iter, length);
 
-	nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
-	if (nr_pages <= 0) {
-		ret = nr_pages;
+	if (!iov_iter_count(dio->submit.iter))
 		goto out;
-	}
 
 	if (need_zeroout) {
 		/* zero out from the start of the block to the write offset */
@@ -263,6 +260,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 			iomap_dio_zero(dio, iomap, pos - pad, pad);
 	}
 
+	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES);
 	do {
 		size_t n;
 		if (dio->error) {
@@ -308,7 +306,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		dio->size += n;
 		copied += n;
 
-		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
+		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
+						 BIO_MAX_PAGES);
 		iomap_dio_submit_bio(dio, iomap, bio, pos);
 		pos += n;
 	} while (nr_pages);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index e135b500df5d..9ddb19801a03 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -10,6 +10,7 @@
 #include <linux/ioprio.h>
 /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
 #include <linux/blk_types.h>
+#include <linux/uio.h>
 
 #define BIO_DEBUG
 
@@ -441,6 +442,15 @@ static inline void bio_wouldblock_error(struct bio *bio)
 	bio_endio(bio);
 }
 
+/*
+ * Calculate number of bvec segments that should be allocated to fit data
+ * pointed by @iter.
+ */
+static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
+{
+	return iov_iter_npages(iter, max_segs);
+}
+
 struct request_queue;
 
 extern int submit_bio_wait(struct bio *bio);
-- 
cgit v1.2.3


From c42bca92be928ce7dece5fc04cf68d0e37ee6718 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 9 Jan 2021 16:03:03 +0000
Subject: bio: don't copy bvec for direct IO

The block layer spends quite a while in blkdev_direct_IO() to copy and
initialise bio's bvec. However, if we've already got a bvec in the input
iterator it might be reused in some cases, i.e. when new
ITER_BVEC_FLAG_FIXED flag is set. Simple tests show considerable
performance boost, and it also reduces memory footprint.

Suggested-by: Matthew Wilcox <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/filesystems/porting.rst |  9 +++++
 block/bio.c                           | 67 +++++++++++++++--------------------
 include/linux/bio.h                   |  5 ++-
 3 files changed, 42 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index c722d94f29ea..1f8cf8e10b34 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -872,3 +872,12 @@ its result is kern_unmount() or kern_unmount_array().
 
 zero-length bvec segments are disallowed, they must be filtered out before
 passed on to an iterator.
+
+---
+
+**mandatory**
+
+For bvec based itererators bio_iov_iter_get_pages() now doesn't copy bvecs but
+uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and
+page references stay until I/O has completed, i.e. until ->ki_complete() has
+been called or returned with non -EIOCBQUEUED code.
diff --git a/block/bio.c b/block/bio.c
index 1cd8a2e79048..99040a7e6656 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -942,21 +942,17 @@ void bio_release_pages(struct bio *bio, bool mark_dirty)
 }
 EXPORT_SYMBOL_GPL(bio_release_pages);
 
-static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
+static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 {
-	const struct bio_vec *bv = iter->bvec;
-	unsigned int len;
-	size_t size;
-
-	if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len))
-		return -EINVAL;
-
-	len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count);
-	size = bio_add_page(bio, bv->bv_page, len,
-				bv->bv_offset + iter->iov_offset);
-	if (unlikely(size != len))
-		return -EINVAL;
-	iov_iter_advance(iter, size);
+	WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0);
+
+	bio->bi_vcnt = iter->nr_segs;
+	bio->bi_max_vecs = iter->nr_segs;
+	bio->bi_io_vec = (struct bio_vec *)iter->bvec;
+	bio->bi_iter.bi_bvec_done = iter->iov_offset;
+	bio->bi_iter.bi_size = iter->count;
+
+	iov_iter_advance(iter, iter->count);
 	return 0;
 }
 
@@ -1070,12 +1066,12 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
  * This takes either an iterator pointing to user memory, or one pointing to
  * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
  * map them into the kernel. On IO completion, the caller should put those
- * pages. If we're adding kernel pages, and the caller told us it's safe to
- * do so, we just have to add the pages to the bio directly. We don't grab an
- * extra reference to those pages (the user should already have that), and we
- * don't put the page on IO completion. The caller needs to check if the bio is
- * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
- * released.
+ * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
+ * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
+ * to ensure the bvecs and pages stay referenced until the submitted I/O is
+ * completed by a call to ->ki_complete() or returns with an error other than
+ * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
+ * on IO completion. If it isn't, then pages should be released.
  *
  * The function tries, but does not guarantee, to pin as many pages as
  * fit into the bio, or are requested in @iter, whatever is smaller. If
@@ -1087,27 +1083,22 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
  */
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
-	const bool is_bvec = iov_iter_is_bvec(iter);
-	int ret;
-
-	if (WARN_ON_ONCE(bio->bi_vcnt))
-		return -EINVAL;
+	int ret = 0;
 
-	do {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
-			if (WARN_ON_ONCE(is_bvec))
-				return -EINVAL;
-			ret = __bio_iov_append_get_pages(bio, iter);
-		} else {
-			if (is_bvec)
-				ret = __bio_iov_bvec_add_pages(bio, iter);
+	if (iov_iter_is_bvec(iter)) {
+		if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
+			return -EINVAL;
+		bio_iov_bvec_set(bio, iter);
+		bio_set_flag(bio, BIO_NO_PAGE_REF);
+		return 0;
+	} else {
+		do {
+			if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+				ret = __bio_iov_append_get_pages(bio, iter);
 			else
 				ret = __bio_iov_iter_get_pages(bio, iter);
-		}
-	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
-
-	if (is_bvec)
-		bio_set_flag(bio, BIO_NO_PAGE_REF);
+		} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
+	}
 
 	/* don't account direct I/O as memory stall */
 	bio_clear_flag(bio, BIO_WORKINGSET);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9ddb19801a03..676870b2c88d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -444,10 +444,13 @@ static inline void bio_wouldblock_error(struct bio *bio)
 
 /*
  * Calculate number of bvec segments that should be allocated to fit data
- * pointed by @iter.
+ * pointed by @iter. If @iter is backed by bvec it's going to be reused
+ * instead of allocating a new one.
  */
 static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
 {
+	if (iov_iter_is_bvec(iter))
+		return 0;
 	return iov_iter_npages(iter, max_segs);
 }
 
-- 
cgit v1.2.3


From 9d56653d14cd5e545599cd9e3013daa17df50cd4 Mon Sep 17 00:00:00 2001
From: Jiaxun Yang <jiaxun.yang@flygoat.com>
Date: Mon, 25 Jan 2021 12:59:56 +0100
Subject: ACPI: platform-profile: Drop const qualifier for cur_profile

Drop the const qualifier from the static global cur_profile
pointer declaration.

This is a preparation patch for passing the cur_profile pointer as
parameter to the profile_get() and profile_set() callbacks so that
drivers dynamically allocating their driver-data struct, with their
platform_profile_handler struct embedded, can use this pointer to
get to their driver-data.

Note this also requires dropping the const from the pprof
platform_profile_register() function argument. Dropping this
const is not a problem, non of the queued up consumers of
platform_profile_register() actually pass in a const pointer.

Link: https://lore.kernel.org/linux-acpi/5e7a4d87-52ef-e487-9cc2-8e7094beaa08@redhat.com/
Link: https://lore.kernel.org/r/20210114073429.176462-2-jiaxun.yang@flygoat.com
Suggested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
[ hdegoede@redhat.com: Also remove const from platform_profile_register() ]
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/platform_profile.c  | 4 ++--
 include/linux/platform_profile.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index 91be50a32cc8..f65c61db7921 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -9,7 +9,7 @@
 #include <linux/platform_profile.h>
 #include <linux/sysfs.h>
 
-static const struct platform_profile_handler *cur_profile;
+static struct platform_profile_handler *cur_profile;
 static DEFINE_MUTEX(profile_lock);
 
 static const char * const profile_names[] = {
@@ -132,7 +132,7 @@ void platform_profile_notify(void)
 }
 EXPORT_SYMBOL_GPL(platform_profile_notify);
 
-int platform_profile_register(const struct platform_profile_handler *pprof)
+int platform_profile_register(struct platform_profile_handler *pprof)
 {
 	int err;
 
diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
index 3623d7108421..c797fdb3d91a 100644
--- a/include/linux/platform_profile.h
+++ b/include/linux/platform_profile.h
@@ -32,7 +32,7 @@ struct platform_profile_handler {
 	int (*profile_set)(enum platform_profile_option profile);
 };
 
-int platform_profile_register(const struct platform_profile_handler *pprof);
+int platform_profile_register(struct platform_profile_handler *pprof);
 int platform_profile_remove(void);
 void platform_profile_notify(void);
 
-- 
cgit v1.2.3


From d17655759b3fc660ea49bd7be665c193030c77c0 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Sat, 23 Jan 2021 19:34:45 +0300
Subject: reset: Add devm_reset_control_get_optional_exclusive_released()

NVIDIA Tegra DRM and media drivers will need a resource-managed-optional
variant of reset_control_get_exclusive_released() in order to switch away
from a legacy Tegra-specific PD API to a GENPD API without much hassle.
Add the new reset helper to the reset API.

Tested-by: Peter Geis <pgwipeout@gmail.com> # Ouya T30
Tested-by: Nicolas Chauvet <kwizart@gmail.com> # PAZ00 T20
Tested-by: Matt Merhar <mattmerhar@protonmail.com> # Ouya T30
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 include/linux/reset.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reset.h b/include/linux/reset.h
index 439fec7112a9..b9109efa2a5c 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -362,6 +362,25 @@ __must_check devm_reset_control_get_exclusive_released(struct device *dev,
 	return __devm_reset_control_get(dev, id, 0, false, false, false);
 }
 
+/**
+ * devm_reset_control_get_optional_exclusive_released - resource managed
+ *                                                      reset_control_get_optional_exclusive_released()
+ * @dev: device to be reset by the controller
+ * @id: reset line name
+ *
+ * Managed-and-optional variant of reset_control_get_exclusive_released(). For
+ * reset controllers returned from this function, reset_control_put() is called
+ * automatically on driver detach.
+ *
+ * See reset_control_get_exclusive_released() for more information.
+ */
+static inline struct reset_control *
+__must_check devm_reset_control_get_optional_exclusive_released(struct device *dev,
+								const char *id)
+{
+	return __devm_reset_control_get(dev, id, 0, false, true, false);
+}
+
 /**
  * devm_reset_control_get_shared - resource managed reset_control_get_shared()
  * @dev: device to be reset by the controller
-- 
cgit v1.2.3


From ba6dfce47c4d002d96cd02a304132fca76981172 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Thu, 21 Jan 2021 16:17:23 -0500
Subject: SUNRPC: Move simple_get_bytes and simple_get_netobj into private
 header

Remove duplicated helper functions to parse opaque XDR objects
and place inside new file net/sunrpc/auth_gss/auth_gss_internal.h.
In the new file carry the license and copyright from the source file
net/sunrpc/auth_gss/auth_gss.c.  Finally, update the comment inside
include/linux/sunrpc/xdr.h since lockd is not the only user of
struct xdr_netobj.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h              |  3 +--
 net/sunrpc/auth_gss/auth_gss.c          | 30 +----------------------
 net/sunrpc/auth_gss/auth_gss_internal.h | 42 +++++++++++++++++++++++++++++++++
 net/sunrpc/auth_gss/gss_krb5_mech.c     | 31 ++----------------------
 4 files changed, 46 insertions(+), 60 deletions(-)
 create mode 100644 net/sunrpc/auth_gss/auth_gss_internal.h

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 19b6dea27367..b26213ae8c1a 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -25,8 +25,7 @@ struct rpc_rqst;
 #define XDR_QUADLEN(l)		(((l) + 3) >> 2)
 
 /*
- * Generic opaque `network object.' At the kernel level, this type
- * is used only by lockd.
+ * Generic opaque `network object.'
  */
 #define XDR_MAX_NETOBJ		1024
 struct xdr_netobj {
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 4ecc2a959567..5f42aa5fc612 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -29,6 +29,7 @@
 #include <linux/uaccess.h>
 #include <linux/hashtable.h>
 
+#include "auth_gss_internal.h"
 #include "../netns.h"
 
 #include <trace/events/rpcgss.h>
@@ -125,35 +126,6 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
 	clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
 }
 
-static const void *
-simple_get_bytes(const void *p, const void *end, void *res, size_t len)
-{
-	const void *q = (const void *)((const char *)p + len);
-	if (unlikely(q > end || q < p))
-		return ERR_PTR(-EFAULT);
-	memcpy(res, p, len);
-	return q;
-}
-
-static inline const void *
-simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
-{
-	const void *q;
-	unsigned int len;
-
-	p = simple_get_bytes(p, end, &len, sizeof(len));
-	if (IS_ERR(p))
-		return p;
-	q = (const void *)((const char *)p + len);
-	if (unlikely(q > end || q < p))
-		return ERR_PTR(-EFAULT);
-	dest->data = kmemdup(p, len, GFP_NOFS);
-	if (unlikely(dest->data == NULL))
-		return ERR_PTR(-ENOMEM);
-	dest->len = len;
-	return q;
-}
-
 static struct gss_cl_ctx *
 gss_cred_get_ctx(struct rpc_cred *cred)
 {
diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h
new file mode 100644
index 000000000000..c5603242b54b
--- /dev/null
+++ b/net/sunrpc/auth_gss/auth_gss_internal.h
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/*
+ * linux/net/sunrpc/auth_gss/auth_gss_internal.h
+ *
+ * Internal definitions for RPCSEC_GSS client authentication
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ */
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/sunrpc/xdr.h>
+
+static inline const void *
+simple_get_bytes(const void *p, const void *end, void *res, size_t len)
+{
+	const void *q = (const void *)((const char *)p + len);
+	if (unlikely(q > end || q < p))
+		return ERR_PTR(-EFAULT);
+	memcpy(res, p, len);
+	return q;
+}
+
+static inline const void *
+simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
+{
+	const void *q;
+	unsigned int len;
+
+	p = simple_get_bytes(p, end, &len, sizeof(len));
+	if (IS_ERR(p))
+		return p;
+	q = (const void *)((const char *)p + len);
+	if (unlikely(q > end || q < p))
+		return ERR_PTR(-EFAULT);
+	dest->data = kmemdup(p, len, GFP_NOFS);
+	if (unlikely(dest->data == NULL))
+		return ERR_PTR(-ENOMEM);
+	dest->len = len;
+	return q;
+}
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index ae9acf3a7389..1c092b05c2bb 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -21,6 +21,8 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/gss_krb5_enctypes.h>
 
+#include "auth_gss_internal.h"
+
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
@@ -143,35 +145,6 @@ get_gss_krb5_enctype(int etype)
 	return NULL;
 }
 
-static const void *
-simple_get_bytes(const void *p, const void *end, void *res, int len)
-{
-	const void *q = (const void *)((const char *)p + len);
-	if (unlikely(q > end || q < p))
-		return ERR_PTR(-EFAULT);
-	memcpy(res, p, len);
-	return q;
-}
-
-static const void *
-simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res)
-{
-	const void *q;
-	unsigned int len;
-
-	p = simple_get_bytes(p, end, &len, sizeof(len));
-	if (IS_ERR(p))
-		return p;
-	q = (const void *)((const char *)p + len);
-	if (unlikely(q > end || q < p))
-		return ERR_PTR(-EFAULT);
-	res->data = kmemdup(p, len, GFP_NOFS);
-	if (unlikely(res->data == NULL))
-		return ERR_PTR(-ENOMEM);
-	res->len = len;
-	return q;
-}
-
 static inline const void *
 get_key(const void *p, const void *end,
 	struct krb5_ctx *ctx, struct crypto_sync_skcipher **res)
-- 
cgit v1.2.3


From 83ace77f51175023c3757e2d08a92565f9b1c7f3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 20 Jan 2021 16:30:03 +0100
Subject: netfilter: ctnetlink: remove get_ct indirection

Use nf_ct_get() directly, its a small inline helper without dependencies.

Add CONFIG_NF_CONNTRACK guards to elide the relevant part when conntrack
isn't available at all.

v2: add ifdef guard around nf_ct_get call (kernel test robot)
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h            |  2 --
 net/netfilter/nf_conntrack_netlink.c |  7 -------
 net/netfilter/nfnetlink_log.c        |  8 +++++++-
 net/netfilter/nfnetlink_queue.c      | 10 ++++++++--
 4 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 0101747de549..f0f3a8354c3c 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -463,8 +463,6 @@ extern struct nf_ct_hook __rcu *nf_ct_hook;
 struct nlattr;
 
 struct nfnl_ct_hook {
-	struct nf_conn *(*get_ct)(const struct sk_buff *skb,
-				  enum ip_conntrack_info *ctinfo);
 	size_t (*build_size)(const struct nf_conn *ct);
 	int (*build)(struct sk_buff *skb, struct nf_conn *ct,
 		     enum ip_conntrack_info ctinfo,
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 84caf3316946..1469365bac7e 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2686,12 +2686,6 @@ ctnetlink_glue_build_size(const struct nf_conn *ct)
 	       ;
 }
 
-static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb,
-					     enum ip_conntrack_info *ctinfo)
-{
-	return nf_ct_get(skb, ctinfo);
-}
-
 static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
 {
 	const struct nf_conntrack_zone *zone;
@@ -2925,7 +2919,6 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
 }
 
 static struct nfnl_ct_hook ctnetlink_glue_hook = {
-	.get_ct		= ctnetlink_glue_get_ct,
 	.build_size	= ctnetlink_glue_build_size,
 	.build		= ctnetlink_glue_build,
 	.parse		= ctnetlink_glue_parse,
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index b35e8d9a5b37..26776b88a539 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -43,6 +43,10 @@
 #include "../bridge/br_private.h"
 #endif
 
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
 #define NFULNL_COPY_DISABLED	0xff
 #define NFULNL_NLBUFSIZ_DEFAULT	NLMSG_GOODSIZE
 #define NFULNL_TIMEOUT_DEFAULT 	100	/* every second */
@@ -733,14 +737,16 @@ nfulnl_log_packet(struct net *net,
 		size += nla_total_size(sizeof(u_int32_t));
 	if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
 		size += nla_total_size(sizeof(u_int32_t));
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	if (inst->flags & NFULNL_CFG_F_CONNTRACK) {
 		nfnl_ct = rcu_dereference(nfnl_ct_hook);
 		if (nfnl_ct != NULL) {
-			ct = nfnl_ct->get_ct(skb, &ctinfo);
+			ct = nf_ct_get(skb, &ctinfo);
 			if (ct != NULL)
 				size += nfnl_ct->build_size(ct);
 		}
 	}
+#endif
 	if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE)
 		size += nfulnl_get_bridge_size(skb);
 
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index d1d8bca03b4f..48a07914fd94 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -444,13 +444,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 
 	nfnl_ct = rcu_dereference(nfnl_ct_hook);
 
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	if (queue->flags & NFQA_CFG_F_CONNTRACK) {
 		if (nfnl_ct != NULL) {
-			ct = nfnl_ct->get_ct(entskb, &ctinfo);
+			ct = nf_ct_get(entskb, &ctinfo);
 			if (ct != NULL)
 				size += nfnl_ct->build_size(ct);
 		}
 	}
+#endif
 
 	if (queue->flags & NFQA_CFG_F_UID_GID) {
 		size += (nla_total_size(sizeof(u_int32_t))	/* uid */
@@ -1104,9 +1106,10 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
 				      struct nf_queue_entry *entry,
 				      enum ip_conntrack_info *ctinfo)
 {
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	struct nf_conn *ct;
 
-	ct = nfnl_ct->get_ct(entry->skb, ctinfo);
+	ct = nf_ct_get(entry->skb, ctinfo);
 	if (ct == NULL)
 		return NULL;
 
@@ -1118,6 +1121,9 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
 				      NETLINK_CB(entry->skb).portid,
 				      nlmsg_report(nlh));
 	return ct;
+#else
+	return NULL;
+#endif
 }
 
 static int nfqa_parse_bridge(struct nf_queue_entry *entry,
-- 
cgit v1.2.3


From 30e88d017fcbeb50c4b07577fe059558361067e7 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <uwe@kleine-koenig.org>
Date: Fri, 22 Jan 2021 10:24:49 +0100
Subject: isa: Make the remove callback for isa drivers return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver core ignores the return value of the remove callback, so
don't give isa drivers the chance to provide a value.

Adapt all isa_drivers with a remove callbacks accordingly; they all
return 0 unconditionally anyhow.

Acked-by: Marc Kleine-Budde <mkl@pengutronix.de> # for drivers/net/can/sja1000/tscan1.c
Acked-by: William Breathitt Gray <vilhelm.gray@gmail.com>
Acked-by: Wolfram Sang <wsa@kernel.org> # for drivers/i2c/
Reviewed-by: Takashi Iway <tiwai@suse.de> # for sound/
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl> # for drivers/media/
Signed-off-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Link: https://lore.kernel.org/r/20210122092449.426097-4-uwe@kleine-koenig.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 drivers/base/isa.c                   | 2 +-
 drivers/i2c/busses/i2c-elektor.c     | 4 +---
 drivers/i2c/busses/i2c-pca-isa.c     | 4 +---
 drivers/input/touchscreen/htcpen.c   | 4 +---
 drivers/media/radio/radio-isa.c      | 4 +---
 drivers/media/radio/radio-isa.h      | 2 +-
 drivers/media/radio/radio-sf16fmr2.c | 4 +---
 drivers/net/can/sja1000/tscan1.c     | 4 +---
 drivers/net/ethernet/3com/3c509.c    | 3 +--
 drivers/scsi/advansys.c              | 3 +--
 drivers/scsi/aha1542.c               | 3 +--
 drivers/scsi/fdomain_isa.c           | 3 +--
 drivers/scsi/g_NCR5380.c             | 5 ++---
 drivers/watchdog/pcwd.c              | 4 +---
 include/linux/isa.h                  | 2 +-
 sound/isa/ad1848/ad1848.c            | 3 +--
 sound/isa/adlib.c                    | 3 +--
 sound/isa/cmi8328.c                  | 3 +--
 sound/isa/cmi8330.c                  | 3 +--
 sound/isa/cs423x/cs4231.c            | 3 +--
 sound/isa/cs423x/cs4236.c            | 3 +--
 sound/isa/es1688/es1688.c            | 3 +--
 sound/isa/es18xx.c                   | 5 ++---
 sound/isa/galaxy/galaxy.c            | 3 +--
 sound/isa/gus/gusclassic.c           | 3 +--
 sound/isa/gus/gusextreme.c           | 3 +--
 sound/isa/gus/gusmax.c               | 3 +--
 sound/isa/gus/interwave.c            | 3 +--
 sound/isa/msnd/msnd_pinnacle.c       | 3 +--
 sound/isa/opl3sa2.c                  | 3 +--
 sound/isa/opti9xx/miro.c             | 3 +--
 sound/isa/opti9xx/opti92x-ad1848.c   | 5 ++---
 sound/isa/sb/jazz16.c                | 3 +--
 sound/isa/sb/sb16.c                  | 3 +--
 sound/isa/sb/sb8.c                   | 3 +--
 sound/isa/sc6000.c                   | 3 +--
 sound/isa/sscape.c                   | 3 +--
 sound/isa/wavefront/wavefront.c      | 3 +--
 38 files changed, 41 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/isa.c b/drivers/base/isa.c
index 2772f5d1948a..aa4737667026 100644
--- a/drivers/base/isa.c
+++ b/drivers/base/isa.c
@@ -51,7 +51,7 @@ static int isa_bus_remove(struct device *dev)
 	struct isa_driver *isa_driver = dev->platform_data;
 
 	if (isa_driver && isa_driver->remove)
-		return isa_driver->remove(dev, to_isa_dev(dev)->id);
+		isa_driver->remove(dev, to_isa_dev(dev)->id);
 
 	return 0;
 }
diff --git a/drivers/i2c/busses/i2c-elektor.c b/drivers/i2c/busses/i2c-elektor.c
index 140426db28df..b72a3c3ef2ab 100644
--- a/drivers/i2c/busses/i2c-elektor.c
+++ b/drivers/i2c/busses/i2c-elektor.c
@@ -282,7 +282,7 @@ static int elektor_probe(struct device *dev, unsigned int id)
 	return -ENODEV;
 }
 
-static int elektor_remove(struct device *dev, unsigned int id)
+static void elektor_remove(struct device *dev, unsigned int id)
 {
 	i2c_del_adapter(&pcf_isa_ops);
 
@@ -298,8 +298,6 @@ static int elektor_remove(struct device *dev, unsigned int id)
 		iounmap(base_iomem);
 		release_mem_region(base, 2);
 	}
-
-	return 0;
 }
 
 static struct isa_driver i2c_elektor_driver = {
diff --git a/drivers/i2c/busses/i2c-pca-isa.c b/drivers/i2c/busses/i2c-pca-isa.c
index f27bc1e55385..85e8cf58e8bf 100644
--- a/drivers/i2c/busses/i2c-pca-isa.c
+++ b/drivers/i2c/busses/i2c-pca-isa.c
@@ -161,7 +161,7 @@ static int pca_isa_probe(struct device *dev, unsigned int id)
 	return -ENODEV;
 }
 
-static int pca_isa_remove(struct device *dev, unsigned int id)
+static void pca_isa_remove(struct device *dev, unsigned int id)
 {
 	i2c_del_adapter(&pca_isa_ops);
 
@@ -170,8 +170,6 @@ static int pca_isa_remove(struct device *dev, unsigned int id)
 		free_irq(irq, &pca_isa_ops);
 	}
 	release_region(base, IO_SIZE);
-
-	return 0;
 }
 
 static struct isa_driver pca_isa_driver = {
diff --git a/drivers/input/touchscreen/htcpen.c b/drivers/input/touchscreen/htcpen.c
index 2f261a34f9c2..056ba76087e8 100644
--- a/drivers/input/touchscreen/htcpen.c
+++ b/drivers/input/touchscreen/htcpen.c
@@ -171,7 +171,7 @@ static int htcpen_isa_probe(struct device *dev, unsigned int id)
 	return err;
 }
 
-static int htcpen_isa_remove(struct device *dev, unsigned int id)
+static void htcpen_isa_remove(struct device *dev, unsigned int id)
 {
 	struct input_dev *htcpen_dev = dev_get_drvdata(dev);
 
@@ -182,8 +182,6 @@ static int htcpen_isa_remove(struct device *dev, unsigned int id)
 	release_region(HTCPEN_PORT_INDEX, 2);
 	release_region(HTCPEN_PORT_INIT, 1);
 	release_region(HTCPEN_PORT_IRQ_CLEAR, 1);
-
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/media/radio/radio-isa.c b/drivers/media/radio/radio-isa.c
index 527f4c3b0ca4..c591c0851fa2 100644
--- a/drivers/media/radio/radio-isa.c
+++ b/drivers/media/radio/radio-isa.c
@@ -337,13 +337,11 @@ int radio_isa_probe(struct device *pdev, unsigned int dev)
 }
 EXPORT_SYMBOL_GPL(radio_isa_probe);
 
-int radio_isa_remove(struct device *pdev, unsigned int dev)
+void radio_isa_remove(struct device *pdev, unsigned int dev)
 {
 	struct radio_isa_card *isa = dev_get_drvdata(pdev);
 
 	radio_isa_common_remove(isa, isa->drv->region_size);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(radio_isa_remove);
 
diff --git a/drivers/media/radio/radio-isa.h b/drivers/media/radio/radio-isa.h
index 2f0736edfda8..c9159958203e 100644
--- a/drivers/media/radio/radio-isa.h
+++ b/drivers/media/radio/radio-isa.h
@@ -91,7 +91,7 @@ struct radio_isa_driver {
 
 int radio_isa_match(struct device *pdev, unsigned int dev);
 int radio_isa_probe(struct device *pdev, unsigned int dev);
-int radio_isa_remove(struct device *pdev, unsigned int dev);
+void radio_isa_remove(struct device *pdev, unsigned int dev);
 #ifdef CONFIG_PNP
 int radio_isa_pnp_probe(struct pnp_dev *dev,
 			const struct pnp_device_id *dev_id);
diff --git a/drivers/media/radio/radio-sf16fmr2.c b/drivers/media/radio/radio-sf16fmr2.c
index 0388894cfe41..d0dde55b7930 100644
--- a/drivers/media/radio/radio-sf16fmr2.c
+++ b/drivers/media/radio/radio-sf16fmr2.c
@@ -293,11 +293,9 @@ static void fmr2_remove(struct fmr2 *fmr2)
 	kfree(fmr2);
 }
 
-static int fmr2_isa_remove(struct device *pdev, unsigned int ndev)
+static void fmr2_isa_remove(struct device *pdev, unsigned int ndev)
 {
 	fmr2_remove(dev_get_drvdata(pdev));
-
-	return 0;
 }
 
 static void fmr2_pnp_remove(struct pnp_dev *pdev)
diff --git a/drivers/net/can/sja1000/tscan1.c b/drivers/net/can/sja1000/tscan1.c
index 6ea802c66124..3dbba8d61afb 100644
--- a/drivers/net/can/sja1000/tscan1.c
+++ b/drivers/net/can/sja1000/tscan1.c
@@ -159,7 +159,7 @@ static int tscan1_probe(struct device *dev, unsigned id)
 	return -ENXIO;
 }
 
-static int tscan1_remove(struct device *dev, unsigned id /*unused*/)
+static void tscan1_remove(struct device *dev, unsigned id /*unused*/)
 {
 	struct net_device *netdev;
 	struct sja1000_priv *priv;
@@ -179,8 +179,6 @@ static int tscan1_remove(struct device *dev, unsigned id /*unused*/)
 	release_region(pld_base, TSCAN1_PLD_SIZE);
 
 	free_sja1000dev(netdev);
-
-	return 0;
 }
 
 static struct isa_driver tscan1_isa_driver = {
diff --git a/drivers/net/ethernet/3com/3c509.c b/drivers/net/ethernet/3com/3c509.c
index 667f38c9e4c6..53e1f7e07959 100644
--- a/drivers/net/ethernet/3com/3c509.c
+++ b/drivers/net/ethernet/3com/3c509.c
@@ -335,12 +335,11 @@ static int el3_isa_match(struct device *pdev, unsigned int ndev)
 	return 1;
 }
 
-static int el3_isa_remove(struct device *pdev,
+static void el3_isa_remove(struct device *pdev,
 				    unsigned int ndev)
 {
 	el3_device_remove(pdev);
 	dev_set_drvdata(pdev, NULL);
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c
index 79830e77afa9..b1e97f75b0ba 100644
--- a/drivers/scsi/advansys.c
+++ b/drivers/scsi/advansys.c
@@ -11459,12 +11459,11 @@ static int advansys_isa_probe(struct device *dev, unsigned int id)
 	return err;
 }
 
-static int advansys_isa_remove(struct device *dev, unsigned int id)
+static void advansys_isa_remove(struct device *dev, unsigned int id)
 {
 	int ioport = _asc_def_iop_base[id];
 	advansys_release(dev_get_drvdata(dev));
 	release_region(ioport, ASC_IOADR_GAP);
-	return 0;
 }
 
 static struct isa_driver advansys_isa_driver = {
diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c
index dc5667afeb27..e0d8cca1c70b 100644
--- a/drivers/scsi/aha1542.c
+++ b/drivers/scsi/aha1542.c
@@ -1025,12 +1025,11 @@ static int aha1542_isa_match(struct device *pdev, unsigned int ndev)
 	return 1;
 }
 
-static int aha1542_isa_remove(struct device *pdev,
+static void aha1542_isa_remove(struct device *pdev,
 				    unsigned int ndev)
 {
 	aha1542_release(dev_get_drvdata(pdev));
 	dev_set_drvdata(pdev, NULL);
-	return 0;
 }
 
 static struct isa_driver aha1542_isa_driver = {
diff --git a/drivers/scsi/fdomain_isa.c b/drivers/scsi/fdomain_isa.c
index e0cdcd2003d0..2b4280a43a53 100644
--- a/drivers/scsi/fdomain_isa.c
+++ b/drivers/scsi/fdomain_isa.c
@@ -175,7 +175,7 @@ static int fdomain_isa_param_match(struct device *dev, unsigned int ndev)
 	return 1;
 }
 
-static int fdomain_isa_remove(struct device *dev, unsigned int ndev)
+static void fdomain_isa_remove(struct device *dev, unsigned int ndev)
 {
 	struct Scsi_Host *sh = dev_get_drvdata(dev);
 	int base = sh->io_port;
@@ -183,7 +183,6 @@ static int fdomain_isa_remove(struct device *dev, unsigned int ndev)
 	fdomain_destroy(sh);
 	release_region(base, FDOMAIN_REGION_SIZE);
 	dev_set_drvdata(dev, NULL);
-	return 0;
 }
 
 static struct isa_driver fdomain_isa_driver = {
diff --git a/drivers/scsi/g_NCR5380.c b/drivers/scsi/g_NCR5380.c
index 2df2f38a9b12..7ba3c9312731 100644
--- a/drivers/scsi/g_NCR5380.c
+++ b/drivers/scsi/g_NCR5380.c
@@ -720,12 +720,11 @@ static int generic_NCR5380_isa_match(struct device *pdev, unsigned int ndev)
 	return 1;
 }
 
-static int generic_NCR5380_isa_remove(struct device *pdev,
-                                      unsigned int ndev)
+static void generic_NCR5380_isa_remove(struct device *pdev,
+				       unsigned int ndev)
 {
 	generic_NCR5380_release_resources(dev_get_drvdata(pdev));
 	dev_set_drvdata(pdev, NULL);
-	return 0;
 }
 
 static struct isa_driver generic_NCR5380_isa_driver = {
diff --git a/drivers/watchdog/pcwd.c b/drivers/watchdog/pcwd.c
index b95cd38f3ceb..a793b03a785d 100644
--- a/drivers/watchdog/pcwd.c
+++ b/drivers/watchdog/pcwd.c
@@ -951,7 +951,7 @@ error_request_region:
 	return ret;
 }
 
-static int pcwd_isa_remove(struct device *dev, unsigned int id)
+static void pcwd_isa_remove(struct device *dev, unsigned int id)
 {
 	if (debug >= DEBUG)
 		pr_debug("pcwd_isa_remove id=%d\n", id);
@@ -968,8 +968,6 @@ static int pcwd_isa_remove(struct device *dev, unsigned int id)
 			(pcwd_private.revision == PCWD_REVISION_A) ? 2 : 4);
 	pcwd_private.io_addr = 0x0000;
 	cards_found--;
-
-	return 0;
 }
 
 static void pcwd_isa_shutdown(struct device *dev, unsigned int id)
diff --git a/include/linux/isa.h b/include/linux/isa.h
index 41336da0f4e7..e30963190968 100644
--- a/include/linux/isa.h
+++ b/include/linux/isa.h
@@ -13,7 +13,7 @@
 struct isa_driver {
 	int (*match)(struct device *, unsigned int);
 	int (*probe)(struct device *, unsigned int);
-	int (*remove)(struct device *, unsigned int);
+	void (*remove)(struct device *, unsigned int);
 	void (*shutdown)(struct device *, unsigned int);
 	int (*suspend)(struct device *, unsigned int, pm_message_t);
 	int (*resume)(struct device *, unsigned int);
diff --git a/sound/isa/ad1848/ad1848.c b/sound/isa/ad1848/ad1848.c
index 593c6e959afe..48f7cc57c3da 100644
--- a/sound/isa/ad1848/ad1848.c
+++ b/sound/isa/ad1848/ad1848.c
@@ -118,10 +118,9 @@ out:	snd_card_free(card);
 	return error;
 }
 
-static int snd_ad1848_remove(struct device *dev, unsigned int n)
+static void snd_ad1848_remove(struct device *dev, unsigned int n)
 {
 	snd_card_free(dev_get_drvdata(dev));
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/adlib.c b/sound/isa/adlib.c
index 5105524b6f38..e6cd7c4da38e 100644
--- a/sound/isa/adlib.c
+++ b/sound/isa/adlib.c
@@ -97,10 +97,9 @@ out:	snd_card_free(card);
 	return error;
 }
 
-static int snd_adlib_remove(struct device *dev, unsigned int n)
+static void snd_adlib_remove(struct device *dev, unsigned int n)
 {
 	snd_card_free(dev_get_drvdata(dev));
-	return 0;
 }
 
 static struct isa_driver snd_adlib_driver = {
diff --git a/sound/isa/cmi8328.c b/sound/isa/cmi8328.c
index faca5dd95bfe..3b9fbb02864b 100644
--- a/sound/isa/cmi8328.c
+++ b/sound/isa/cmi8328.c
@@ -403,7 +403,7 @@ error:
 	return err;
 }
 
-static int snd_cmi8328_remove(struct device *pdev, unsigned int dev)
+static void snd_cmi8328_remove(struct device *pdev, unsigned int dev)
 {
 	struct snd_card *card = dev_get_drvdata(pdev);
 	struct snd_cmi8328 *cmi = card->private_data;
@@ -420,7 +420,6 @@ static int snd_cmi8328_remove(struct device *pdev, unsigned int dev)
 	snd_cmi8328_cfg_write(cmi->port, CFG2, 0);
 	snd_cmi8328_cfg_write(cmi->port, CFG3, 0);
 	snd_card_free(card);
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/cmi8330.c b/sound/isa/cmi8330.c
index 4669eb0cc8ce..19e258527d69 100644
--- a/sound/isa/cmi8330.c
+++ b/sound/isa/cmi8330.c
@@ -631,11 +631,10 @@ static int snd_cmi8330_isa_probe(struct device *pdev,
 	return 0;
 }
 
-static int snd_cmi8330_isa_remove(struct device *devptr,
+static void snd_cmi8330_isa_remove(struct device *devptr,
 				  unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(devptr));
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/cs423x/cs4231.c b/sound/isa/cs423x/cs4231.c
index 2135963eba78..383ee621cea1 100644
--- a/sound/isa/cs423x/cs4231.c
+++ b/sound/isa/cs423x/cs4231.c
@@ -135,10 +135,9 @@ out:	snd_card_free(card);
 	return error;
 }
 
-static int snd_cs4231_remove(struct device *dev, unsigned int n)
+static void snd_cs4231_remove(struct device *dev, unsigned int n)
 {
 	snd_card_free(dev_get_drvdata(dev));
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/cs423x/cs4236.c b/sound/isa/cs423x/cs4236.c
index fa3c39cff5f8..24688271e73f 100644
--- a/sound/isa/cs423x/cs4236.c
+++ b/sound/isa/cs423x/cs4236.c
@@ -487,11 +487,10 @@ static int snd_cs423x_isa_probe(struct device *pdev,
 	return 0;
 }
 
-static int snd_cs423x_isa_remove(struct device *pdev,
+static void snd_cs423x_isa_remove(struct device *pdev,
 				 unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(pdev));
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/es1688/es1688.c b/sound/isa/es1688/es1688.c
index 64610571a5e1..d99bb3f8f0c1 100644
--- a/sound/isa/es1688/es1688.c
+++ b/sound/isa/es1688/es1688.c
@@ -192,10 +192,9 @@ out:
 	return error;
 }
 
-static int snd_es1688_isa_remove(struct device *dev, unsigned int n)
+static void snd_es1688_isa_remove(struct device *dev, unsigned int n)
 {
 	snd_card_free(dev_get_drvdata(dev));
-	return 0;
 }
 
 static struct isa_driver snd_es1688_driver = {
diff --git a/sound/isa/es18xx.c b/sound/isa/es18xx.c
index 5f8d7e8a5477..9beef8079177 100644
--- a/sound/isa/es18xx.c
+++ b/sound/isa/es18xx.c
@@ -2210,11 +2210,10 @@ static int snd_es18xx_isa_probe(struct device *pdev, unsigned int dev)
 	}
 }
 
-static int snd_es18xx_isa_remove(struct device *devptr,
-				 unsigned int dev)
+static void snd_es18xx_isa_remove(struct device *devptr,
+				  unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(devptr));
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/galaxy/galaxy.c b/sound/isa/galaxy/galaxy.c
index 65f9f46c9f58..d33d69f29924 100644
--- a/sound/isa/galaxy/galaxy.c
+++ b/sound/isa/galaxy/galaxy.c
@@ -608,10 +608,9 @@ error:
 	return err;
 }
 
-static int snd_galaxy_remove(struct device *dev, unsigned int n)
+static void snd_galaxy_remove(struct device *dev, unsigned int n)
 {
 	snd_card_free(dev_get_drvdata(dev));
-	return 0;
 }
 
 static struct isa_driver snd_galaxy_driver = {
diff --git a/sound/isa/gus/gusclassic.c b/sound/isa/gus/gusclassic.c
index 7419b1939754..015f88a11352 100644
--- a/sound/isa/gus/gusclassic.c
+++ b/sound/isa/gus/gusclassic.c
@@ -195,10 +195,9 @@ out:	snd_card_free(card);
 	return error;
 }
 
-static int snd_gusclassic_remove(struct device *dev, unsigned int n)
+static void snd_gusclassic_remove(struct device *dev, unsigned int n)
 {
 	snd_card_free(dev_get_drvdata(dev));
-	return 0;
 }
 
 static struct isa_driver snd_gusclassic_driver = {
diff --git a/sound/isa/gus/gusextreme.c b/sound/isa/gus/gusextreme.c
index ed2f9d64efae..c9f31b4fb887 100644
--- a/sound/isa/gus/gusextreme.c
+++ b/sound/isa/gus/gusextreme.c
@@ -324,10 +324,9 @@ out:	snd_card_free(card);
 	return error;
 }
 
-static int snd_gusextreme_remove(struct device *dev, unsigned int n)
+static void snd_gusextreme_remove(struct device *dev, unsigned int n)
 {
 	snd_card_free(dev_get_drvdata(dev));
-	return 0;
 }
 
 static struct isa_driver snd_gusextreme_driver = {
diff --git a/sound/isa/gus/gusmax.c b/sound/isa/gus/gusmax.c
index 05cd9be4dd8a..dc09fbd6f88d 100644
--- a/sound/isa/gus/gusmax.c
+++ b/sound/isa/gus/gusmax.c
@@ -338,10 +338,9 @@ static int snd_gusmax_probe(struct device *pdev, unsigned int dev)
 	return err;
 }
 
-static int snd_gusmax_remove(struct device *devptr, unsigned int dev)
+static void snd_gusmax_remove(struct device *devptr, unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(devptr));
-	return 0;
 }
 
 #define DEV_NAME "gusmax"
diff --git a/sound/isa/gus/interwave.c b/sound/isa/gus/interwave.c
index 3e9ad930deae..e4d412e72b75 100644
--- a/sound/isa/gus/interwave.c
+++ b/sound/isa/gus/interwave.c
@@ -825,10 +825,9 @@ static int snd_interwave_isa_probe(struct device *pdev,
 	}
 }
 
-static int snd_interwave_isa_remove(struct device *devptr, unsigned int dev)
+static void snd_interwave_isa_remove(struct device *devptr, unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(devptr));
-	return 0;
 }
 
 static struct isa_driver snd_interwave_driver = {
diff --git a/sound/isa/msnd/msnd_pinnacle.c b/sound/isa/msnd/msnd_pinnacle.c
index 24b34ecf5e5b..69647b41300d 100644
--- a/sound/isa/msnd/msnd_pinnacle.c
+++ b/sound/isa/msnd/msnd_pinnacle.c
@@ -1049,10 +1049,9 @@ cfg_error:
 #endif
 }
 
-static int snd_msnd_isa_remove(struct device *pdev, unsigned int dev)
+static void snd_msnd_isa_remove(struct device *pdev, unsigned int dev)
 {
 	snd_msnd_unload(dev_get_drvdata(pdev));
-	return 0;
 }
 
 static struct isa_driver snd_msnd_driver = {
diff --git a/sound/isa/opl3sa2.c b/sound/isa/opl3sa2.c
index 85a181acd388..7649a8a4128d 100644
--- a/sound/isa/opl3sa2.c
+++ b/sound/isa/opl3sa2.c
@@ -878,11 +878,10 @@ static int snd_opl3sa2_isa_probe(struct device *pdev,
 	return 0;
 }
 
-static int snd_opl3sa2_isa_remove(struct device *devptr,
+static void snd_opl3sa2_isa_remove(struct device *devptr,
 				  unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(devptr));
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/opti9xx/miro.c b/sound/isa/opti9xx/miro.c
index 44ed1b65f6ce..20933342f5eb 100644
--- a/sound/isa/opti9xx/miro.c
+++ b/sound/isa/opti9xx/miro.c
@@ -1480,11 +1480,10 @@ static int snd_miro_isa_probe(struct device *devptr, unsigned int n)
 	return 0;
 }
 
-static int snd_miro_isa_remove(struct device *devptr,
+static void snd_miro_isa_remove(struct device *devptr,
 			       unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(devptr));
-	return 0;
 }
 
 #define DEV_NAME "miro"
diff --git a/sound/isa/opti9xx/opti92x-ad1848.c b/sound/isa/opti9xx/opti92x-ad1848.c
index 881d3b5711d2..758f5b579138 100644
--- a/sound/isa/opti9xx/opti92x-ad1848.c
+++ b/sound/isa/opti9xx/opti92x-ad1848.c
@@ -1024,11 +1024,10 @@ static int snd_opti9xx_isa_probe(struct device *devptr,
 	return 0;
 }
 
-static int snd_opti9xx_isa_remove(struct device *devptr,
-				  unsigned int dev)
+static void snd_opti9xx_isa_remove(struct device *devptr,
+				   unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(devptr));
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/sb/jazz16.c b/sound/isa/sb/jazz16.c
index ee379bbf70a4..0e2e0ab3b9e4 100644
--- a/sound/isa/sb/jazz16.c
+++ b/sound/isa/sb/jazz16.c
@@ -339,12 +339,11 @@ err_free:
 	return err;
 }
 
-static int snd_jazz16_remove(struct device *devptr, unsigned int dev)
+static void snd_jazz16_remove(struct device *devptr, unsigned int dev)
 {
 	struct snd_card *card = dev_get_drvdata(devptr);
 
 	snd_card_free(card);
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/sb/sb16.c b/sound/isa/sb/sb16.c
index 479197c13803..db284b7b88a7 100644
--- a/sound/isa/sb/sb16.c
+++ b/sound/isa/sb/sb16.c
@@ -547,10 +547,9 @@ static int snd_sb16_isa_probe(struct device *pdev, unsigned int dev)
 	}
 }
 
-static int snd_sb16_isa_remove(struct device *pdev, unsigned int dev)
+static void snd_sb16_isa_remove(struct device *pdev, unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(pdev));
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/sb/sb8.c b/sound/isa/sb/sb8.c
index 438109f167d6..8e3e67b9a341 100644
--- a/sound/isa/sb/sb8.c
+++ b/sound/isa/sb/sb8.c
@@ -192,10 +192,9 @@ static int snd_sb8_probe(struct device *pdev, unsigned int dev)
 	return err;
 }
 
-static int snd_sb8_remove(struct device *pdev, unsigned int dev)
+static void snd_sb8_remove(struct device *pdev, unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(pdev));
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/sound/isa/sc6000.c b/sound/isa/sc6000.c
index 3d0bea44f454..def137579717 100644
--- a/sound/isa/sc6000.c
+++ b/sound/isa/sc6000.c
@@ -672,7 +672,7 @@ err_exit:
 	return err;
 }
 
-static int snd_sc6000_remove(struct device *devptr, unsigned int dev)
+static void snd_sc6000_remove(struct device *devptr, unsigned int dev)
 {
 	struct snd_card *card = dev_get_drvdata(devptr);
 	char __iomem **vport = card->private_data;
@@ -684,7 +684,6 @@ static int snd_sc6000_remove(struct device *devptr, unsigned int dev)
 	release_region(mss_port[dev], 4);
 
 	snd_card_free(card);
-	return 0;
 }
 
 static struct isa_driver snd_sc6000_driver = {
diff --git a/sound/isa/sscape.c b/sound/isa/sscape.c
index 2e5a5c5279e8..e70ef9aee545 100644
--- a/sound/isa/sscape.c
+++ b/sound/isa/sscape.c
@@ -1183,10 +1183,9 @@ _release_card:
 	return ret;
 }
 
-static int snd_sscape_remove(struct device *devptr, unsigned int dev)
+static void snd_sscape_remove(struct device *devptr, unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(devptr));
-	return 0;
 }
 
 #define DEV_NAME "sscape"
diff --git a/sound/isa/wavefront/wavefront.c b/sound/isa/wavefront/wavefront.c
index 9e0f6b226775..b750a4fd40de 100644
--- a/sound/isa/wavefront/wavefront.c
+++ b/sound/isa/wavefront/wavefront.c
@@ -565,11 +565,10 @@ static int snd_wavefront_isa_probe(struct device *pdev,
 	return 0;
 }
 
-static int snd_wavefront_isa_remove(struct device *devptr,
+static void snd_wavefront_isa_remove(struct device *devptr,
 				    unsigned int dev)
 {
 	snd_card_free(dev_get_drvdata(devptr));
-	return 0;
 }
 
 #define DEV_NAME "wavefront"
-- 
cgit v1.2.3


From ceecd1bff6f9a741adc173f062f1f9312c3a66c8 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 19 Jan 2021 12:07:22 -0800
Subject: HID: correct kernel-doc notation in <linux/hid*.h>

Correct kernel-doc notation in HID header files (include/linux/hid*.h).
Add notation (comments) where it is missing.
Use the documented "Return:" notation for function return values.
Fix a few typos/spellos.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: linux-input@vger.kernel.org
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/hid-sensor-hub.h |  9 +++++----
 include/linux/hid.h            | 15 +++++++++++----
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h
index 46bcef380446..763802b2b8f9 100644
--- a/include/linux/hid-sensor-hub.h
+++ b/include/linux/hid-sensor-hub.h
@@ -150,7 +150,7 @@ int sensor_hub_remove_callback(struct hid_sensor_hub_device *hsdev,
 * @info:	return information about attribute after parsing report
 *
 * Parses report and returns the attribute information such as report id,
-* field index, units and exponet etc.
+* field index, units and exponent etc.
 */
 int sensor_hub_input_get_attribute_info(struct hid_sensor_hub_device *hsdev,
 			u8 type,
@@ -167,7 +167,7 @@ int sensor_hub_input_get_attribute_info(struct hid_sensor_hub_device *hsdev,
 * @is_signed:   If true then fields < 32 bits will be sign-extended
 *
 * Issues a synchronous or asynchronous read request for an input attribute.
-* Returns data upto 32 bits.
+* Return: data up to 32 bits.
 */
 
 enum sensor_hub_read_flags {
@@ -205,8 +205,9 @@ int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
 * @buffer:	buffer to copy output
 *
 * Used to get a field in feature report. For example this can get polling
-* interval, sensitivity, activate/deactivate state. On success it returns
-* number of bytes copied to buffer. On failure, it returns value < 0.
+* interval, sensitivity, activate/deactivate state.
+* Return: On success, it returns the number of bytes copied to buffer.
+* On failure, it returns value < 0.
 */
 int sensor_hub_get_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
 			   u32 field_index, int buffer_size, void *buffer);
diff --git a/include/linux/hid.h b/include/linux/hid.h
index c39d71eb1fd0..ef702b3f56e3 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -918,7 +918,7 @@ __u32 hid_field_extract(const struct hid_device *hid, __u8 *report,
 /**
  * hid_device_io_start - enable HID input during probe, remove
  *
- * @hid - the device
+ * @hid: the device
  *
  * This should only be called during probe or remove and only be
  * called by the thread calling probe or remove. It will allow
@@ -936,7 +936,7 @@ static inline void hid_device_io_start(struct hid_device *hid) {
 /**
  * hid_device_io_stop - disable HID input during probe, remove
  *
- * @hid - the device
+ * @hid: the device
  *
  * Should only be called after hid_device_io_start. It will prevent
  * incoming packets from going to the driver for the duration of
@@ -1010,6 +1010,13 @@ static inline void hid_map_usage(struct hid_input *hidinput,
 /**
  * hid_map_usage_clear - map usage input bits and clear the input bit
  *
+ * @hidinput: hidinput which we are interested in
+ * @usage: usage to fill in
+ * @bit: pointer to input->{}bit (out parameter)
+ * @max: maximal valid usage->code to consider later (out parameter)
+ * @type: input event type (EV_KEY, EV_REL, ...)
+ * @c: code which corresponds to this usage and type
+ *
  * The same as hid_map_usage, except the @c bit is also cleared in supported
  * bits (@bit).
  */
@@ -1084,7 +1091,7 @@ static inline void hid_hw_request(struct hid_device *hdev,
  * @rtype: HID report type
  * @reqtype: HID_REQ_GET_REPORT or HID_REQ_SET_REPORT
  *
- * @return: count of data transfered, negative if error
+ * Return: count of data transferred, negative if error
  *
  * Same behavior as hid_hw_request, but with raw buffers instead.
  */
@@ -1106,7 +1113,7 @@ static inline int hid_hw_raw_request(struct hid_device *hdev,
  * @buf: raw data to transfer
  * @len: length of buf
  *
- * @return: count of data transfered, negative if error
+ * Return: count of data transferred, negative if error
  */
 static inline int hid_hw_output_report(struct hid_device *hdev, __u8 *buf,
 					size_t len)
-- 
cgit v1.2.3


From 081df94301e317e84c3413686043987da2c3e39d Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 21 Jan 2021 15:29:23 -0600
Subject: objtool: Add asm version of STACK_FRAME_NON_STANDARD

To be used for adding asm functions to the ignore list.  The "aw" is
needed to help the ELF section metadata match GCC-created sections.
Otherwise the linker creates duplicate sections instead of combining
them.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/8faa476f9a5ac89af27944ec184c89f95f3c6c49.1611263462.git.jpoimboe@redhat.com
---
 include/linux/objtool.h       | 8 ++++++++
 tools/include/linux/objtool.h | 8 ++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 577f51436cf9..add1c6eb157e 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -109,6 +109,12 @@ struct unwind_hint {
 	.popsection
 .endm
 
+.macro STACK_FRAME_NON_STANDARD func:req
+	.pushsection .discard.func_stack_frame_non_standard, "aw"
+		.long \func - .
+	.popsection
+.endm
+
 #endif /* __ASSEMBLY__ */
 
 #else /* !CONFIG_STACK_VALIDATION */
@@ -122,6 +128,8 @@ struct unwind_hint {
 #define ANNOTATE_INTRA_FUNCTION_CALL
 .macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
 .endm
+.macro STACK_FRAME_NON_STANDARD func:req
+.endm
 #endif
 
 #endif /* CONFIG_STACK_VALIDATION */
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index 577f51436cf9..add1c6eb157e 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -109,6 +109,12 @@ struct unwind_hint {
 	.popsection
 .endm
 
+.macro STACK_FRAME_NON_STANDARD func:req
+	.pushsection .discard.func_stack_frame_non_standard, "aw"
+		.long \func - .
+	.popsection
+.endm
+
 #endif /* __ASSEMBLY__ */
 
 #else /* !CONFIG_STACK_VALIDATION */
@@ -122,6 +128,8 @@ struct unwind_hint {
 #define ANNOTATE_INTRA_FUNCTION_CALL
 .macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
 .endm
+.macro STACK_FRAME_NON_STANDARD func:req
+.endm
 #endif
 
 #endif /* CONFIG_STACK_VALIDATION */
-- 
cgit v1.2.3


From b735bd3e68824316655252a931a3353a6ebc036f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 21 Jan 2021 15:29:24 -0600
Subject: objtool: Combine UNWIND_HINT_RET_OFFSET and UNWIND_HINT_FUNC

The ORC metadata generated for UNWIND_HINT_FUNC isn't actually very
func-like.  With certain usages it can cause stack state mismatches
because it doesn't set the return address (CFI_RA).

Also, users of UNWIND_HINT_RET_OFFSET no longer need to set a custom
return stack offset.  Instead they just need to specify a func-like
situation, so the current ret_offset code is hacky for no good reason.

Solve both problems by simplifying the RET_OFFSET handling and
converting it into a more useful UNWIND_HINT_FUNC.

If we end up needing the old 'ret_offset' functionality again in the
future, we should be able to support it pretty easily with the addition
of a custom 'sp_offset' in UNWIND_HINT_FUNC.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/db9d1f5d79dddfbb3725ef6d8ec3477ad199948d.1611263462.git.jpoimboe@redhat.com
---
 arch/x86/include/asm/unwind_hints.h   | 13 ++----------
 arch/x86/kernel/ftrace_64.S           |  2 +-
 arch/x86/lib/retpoline.S              |  2 +-
 include/linux/objtool.h               |  5 ++++-
 tools/include/linux/objtool.h         |  5 ++++-
 tools/objtool/arch/x86/decode.c       |  4 ++--
 tools/objtool/check.c                 | 37 ++++++++++++++---------------------
 tools/objtool/include/objtool/check.h |  1 -
 8 files changed, 29 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 664d4610d700..8e574c0afef8 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -48,17 +48,8 @@
 	UNWIND_HINT_REGS base=\base offset=\offset partial=1
 .endm
 
-.macro UNWIND_HINT_FUNC sp_offset=8
-	UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL
-.endm
-
-/*
- * RET_OFFSET: Used on instructions that terminate a function; mostly RETURN
- * and sibling calls. On these, sp_offset denotes the expected offset from
- * initial_func_cfi.
- */
-.macro UNWIND_HINT_RET_OFFSET sp_offset=8
-	UNWIND_HINT sp_reg=ORC_REG_SP type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset
+.macro UNWIND_HINT_FUNC
+	UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC
 .endm
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 58d125ed9d1a..1bf568d901b1 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -277,7 +277,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
 	restore_mcount_regs 8
 	/* Restore flags */
 	popfq
-	UNWIND_HINT_RET_OFFSET
+	UNWIND_HINT_FUNC
 	jmp	ftrace_epilogue
 
 SYM_FUNC_END(ftrace_regs_caller)
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index b4c43a9b1483..f6fb1d218dcc 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -28,7 +28,7 @@ SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg)
 	jmp	.Lspec_trap_\@
 .Ldo_rop_\@:
 	mov	%\reg, (%_ASM_SP)
-	UNWIND_HINT_RET_OFFSET
+	UNWIND_HINT_FUNC
 	ret
 SYM_FUNC_END(__x86_retpoline_\reg)
 
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index add1c6eb157e..7e72d975cb76 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -29,11 +29,14 @@ struct unwind_hint {
  *
  * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
  * sp_reg+sp_offset points to the iret return frame.
+ *
+ * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
+ * Useful for code which doesn't have an ELF function annotation.
  */
 #define UNWIND_HINT_TYPE_CALL		0
 #define UNWIND_HINT_TYPE_REGS		1
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
-#define UNWIND_HINT_TYPE_RET_OFFSET	3
+#define UNWIND_HINT_TYPE_FUNC		3
 
 #ifdef CONFIG_STACK_VALIDATION
 
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
index add1c6eb157e..7e72d975cb76 100644
--- a/tools/include/linux/objtool.h
+++ b/tools/include/linux/objtool.h
@@ -29,11 +29,14 @@ struct unwind_hint {
  *
  * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
  * sp_reg+sp_offset points to the iret return frame.
+ *
+ * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
+ * Useful for code which doesn't have an ELF function annotation.
  */
 #define UNWIND_HINT_TYPE_CALL		0
 #define UNWIND_HINT_TYPE_REGS		1
 #define UNWIND_HINT_TYPE_REGS_PARTIAL	2
-#define UNWIND_HINT_TYPE_RET_OFFSET	3
+#define UNWIND_HINT_TYPE_FUNC		3
 
 #ifdef CONFIG_STACK_VALIDATION
 
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 6baa22732ca6..9637e3bf5ab8 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -563,8 +563,8 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state)
 	state->cfa.offset = 8;
 
 	/* initial RA (return address) */
-	state->regs[16].base = CFI_CFA;
-	state->regs[16].offset = -8;
+	state->regs[CFI_RA].base = CFI_CFA;
+	state->regs[CFI_RA].offset = -8;
 }
 
 const char *arch_nop_insn(int len)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index b4e1655017de..f88f20327bf2 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1404,13 +1404,20 @@ static int add_jump_table_alts(struct objtool_file *file)
 	return 0;
 }
 
+static void set_func_state(struct cfi_state *state)
+{
+	state->cfa = initial_func_cfi.cfa;
+	memcpy(&state->regs, &initial_func_cfi.regs,
+	       CFI_NUM_REGS * sizeof(struct cfi_reg));
+	state->stack_size = initial_func_cfi.cfa.offset;
+}
+
 static int read_unwind_hints(struct objtool_file *file)
 {
 	struct section *sec, *relocsec;
 	struct reloc *reloc;
 	struct unwind_hint *hint;
 	struct instruction *insn;
-	struct cfi_reg *cfa;
 	int i;
 
 	sec = find_section_by_name(file->elf, ".discard.unwind_hints");
@@ -1445,22 +1452,20 @@ static int read_unwind_hints(struct objtool_file *file)
 			return -1;
 		}
 
-		cfa = &insn->cfi.cfa;
+		insn->hint = true;
 
-		if (hint->type == UNWIND_HINT_TYPE_RET_OFFSET) {
-			insn->ret_offset = bswap_if_needed(hint->sp_offset);
+		if (hint->type == UNWIND_HINT_TYPE_FUNC) {
+			set_func_state(&insn->cfi);
 			continue;
 		}
 
-		insn->hint = true;
-
 		if (arch_decode_hint_reg(insn, hint->sp_reg)) {
 			WARN_FUNC("unsupported unwind_hint sp base reg %d",
 				  insn->sec, insn->offset, hint->sp_reg);
 			return -1;
 		}
 
-		cfa->offset = bswap_if_needed(hint->sp_offset);
+		insn->cfi.cfa.offset = bswap_if_needed(hint->sp_offset);
 		insn->cfi.type = hint->type;
 		insn->cfi.end = hint->end;
 	}
@@ -1716,27 +1721,18 @@ static bool is_fentry_call(struct instruction *insn)
 
 static bool has_modified_stack_frame(struct instruction *insn, struct insn_state *state)
 {
-	u8 ret_offset = insn->ret_offset;
 	struct cfi_state *cfi = &state->cfi;
 	int i;
 
 	if (cfi->cfa.base != initial_func_cfi.cfa.base || cfi->drap)
 		return true;
 
-	if (cfi->cfa.offset != initial_func_cfi.cfa.offset + ret_offset)
+	if (cfi->cfa.offset != initial_func_cfi.cfa.offset)
 		return true;
 
-	if (cfi->stack_size != initial_func_cfi.cfa.offset + ret_offset)
+	if (cfi->stack_size != initial_func_cfi.cfa.offset)
 		return true;
 
-	/*
-	 * If there is a ret offset hint then don't check registers
-	 * because a callee-saved register might have been pushed on
-	 * the stack.
-	 */
-	if (ret_offset)
-		return false;
-
 	for (i = 0; i < CFI_NUM_REGS; i++) {
 		if (cfi->regs[i].base != initial_func_cfi.regs[i].base ||
 		    cfi->regs[i].offset != initial_func_cfi.regs[i].offset)
@@ -2880,10 +2876,7 @@ static int validate_section(struct objtool_file *file, struct section *sec)
 			continue;
 
 		init_insn_state(&state, sec);
-		state.cfi.cfa = initial_func_cfi.cfa;
-		memcpy(&state.cfi.regs, &initial_func_cfi.regs,
-		       CFI_NUM_REGS * sizeof(struct cfi_reg));
-		state.cfi.stack_size = initial_func_cfi.cfa.offset;
+		set_func_state(&state.cfi);
 
 		warnings += validate_symbol(file, sec, func, &state);
 	}
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index b408636c0201..4891ead0e85f 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -50,7 +50,6 @@ struct instruction {
 	bool retpoline_safe;
 	s8 instr;
 	u8 visited;
-	u8 ret_offset;
 	struct alt_group *alt_group;
 	struct symbol *call_dest;
 	struct instruction *jump_dest;
-- 
cgit v1.2.3


From d07b6621d948944c6828fc9b5cbdcb6f389b3b04 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Mon, 18 Jan 2021 09:15:48 -0300
Subject: dmaengine: imx-sdma: Remove platform data support

Since 5.10-rc1, i.MX has been converted to a devicetree-only platform.

The platform data support in this driver was only used for non-DT
platforms.

Remove the platform data support as it has no more users.

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Link: https://lore.kernel.org/r/20210118121549.1625217-1-festevam@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-sdma.c                     | 35 +++++++++---------------------
 include/linux/platform_data/dma-imx-sdma.h | 11 ----------
 2 files changed, 10 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 41ba21eea7c8..a68950f80635 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -1961,7 +1961,6 @@ static int sdma_probe(struct platform_device *pdev)
 	int irq;
 	struct resource *iores;
 	struct resource spba_res;
-	struct sdma_platform_data *pdata = dev_get_platdata(&pdev->dev);
 	int i;
 	struct sdma_engine *sdma;
 	s32 *saddr_arr;
@@ -2063,8 +2062,6 @@ static int sdma_probe(struct platform_device *pdev)
 
 	if (sdma->drvdata->script_addrs)
 		sdma_add_scripts(sdma, sdma->drvdata->script_addrs);
-	if (pdata && pdata->script_addrs)
-		sdma_add_scripts(sdma, pdata->script_addrs);
 
 	sdma->dma_device.dev = &pdev->dev;
 
@@ -2110,30 +2107,18 @@ static int sdma_probe(struct platform_device *pdev)
 	}
 
 	/*
-	 * Kick off firmware loading as the very last step:
-	 * attempt to load firmware only if we're not on the error path, because
-	 * the firmware callback requires a fully functional and allocated sdma
-	 * instance.
+	 * Because that device tree does not encode ROM script address,
+	 * the RAM script in firmware is mandatory for device tree
+	 * probe, otherwise it fails.
 	 */
-	if (pdata) {
-		ret = sdma_get_firmware(sdma, pdata->fw_name);
-		if (ret)
-			dev_warn(&pdev->dev, "failed to get firmware from platform data\n");
+	ret = of_property_read_string(np, "fsl,sdma-ram-script-name",
+				      &fw_name);
+	if (ret) {
+		dev_warn(&pdev->dev, "failed to get firmware name\n");
 	} else {
-		/*
-		 * Because that device tree does not encode ROM script address,
-		 * the RAM script in firmware is mandatory for device tree
-		 * probe, otherwise it fails.
-		 */
-		ret = of_property_read_string(np, "fsl,sdma-ram-script-name",
-					      &fw_name);
-		if (ret) {
-			dev_warn(&pdev->dev, "failed to get firmware name\n");
-		} else {
-			ret = sdma_get_firmware(sdma, fw_name);
-			if (ret)
-				dev_warn(&pdev->dev, "failed to get firmware from device tree\n");
-		}
+		ret = sdma_get_firmware(sdma, fw_name);
+		if (ret)
+			dev_warn(&pdev->dev, "failed to get firmware from device tree\n");
 	}
 
 	return 0;
diff --git a/include/linux/platform_data/dma-imx-sdma.h b/include/linux/platform_data/dma-imx-sdma.h
index 30e676b36b24..725602d9df91 100644
--- a/include/linux/platform_data/dma-imx-sdma.h
+++ b/include/linux/platform_data/dma-imx-sdma.h
@@ -57,15 +57,4 @@ struct sdma_script_start_addrs {
 	/* End of v4 array */
 };
 
-/**
- * struct sdma_platform_data - platform specific data for SDMA engine
- *
- * @fw_name		The firmware name
- * @script_addrs	SDMA scripts addresses in SDMA ROM
- */
-struct sdma_platform_data {
-	char *fw_name;
-	struct sdma_script_start_addrs *script_addrs;
-};
-
 #endif /* __MACH_MXC_SDMA_H__ */
-- 
cgit v1.2.3


From ec6ab42f5aadd765b0b8c4e2d21508ac1e20f2ed Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 20 Jan 2021 14:18:57 +0100
Subject: dmaengine: remove sirfsoc driver

The CSR SiRF prima2/atlas platforms are getting removed, so this driver
is no longer needed.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Barry Song <baohua@kernel.org>
Cc: Barry Song <baohua@kernel.org>
Link: https://lore.kernel.org/r/20210120131859.2056308-2-arnd@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../devicetree/bindings/dma/sirfsoc-dma.txt        |   44 -
 drivers/dma/Kconfig                                |    7 -
 drivers/dma/Makefile                               |    1 -
 drivers/dma/sirf-dma.c                             | 1170 --------------------
 include/linux/sirfsoc_dma.h                        |    7 -
 5 files changed, 1229 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/dma/sirfsoc-dma.txt
 delete mode 100644 drivers/dma/sirf-dma.c
 delete mode 100644 include/linux/sirfsoc_dma.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt b/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt
deleted file mode 100644
index ccd52d6a231a..000000000000
--- a/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-* CSR SiRFSoC DMA controller
-
-See dma.txt first
-
-Required properties:
-- compatible: Should be "sirf,prima2-dmac", "sirf,atlas7-dmac" or
-  "sirf,atlas7-dmac-v2"
-- reg: Should contain DMA registers location and length.
-- interrupts: Should contain one interrupt shared by all channel
-- #dma-cells: must be <1>. used to represent the number of integer
-    cells in the dmas property of client device.
-- clocks: clock required
-
-Example:
-
-Controller:
-dmac0: dma-controller@b00b0000 {
-	compatible = "sirf,prima2-dmac";
-	reg = <0xb00b0000 0x10000>;
-	interrupts = <12>;
-	clocks = <&clks 24>;
-	#dma-cells = <1>;
-};
-
-
-Client:
-Fill the specific dma request line in dmas. In the below example, spi0 read
-channel request line is 9 of the 2nd dma controller, while write channel uses
-4 of the 2nd dma controller; spi1 read channel request line is 12 of the 1st
-dma controller, while write channel uses 13 of the 1st dma controller:
-
-spi0: spi@b00d0000 {
-	compatible = "sirf,prima2-spi";
-	dmas = <&dmac1 9>,
-		<&dmac1 4>;
-	dma-names = "rx", "tx";
-};
-
-spi1: spi@b0170000 {
-	compatible = "sirf,prima2-spi";
-	dmas = <&dmac0 12>,
-		<&dmac0 13>;
-	dma-names = "rx", "tx";
-};
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 6c92f075f7ce..2201ff280f7a 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -519,13 +519,6 @@ config PLX_DMA
 	  These are exposed via extra functions on the switch's
 	  upstream port. Each function exposes one DMA channel.
 
-config SIRF_DMA
-	tristate "CSR SiRFprimaII/SiRFmarco DMA support"
-	depends on ARCH_SIRF
-	select DMA_ENGINE
-	help
-	  Enable support for the CSR SiRFprimaII DMA engine.
-
 config STE_DMA40
 	bool "ST-Ericsson DMA40 support"
 	depends on ARCH_U8500
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 649a4f95ea4b..b5b34c65b075 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -65,7 +65,6 @@ obj-$(CONFIG_PPC_BESTCOMM) += bestcomm/
 obj-$(CONFIG_PXA_DMA) += pxa_dma.o
 obj-$(CONFIG_RENESAS_DMA) += sh/
 obj-$(CONFIG_SF_PDMA) += sf-pdma/
-obj-$(CONFIG_SIRF_DMA) += sirf-dma.o
 obj-$(CONFIG_STE_DMA40) += ste_dma40.o ste_dma40_ll.o
 obj-$(CONFIG_STM32_DMA) += stm32-dma.o
 obj-$(CONFIG_STM32_DMAMUX) += stm32-dmamux.o
diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
deleted file mode 100644
index a5c2843384fd..000000000000
--- a/drivers/dma/sirf-dma.c
+++ /dev/null
@@ -1,1170 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * DMA controller driver for CSR SiRFprimaII
- *
- * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company.
- */
-
-#include <linux/module.h>
-#include <linux/dmaengine.h>
-#include <linux/dma-mapping.h>
-#include <linux/pm_runtime.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/of_irq.h>
-#include <linux/of_address.h>
-#include <linux/of_device.h>
-#include <linux/of_platform.h>
-#include <linux/clk.h>
-#include <linux/of_dma.h>
-#include <linux/sirfsoc_dma.h>
-
-#include "dmaengine.h"
-
-#define SIRFSOC_DMA_VER_A7V1                    1
-#define SIRFSOC_DMA_VER_A7V2                    2
-#define SIRFSOC_DMA_VER_A6                      4
-
-#define SIRFSOC_DMA_DESCRIPTORS                 16
-#define SIRFSOC_DMA_CHANNELS                    16
-#define SIRFSOC_DMA_TABLE_NUM                   256
-
-#define SIRFSOC_DMA_CH_ADDR                     0x00
-#define SIRFSOC_DMA_CH_XLEN                     0x04
-#define SIRFSOC_DMA_CH_YLEN                     0x08
-#define SIRFSOC_DMA_CH_CTRL                     0x0C
-
-#define SIRFSOC_DMA_WIDTH_0                     0x100
-#define SIRFSOC_DMA_CH_VALID                    0x140
-#define SIRFSOC_DMA_CH_INT                      0x144
-#define SIRFSOC_DMA_INT_EN                      0x148
-#define SIRFSOC_DMA_INT_EN_CLR                  0x14C
-#define SIRFSOC_DMA_CH_LOOP_CTRL                0x150
-#define SIRFSOC_DMA_CH_LOOP_CTRL_CLR            0x154
-#define SIRFSOC_DMA_WIDTH_ATLAS7                0x10
-#define SIRFSOC_DMA_VALID_ATLAS7                0x14
-#define SIRFSOC_DMA_INT_ATLAS7                  0x18
-#define SIRFSOC_DMA_INT_EN_ATLAS7               0x1c
-#define SIRFSOC_DMA_LOOP_CTRL_ATLAS7            0x20
-#define SIRFSOC_DMA_CUR_DATA_ADDR               0x34
-#define SIRFSOC_DMA_MUL_ATLAS7                  0x38
-#define SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7         0x158
-#define SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7     0x15C
-#define SIRFSOC_DMA_IOBG_SCMD_EN		0x800
-#define SIRFSOC_DMA_EARLY_RESP_SET		0x818
-#define SIRFSOC_DMA_EARLY_RESP_CLR		0x81C
-
-#define SIRFSOC_DMA_MODE_CTRL_BIT               4
-#define SIRFSOC_DMA_DIR_CTRL_BIT                5
-#define SIRFSOC_DMA_MODE_CTRL_BIT_ATLAS7        2
-#define SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7       3
-#define SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7         4
-#define SIRFSOC_DMA_TAB_NUM_ATLAS7              7
-#define SIRFSOC_DMA_CHAIN_INT_BIT_ATLAS7        5
-#define SIRFSOC_DMA_CHAIN_FLAG_SHIFT_ATLAS7     25
-#define SIRFSOC_DMA_CHAIN_ADDR_SHIFT            32
-
-#define SIRFSOC_DMA_INT_FINI_INT_ATLAS7         BIT(0)
-#define SIRFSOC_DMA_INT_CNT_INT_ATLAS7          BIT(1)
-#define SIRFSOC_DMA_INT_PAU_INT_ATLAS7          BIT(2)
-#define SIRFSOC_DMA_INT_LOOP_INT_ATLAS7         BIT(3)
-#define SIRFSOC_DMA_INT_INV_INT_ATLAS7          BIT(4)
-#define SIRFSOC_DMA_INT_END_INT_ATLAS7          BIT(5)
-#define SIRFSOC_DMA_INT_ALL_ATLAS7              0x3F
-
-/* xlen and dma_width register is in 4 bytes boundary */
-#define SIRFSOC_DMA_WORD_LEN			4
-#define SIRFSOC_DMA_XLEN_MAX_V1         0x800
-#define SIRFSOC_DMA_XLEN_MAX_V2         0x1000
-
-struct sirfsoc_dma_desc {
-	struct dma_async_tx_descriptor	desc;
-	struct list_head		node;
-
-	/* SiRFprimaII 2D-DMA parameters */
-
-	int             xlen;           /* DMA xlen */
-	int             ylen;           /* DMA ylen */
-	int             width;          /* DMA width */
-	int             dir;
-	bool            cyclic;         /* is loop DMA? */
-	bool            chain;          /* is chain DMA? */
-	u32             addr;		/* DMA buffer address */
-	u64 chain_table[SIRFSOC_DMA_TABLE_NUM]; /* chain tbl */
-};
-
-struct sirfsoc_dma_chan {
-	struct dma_chan			chan;
-	struct list_head		free;
-	struct list_head		prepared;
-	struct list_head		queued;
-	struct list_head		active;
-	struct list_head		completed;
-	unsigned long			happened_cyclic;
-	unsigned long			completed_cyclic;
-
-	/* Lock for this structure */
-	spinlock_t			lock;
-
-	int				mode;
-};
-
-struct sirfsoc_dma_regs {
-	u32				ctrl[SIRFSOC_DMA_CHANNELS];
-	u32				interrupt_en;
-};
-
-struct sirfsoc_dma {
-	struct dma_device		dma;
-	struct tasklet_struct		tasklet;
-	struct sirfsoc_dma_chan		channels[SIRFSOC_DMA_CHANNELS];
-	void __iomem			*base;
-	int				irq;
-	struct clk			*clk;
-	int				type;
-	void (*exec_desc)(struct sirfsoc_dma_desc *sdesc,
-		int cid, int burst_mode, void __iomem *base);
-	struct sirfsoc_dma_regs		regs_save;
-};
-
-struct sirfsoc_dmadata {
-	void (*exec)(struct sirfsoc_dma_desc *sdesc,
-		int cid, int burst_mode, void __iomem *base);
-	int type;
-};
-
-enum sirfsoc_dma_chain_flag {
-	SIRFSOC_DMA_CHAIN_NORMAL = 0x01,
-	SIRFSOC_DMA_CHAIN_PAUSE = 0x02,
-	SIRFSOC_DMA_CHAIN_LOOP = 0x03,
-	SIRFSOC_DMA_CHAIN_END = 0x04
-};
-
-#define DRV_NAME	"sirfsoc_dma"
-
-static int sirfsoc_dma_runtime_suspend(struct device *dev);
-
-/* Convert struct dma_chan to struct sirfsoc_dma_chan */
-static inline
-struct sirfsoc_dma_chan *dma_chan_to_sirfsoc_dma_chan(struct dma_chan *c)
-{
-	return container_of(c, struct sirfsoc_dma_chan, chan);
-}
-
-/* Convert struct dma_chan to struct sirfsoc_dma */
-static inline struct sirfsoc_dma *dma_chan_to_sirfsoc_dma(struct dma_chan *c)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(c);
-	return container_of(schan, struct sirfsoc_dma, channels[c->chan_id]);
-}
-
-static void sirfsoc_dma_execute_hw_a7v2(struct sirfsoc_dma_desc *sdesc,
-		int cid, int burst_mode, void __iomem *base)
-{
-	if (sdesc->chain) {
-		/* DMA v2 HW chain mode */
-		writel_relaxed((sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7) |
-			       (sdesc->chain <<
-				SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7) |
-			       (0x8 << SIRFSOC_DMA_TAB_NUM_ATLAS7) | 0x3,
-			       base + SIRFSOC_DMA_CH_CTRL);
-	} else {
-		/* DMA v2 legacy mode */
-		writel_relaxed(sdesc->xlen, base + SIRFSOC_DMA_CH_XLEN);
-		writel_relaxed(sdesc->ylen, base + SIRFSOC_DMA_CH_YLEN);
-		writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_ATLAS7);
-		writel_relaxed((sdesc->width*((sdesc->ylen+1)>>1)),
-				base + SIRFSOC_DMA_MUL_ATLAS7);
-		writel_relaxed((sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7) |
-			       (sdesc->chain <<
-				SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7) |
-			       0x3, base + SIRFSOC_DMA_CH_CTRL);
-	}
-	writel_relaxed(sdesc->chain ? SIRFSOC_DMA_INT_END_INT_ATLAS7 :
-		       (SIRFSOC_DMA_INT_FINI_INT_ATLAS7 |
-			SIRFSOC_DMA_INT_LOOP_INT_ATLAS7),
-		       base + SIRFSOC_DMA_INT_EN_ATLAS7);
-	writel(sdesc->addr, base + SIRFSOC_DMA_CH_ADDR);
-	if (sdesc->cyclic)
-		writel(0x10001, base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
-}
-
-static void sirfsoc_dma_execute_hw_a7v1(struct sirfsoc_dma_desc *sdesc,
-		int cid, int burst_mode, void __iomem *base)
-{
-	writel_relaxed(1, base + SIRFSOC_DMA_IOBG_SCMD_EN);
-	writel_relaxed((1 << cid), base + SIRFSOC_DMA_EARLY_RESP_SET);
-	writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_0 + cid * 4);
-	writel_relaxed(cid | (burst_mode << SIRFSOC_DMA_MODE_CTRL_BIT) |
-		       (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT),
-		       base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL);
-	writel_relaxed(sdesc->xlen, base + cid * 0x10 + SIRFSOC_DMA_CH_XLEN);
-	writel_relaxed(sdesc->ylen, base + cid * 0x10 + SIRFSOC_DMA_CH_YLEN);
-	writel_relaxed(readl_relaxed(base + SIRFSOC_DMA_INT_EN) |
-		       (1 << cid), base + SIRFSOC_DMA_INT_EN);
-	writel(sdesc->addr >> 2, base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR);
-	if (sdesc->cyclic) {
-		writel((1 << cid) | 1 << (cid + 16) |
-		       readl_relaxed(base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7),
-		       base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7);
-	}
-
-}
-
-static void sirfsoc_dma_execute_hw_a6(struct sirfsoc_dma_desc *sdesc,
-		int cid, int burst_mode, void __iomem *base)
-{
-	writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_0 + cid * 4);
-	writel_relaxed(cid | (burst_mode << SIRFSOC_DMA_MODE_CTRL_BIT) |
-		       (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT),
-		       base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL);
-	writel_relaxed(sdesc->xlen, base + cid * 0x10 + SIRFSOC_DMA_CH_XLEN);
-	writel_relaxed(sdesc->ylen, base + cid * 0x10 + SIRFSOC_DMA_CH_YLEN);
-	writel_relaxed(readl_relaxed(base + SIRFSOC_DMA_INT_EN) |
-		       (1 << cid), base + SIRFSOC_DMA_INT_EN);
-	writel(sdesc->addr >> 2, base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR);
-	if (sdesc->cyclic) {
-		writel((1 << cid) | 1 << (cid + 16) |
-		       readl_relaxed(base + SIRFSOC_DMA_CH_LOOP_CTRL),
-		       base + SIRFSOC_DMA_CH_LOOP_CTRL);
-	}
-
-}
-
-/* Execute all queued DMA descriptors */
-static void sirfsoc_dma_execute(struct sirfsoc_dma_chan *schan)
-{
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
-	int cid = schan->chan.chan_id;
-	struct sirfsoc_dma_desc *sdesc = NULL;
-	void __iomem *base;
-
-	/*
-	 * lock has been held by functions calling this, so we don't hold
-	 * lock again
-	 */
-	base = sdma->base;
-	sdesc = list_first_entry(&schan->queued, struct sirfsoc_dma_desc,
-				 node);
-	/* Move the first queued descriptor to active list */
-	list_move_tail(&sdesc->node, &schan->active);
-
-	if (sdma->type == SIRFSOC_DMA_VER_A7V2)
-		cid = 0;
-
-	/* Start the DMA transfer */
-	sdma->exec_desc(sdesc, cid, schan->mode, base);
-
-	if (sdesc->cyclic)
-		schan->happened_cyclic = schan->completed_cyclic = 0;
-}
-
-/* Interrupt handler */
-static irqreturn_t sirfsoc_dma_irq(int irq, void *data)
-{
-	struct sirfsoc_dma *sdma = data;
-	struct sirfsoc_dma_chan *schan;
-	struct sirfsoc_dma_desc *sdesc = NULL;
-	u32 is;
-	bool chain;
-	int ch;
-	void __iomem *reg;
-
-	switch (sdma->type) {
-	case SIRFSOC_DMA_VER_A6:
-	case SIRFSOC_DMA_VER_A7V1:
-		is = readl(sdma->base + SIRFSOC_DMA_CH_INT);
-		reg = sdma->base + SIRFSOC_DMA_CH_INT;
-		while ((ch = fls(is) - 1) >= 0) {
-			is &= ~(1 << ch);
-			writel_relaxed(1 << ch, reg);
-			schan = &sdma->channels[ch];
-			spin_lock(&schan->lock);
-			sdesc = list_first_entry(&schan->active,
-						 struct sirfsoc_dma_desc, node);
-			if (!sdesc->cyclic) {
-				/* Execute queued descriptors */
-				list_splice_tail_init(&schan->active,
-						      &schan->completed);
-				dma_cookie_complete(&sdesc->desc);
-				if (!list_empty(&schan->queued))
-					sirfsoc_dma_execute(schan);
-			} else
-				schan->happened_cyclic++;
-			spin_unlock(&schan->lock);
-		}
-		break;
-
-	case SIRFSOC_DMA_VER_A7V2:
-		is = readl(sdma->base + SIRFSOC_DMA_INT_ATLAS7);
-
-		reg = sdma->base + SIRFSOC_DMA_INT_ATLAS7;
-		writel_relaxed(SIRFSOC_DMA_INT_ALL_ATLAS7, reg);
-		schan = &sdma->channels[0];
-		spin_lock(&schan->lock);
-		sdesc = list_first_entry(&schan->active,
-					 struct sirfsoc_dma_desc, node);
-		if (!sdesc->cyclic) {
-			chain = sdesc->chain;
-			if ((chain && (is & SIRFSOC_DMA_INT_END_INT_ATLAS7)) ||
-				(!chain &&
-				(is & SIRFSOC_DMA_INT_FINI_INT_ATLAS7))) {
-				/* Execute queued descriptors */
-				list_splice_tail_init(&schan->active,
-						      &schan->completed);
-				dma_cookie_complete(&sdesc->desc);
-				if (!list_empty(&schan->queued))
-					sirfsoc_dma_execute(schan);
-			}
-		} else if (sdesc->cyclic && (is &
-					SIRFSOC_DMA_INT_LOOP_INT_ATLAS7))
-			schan->happened_cyclic++;
-
-		spin_unlock(&schan->lock);
-		break;
-
-	default:
-		break;
-	}
-
-	/* Schedule tasklet */
-	tasklet_schedule(&sdma->tasklet);
-
-	return IRQ_HANDLED;
-}
-
-/* process completed descriptors */
-static void sirfsoc_dma_process_completed(struct sirfsoc_dma *sdma)
-{
-	dma_cookie_t last_cookie = 0;
-	struct sirfsoc_dma_chan *schan;
-	struct sirfsoc_dma_desc *sdesc;
-	struct dma_async_tx_descriptor *desc;
-	unsigned long flags;
-	unsigned long happened_cyclic;
-	LIST_HEAD(list);
-	int i;
-
-	for (i = 0; i < sdma->dma.chancnt; i++) {
-		schan = &sdma->channels[i];
-
-		/* Get all completed descriptors */
-		spin_lock_irqsave(&schan->lock, flags);
-		if (!list_empty(&schan->completed)) {
-			list_splice_tail_init(&schan->completed, &list);
-			spin_unlock_irqrestore(&schan->lock, flags);
-
-			/* Execute callbacks and run dependencies */
-			list_for_each_entry(sdesc, &list, node) {
-				desc = &sdesc->desc;
-
-				dmaengine_desc_get_callback_invoke(desc, NULL);
-				last_cookie = desc->cookie;
-				dma_run_dependencies(desc);
-			}
-
-			/* Free descriptors */
-			spin_lock_irqsave(&schan->lock, flags);
-			list_splice_tail_init(&list, &schan->free);
-			schan->chan.completed_cookie = last_cookie;
-			spin_unlock_irqrestore(&schan->lock, flags);
-		} else {
-			if (list_empty(&schan->active)) {
-				spin_unlock_irqrestore(&schan->lock, flags);
-				continue;
-			}
-
-			/* for cyclic channel, desc is always in active list */
-			sdesc = list_first_entry(&schan->active,
-				struct sirfsoc_dma_desc, node);
-
-			/* cyclic DMA */
-			happened_cyclic = schan->happened_cyclic;
-			spin_unlock_irqrestore(&schan->lock, flags);
-
-			desc = &sdesc->desc;
-			while (happened_cyclic != schan->completed_cyclic) {
-				dmaengine_desc_get_callback_invoke(desc, NULL);
-				schan->completed_cyclic++;
-			}
-		}
-	}
-}
-
-/* DMA Tasklet */
-static void sirfsoc_dma_tasklet(struct tasklet_struct *t)
-{
-	struct sirfsoc_dma *sdma = from_tasklet(sdma, t, tasklet);
-
-	sirfsoc_dma_process_completed(sdma);
-}
-
-/* Submit descriptor to hardware */
-static dma_cookie_t sirfsoc_dma_tx_submit(struct dma_async_tx_descriptor *txd)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(txd->chan);
-	struct sirfsoc_dma_desc *sdesc;
-	unsigned long flags;
-	dma_cookie_t cookie;
-
-	sdesc = container_of(txd, struct sirfsoc_dma_desc, desc);
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	/* Move descriptor to queue */
-	list_move_tail(&sdesc->node, &schan->queued);
-
-	cookie = dma_cookie_assign(txd);
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return cookie;
-}
-
-static int sirfsoc_dma_slave_config(struct dma_chan *chan,
-				    struct dma_slave_config *config)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	unsigned long flags;
-
-	if ((config->src_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES) ||
-		(config->dst_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES))
-		return -EINVAL;
-
-	spin_lock_irqsave(&schan->lock, flags);
-	schan->mode = (config->src_maxburst == 4 ? 1 : 0);
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return 0;
-}
-
-static int sirfsoc_dma_terminate_all(struct dma_chan *chan)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
-	int cid = schan->chan.chan_id;
-	unsigned long flags;
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	switch (sdma->type) {
-	case SIRFSOC_DMA_VER_A7V1:
-		writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_INT_EN_CLR);
-		writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_INT);
-		writel_relaxed((1 << cid) | 1 << (cid + 16),
-			       sdma->base +
-			       SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7);
-		writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID);
-		break;
-	case SIRFSOC_DMA_VER_A7V2:
-		writel_relaxed(0, sdma->base + SIRFSOC_DMA_INT_EN_ATLAS7);
-		writel_relaxed(SIRFSOC_DMA_INT_ALL_ATLAS7,
-			       sdma->base + SIRFSOC_DMA_INT_ATLAS7);
-		writel_relaxed(0, sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
-		writel_relaxed(0, sdma->base + SIRFSOC_DMA_VALID_ATLAS7);
-		break;
-	case SIRFSOC_DMA_VER_A6:
-		writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) &
-			       ~(1 << cid), sdma->base + SIRFSOC_DMA_INT_EN);
-		writel_relaxed(readl_relaxed(sdma->base +
-					     SIRFSOC_DMA_CH_LOOP_CTRL) &
-			       ~((1 << cid) | 1 << (cid + 16)),
-			       sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
-		writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID);
-		break;
-	default:
-		break;
-	}
-
-	list_splice_tail_init(&schan->active, &schan->free);
-	list_splice_tail_init(&schan->queued, &schan->free);
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return 0;
-}
-
-static int sirfsoc_dma_pause_chan(struct dma_chan *chan)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
-	int cid = schan->chan.chan_id;
-	unsigned long flags;
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	switch (sdma->type) {
-	case SIRFSOC_DMA_VER_A7V1:
-		writel_relaxed((1 << cid) | 1 << (cid + 16),
-			       sdma->base +
-			       SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7);
-		break;
-	case SIRFSOC_DMA_VER_A7V2:
-		writel_relaxed(0, sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
-		break;
-	case SIRFSOC_DMA_VER_A6:
-		writel_relaxed(readl_relaxed(sdma->base +
-					     SIRFSOC_DMA_CH_LOOP_CTRL) &
-			       ~((1 << cid) | 1 << (cid + 16)),
-			       sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
-		break;
-
-	default:
-		break;
-	}
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return 0;
-}
-
-static int sirfsoc_dma_resume_chan(struct dma_chan *chan)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
-	int cid = schan->chan.chan_id;
-	unsigned long flags;
-
-	spin_lock_irqsave(&schan->lock, flags);
-	switch (sdma->type) {
-	case SIRFSOC_DMA_VER_A7V1:
-		writel_relaxed((1 << cid) | 1 << (cid + 16),
-			       sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7);
-		break;
-	case SIRFSOC_DMA_VER_A7V2:
-		writel_relaxed(0x10001,
-			       sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7);
-		break;
-	case SIRFSOC_DMA_VER_A6:
-		writel_relaxed(readl_relaxed(sdma->base +
-					     SIRFSOC_DMA_CH_LOOP_CTRL) |
-			       ((1 << cid) | 1 << (cid + 16)),
-			       sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
-		break;
-
-	default:
-		break;
-	}
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return 0;
-}
-
-/* Alloc channel resources */
-static int sirfsoc_dma_alloc_chan_resources(struct dma_chan *chan)
-{
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma_desc *sdesc;
-	unsigned long flags;
-	LIST_HEAD(descs);
-	int i;
-
-	pm_runtime_get_sync(sdma->dma.dev);
-
-	/* Alloc descriptors for this channel */
-	for (i = 0; i < SIRFSOC_DMA_DESCRIPTORS; i++) {
-		sdesc = kzalloc(sizeof(*sdesc), GFP_KERNEL);
-		if (!sdesc) {
-			dev_notice(sdma->dma.dev, "Memory allocation error. "
-				"Allocated only %u descriptors\n", i);
-			break;
-		}
-
-		dma_async_tx_descriptor_init(&sdesc->desc, chan);
-		sdesc->desc.flags = DMA_CTRL_ACK;
-		sdesc->desc.tx_submit = sirfsoc_dma_tx_submit;
-
-		list_add_tail(&sdesc->node, &descs);
-	}
-
-	/* Return error only if no descriptors were allocated */
-	if (i == 0)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	list_splice_tail_init(&descs, &schan->free);
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return i;
-}
-
-/* Free channel resources */
-static void sirfsoc_dma_free_chan_resources(struct dma_chan *chan)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
-	struct sirfsoc_dma_desc *sdesc, *tmp;
-	unsigned long flags;
-	LIST_HEAD(descs);
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	/* Channel must be idle */
-	BUG_ON(!list_empty(&schan->prepared));
-	BUG_ON(!list_empty(&schan->queued));
-	BUG_ON(!list_empty(&schan->active));
-	BUG_ON(!list_empty(&schan->completed));
-
-	/* Move data */
-	list_splice_tail_init(&schan->free, &descs);
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	/* Free descriptors */
-	list_for_each_entry_safe(sdesc, tmp, &descs, node)
-		kfree(sdesc);
-
-	pm_runtime_put(sdma->dma.dev);
-}
-
-/* Send pending descriptor to hardware */
-static void sirfsoc_dma_issue_pending(struct dma_chan *chan)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	unsigned long flags;
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	if (list_empty(&schan->active) && !list_empty(&schan->queued))
-		sirfsoc_dma_execute(schan);
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-}
-
-/* Check request completion status */
-static enum dma_status
-sirfsoc_dma_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
-	struct dma_tx_state *txstate)
-{
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	unsigned long flags;
-	enum dma_status ret;
-	struct sirfsoc_dma_desc *sdesc;
-	int cid = schan->chan.chan_id;
-	unsigned long dma_pos;
-	unsigned long dma_request_bytes;
-	unsigned long residue;
-
-	spin_lock_irqsave(&schan->lock, flags);
-
-	if (list_empty(&schan->active)) {
-		ret = dma_cookie_status(chan, cookie, txstate);
-		dma_set_residue(txstate, 0);
-		spin_unlock_irqrestore(&schan->lock, flags);
-		return ret;
-	}
-	sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc, node);
-	if (sdesc->cyclic)
-		dma_request_bytes = (sdesc->xlen + 1) * (sdesc->ylen + 1) *
-			(sdesc->width * SIRFSOC_DMA_WORD_LEN);
-	else
-		dma_request_bytes = sdesc->xlen * SIRFSOC_DMA_WORD_LEN;
-
-	ret = dma_cookie_status(chan, cookie, txstate);
-
-	if (sdma->type == SIRFSOC_DMA_VER_A7V2)
-		cid = 0;
-
-	if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
-		dma_pos = readl_relaxed(sdma->base + SIRFSOC_DMA_CUR_DATA_ADDR);
-	} else {
-		dma_pos = readl_relaxed(
-			sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR) << 2;
-	}
-
-	residue = dma_request_bytes - (dma_pos - sdesc->addr);
-	dma_set_residue(txstate, residue);
-
-	spin_unlock_irqrestore(&schan->lock, flags);
-
-	return ret;
-}
-
-static struct dma_async_tx_descriptor *sirfsoc_dma_prep_interleaved(
-	struct dma_chan *chan, struct dma_interleaved_template *xt,
-	unsigned long flags)
-{
-	struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma_desc *sdesc = NULL;
-	unsigned long iflags;
-	int ret;
-
-	if ((xt->dir != DMA_MEM_TO_DEV) && (xt->dir != DMA_DEV_TO_MEM)) {
-		ret = -EINVAL;
-		goto err_dir;
-	}
-
-	/* Get free descriptor */
-	spin_lock_irqsave(&schan->lock, iflags);
-	if (!list_empty(&schan->free)) {
-		sdesc = list_first_entry(&schan->free, struct sirfsoc_dma_desc,
-			node);
-		list_del(&sdesc->node);
-	}
-	spin_unlock_irqrestore(&schan->lock, iflags);
-
-	if (!sdesc) {
-		/* try to free completed descriptors */
-		sirfsoc_dma_process_completed(sdma);
-		ret = 0;
-		goto no_desc;
-	}
-
-	/* Place descriptor in prepared list */
-	spin_lock_irqsave(&schan->lock, iflags);
-
-	/*
-	 * Number of chunks in a frame can only be 1 for prima2
-	 * and ylen (number of frame - 1) must be at least 0
-	 */
-	if ((xt->frame_size == 1) && (xt->numf > 0)) {
-		sdesc->cyclic = 0;
-		sdesc->xlen = xt->sgl[0].size / SIRFSOC_DMA_WORD_LEN;
-		sdesc->width = (xt->sgl[0].size + xt->sgl[0].icg) /
-				SIRFSOC_DMA_WORD_LEN;
-		sdesc->ylen = xt->numf - 1;
-		if (xt->dir == DMA_MEM_TO_DEV) {
-			sdesc->addr = xt->src_start;
-			sdesc->dir = 1;
-		} else {
-			sdesc->addr = xt->dst_start;
-			sdesc->dir = 0;
-		}
-
-		list_add_tail(&sdesc->node, &schan->prepared);
-	} else {
-		pr_err("sirfsoc DMA Invalid xfer\n");
-		ret = -EINVAL;
-		goto err_xfer;
-	}
-	spin_unlock_irqrestore(&schan->lock, iflags);
-
-	return &sdesc->desc;
-err_xfer:
-	spin_unlock_irqrestore(&schan->lock, iflags);
-no_desc:
-err_dir:
-	return ERR_PTR(ret);
-}
-
-static struct dma_async_tx_descriptor *
-sirfsoc_dma_prep_cyclic(struct dma_chan *chan, dma_addr_t addr,
-	size_t buf_len, size_t period_len,
-	enum dma_transfer_direction direction, unsigned long flags)
-{
-	struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
-	struct sirfsoc_dma_desc *sdesc = NULL;
-	unsigned long iflags;
-
-	/*
-	 * we only support cycle transfer with 2 period
-	 * If the X-length is set to 0, it would be the loop mode.
-	 * The DMA address keeps increasing until reaching the end of a loop
-	 * area whose size is defined by (DMA_WIDTH x (Y_LENGTH + 1)). Then
-	 * the DMA address goes back to the beginning of this area.
-	 * In loop mode, the DMA data region is divided into two parts, BUFA
-	 * and BUFB. DMA controller generates interrupts twice in each loop:
-	 * when the DMA address reaches the end of BUFA or the end of the
-	 * BUFB
-	 */
-	if (buf_len !=  2 * period_len)
-		return ERR_PTR(-EINVAL);
-
-	/* Get free descriptor */
-	spin_lock_irqsave(&schan->lock, iflags);
-	if (!list_empty(&schan->free)) {
-		sdesc = list_first_entry(&schan->free, struct sirfsoc_dma_desc,
-			node);
-		list_del(&sdesc->node);
-	}
-	spin_unlock_irqrestore(&schan->lock, iflags);
-
-	if (!sdesc)
-		return NULL;
-
-	/* Place descriptor in prepared list */
-	spin_lock_irqsave(&schan->lock, iflags);
-	sdesc->addr = addr;
-	sdesc->cyclic = 1;
-	sdesc->xlen = 0;
-	sdesc->ylen = buf_len / SIRFSOC_DMA_WORD_LEN - 1;
-	sdesc->width = 1;
-	list_add_tail(&sdesc->node, &schan->prepared);
-	spin_unlock_irqrestore(&schan->lock, iflags);
-
-	return &sdesc->desc;
-}
-
-/*
- * The DMA controller consists of 16 independent DMA channels.
- * Each channel is allocated to a different function
- */
-bool sirfsoc_dma_filter_id(struct dma_chan *chan, void *chan_id)
-{
-	unsigned int ch_nr = (unsigned int) chan_id;
-
-	if (ch_nr == chan->chan_id +
-		chan->device->dev_id * SIRFSOC_DMA_CHANNELS)
-		return true;
-
-	return false;
-}
-EXPORT_SYMBOL(sirfsoc_dma_filter_id);
-
-#define SIRFSOC_DMA_BUSWIDTHS \
-	(BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) | \
-	BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | \
-	BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | \
-	BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) | \
-	BIT(DMA_SLAVE_BUSWIDTH_8_BYTES))
-
-static struct dma_chan *of_dma_sirfsoc_xlate(struct of_phandle_args *dma_spec,
-	struct of_dma *ofdma)
-{
-	struct sirfsoc_dma *sdma = ofdma->of_dma_data;
-	unsigned int request = dma_spec->args[0];
-
-	if (request >= SIRFSOC_DMA_CHANNELS)
-		return NULL;
-
-	return dma_get_slave_channel(&sdma->channels[request].chan);
-}
-
-static int sirfsoc_dma_probe(struct platform_device *op)
-{
-	struct device_node *dn = op->dev.of_node;
-	struct device *dev = &op->dev;
-	struct dma_device *dma;
-	struct sirfsoc_dma *sdma;
-	struct sirfsoc_dma_chan *schan;
-	struct sirfsoc_dmadata *data;
-	struct resource res;
-	ulong regs_start, regs_size;
-	u32 id;
-	int ret, i;
-
-	sdma = devm_kzalloc(dev, sizeof(*sdma), GFP_KERNEL);
-	if (!sdma)
-		return -ENOMEM;
-
-	data = (struct sirfsoc_dmadata *)
-		(of_match_device(op->dev.driver->of_match_table,
-				 &op->dev)->data);
-	sdma->exec_desc = data->exec;
-	sdma->type = data->type;
-
-	if (of_property_read_u32(dn, "cell-index", &id)) {
-		dev_err(dev, "Fail to get DMAC index\n");
-		return -ENODEV;
-	}
-
-	sdma->irq = irq_of_parse_and_map(dn, 0);
-	if (!sdma->irq) {
-		dev_err(dev, "Error mapping IRQ!\n");
-		return -EINVAL;
-	}
-
-	sdma->clk = devm_clk_get(dev, NULL);
-	if (IS_ERR(sdma->clk)) {
-		dev_err(dev, "failed to get a clock.\n");
-		return PTR_ERR(sdma->clk);
-	}
-
-	ret = of_address_to_resource(dn, 0, &res);
-	if (ret) {
-		dev_err(dev, "Error parsing memory region!\n");
-		goto irq_dispose;
-	}
-
-	regs_start = res.start;
-	regs_size = resource_size(&res);
-
-	sdma->base = devm_ioremap(dev, regs_start, regs_size);
-	if (!sdma->base) {
-		dev_err(dev, "Error mapping memory region!\n");
-		ret = -ENOMEM;
-		goto irq_dispose;
-	}
-
-	ret = request_irq(sdma->irq, &sirfsoc_dma_irq, 0, DRV_NAME, sdma);
-	if (ret) {
-		dev_err(dev, "Error requesting IRQ!\n");
-		ret = -EINVAL;
-		goto irq_dispose;
-	}
-
-	dma = &sdma->dma;
-	dma->dev = dev;
-
-	dma->device_alloc_chan_resources = sirfsoc_dma_alloc_chan_resources;
-	dma->device_free_chan_resources = sirfsoc_dma_free_chan_resources;
-	dma->device_issue_pending = sirfsoc_dma_issue_pending;
-	dma->device_config = sirfsoc_dma_slave_config;
-	dma->device_pause = sirfsoc_dma_pause_chan;
-	dma->device_resume = sirfsoc_dma_resume_chan;
-	dma->device_terminate_all = sirfsoc_dma_terminate_all;
-	dma->device_tx_status = sirfsoc_dma_tx_status;
-	dma->device_prep_interleaved_dma = sirfsoc_dma_prep_interleaved;
-	dma->device_prep_dma_cyclic = sirfsoc_dma_prep_cyclic;
-	dma->src_addr_widths = SIRFSOC_DMA_BUSWIDTHS;
-	dma->dst_addr_widths = SIRFSOC_DMA_BUSWIDTHS;
-	dma->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
-
-	INIT_LIST_HEAD(&dma->channels);
-	dma_cap_set(DMA_SLAVE, dma->cap_mask);
-	dma_cap_set(DMA_CYCLIC, dma->cap_mask);
-	dma_cap_set(DMA_INTERLEAVE, dma->cap_mask);
-	dma_cap_set(DMA_PRIVATE, dma->cap_mask);
-
-	for (i = 0; i < SIRFSOC_DMA_CHANNELS; i++) {
-		schan = &sdma->channels[i];
-
-		schan->chan.device = dma;
-		dma_cookie_init(&schan->chan);
-
-		INIT_LIST_HEAD(&schan->free);
-		INIT_LIST_HEAD(&schan->prepared);
-		INIT_LIST_HEAD(&schan->queued);
-		INIT_LIST_HEAD(&schan->active);
-		INIT_LIST_HEAD(&schan->completed);
-
-		spin_lock_init(&schan->lock);
-		list_add_tail(&schan->chan.device_node, &dma->channels);
-	}
-
-	tasklet_setup(&sdma->tasklet, sirfsoc_dma_tasklet);
-
-	/* Register DMA engine */
-	dev_set_drvdata(dev, sdma);
-
-	ret = dma_async_device_register(dma);
-	if (ret)
-		goto free_irq;
-
-	/* Device-tree DMA controller registration */
-	ret = of_dma_controller_register(dn, of_dma_sirfsoc_xlate, sdma);
-	if (ret) {
-		dev_err(dev, "failed to register DMA controller\n");
-		goto unreg_dma_dev;
-	}
-
-	pm_runtime_enable(&op->dev);
-	dev_info(dev, "initialized SIRFSOC DMAC driver\n");
-
-	return 0;
-
-unreg_dma_dev:
-	dma_async_device_unregister(dma);
-free_irq:
-	free_irq(sdma->irq, sdma);
-irq_dispose:
-	irq_dispose_mapping(sdma->irq);
-	return ret;
-}
-
-static int sirfsoc_dma_remove(struct platform_device *op)
-{
-	struct device *dev = &op->dev;
-	struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
-
-	of_dma_controller_free(op->dev.of_node);
-	dma_async_device_unregister(&sdma->dma);
-	free_irq(sdma->irq, sdma);
-	tasklet_kill(&sdma->tasklet);
-	irq_dispose_mapping(sdma->irq);
-	pm_runtime_disable(&op->dev);
-	if (!pm_runtime_status_suspended(&op->dev))
-		sirfsoc_dma_runtime_suspend(&op->dev);
-
-	return 0;
-}
-
-static int __maybe_unused sirfsoc_dma_runtime_suspend(struct device *dev)
-{
-	struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
-
-	clk_disable_unprepare(sdma->clk);
-	return 0;
-}
-
-static int __maybe_unused sirfsoc_dma_runtime_resume(struct device *dev)
-{
-	struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
-	int ret;
-
-	ret = clk_prepare_enable(sdma->clk);
-	if (ret < 0) {
-		dev_err(dev, "clk_enable failed: %d\n", ret);
-		return ret;
-	}
-	return 0;
-}
-
-static int __maybe_unused sirfsoc_dma_pm_suspend(struct device *dev)
-{
-	struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
-	struct sirfsoc_dma_regs *save = &sdma->regs_save;
-	struct sirfsoc_dma_chan *schan;
-	int ch;
-	int ret;
-	int count;
-	u32 int_offset;
-
-	/*
-	 * if we were runtime-suspended before, resume to enable clock
-	 * before accessing register
-	 */
-	if (pm_runtime_status_suspended(dev)) {
-		ret = sirfsoc_dma_runtime_resume(dev);
-		if (ret < 0)
-			return ret;
-	}
-
-	if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
-		count = 1;
-		int_offset = SIRFSOC_DMA_INT_EN_ATLAS7;
-	} else {
-		count = SIRFSOC_DMA_CHANNELS;
-		int_offset = SIRFSOC_DMA_INT_EN;
-	}
-
-	/*
-	 * DMA controller will lose all registers while suspending
-	 * so we need to save registers for active channels
-	 */
-	for (ch = 0; ch < count; ch++) {
-		schan = &sdma->channels[ch];
-		if (list_empty(&schan->active))
-			continue;
-		save->ctrl[ch] = readl_relaxed(sdma->base +
-			ch * 0x10 + SIRFSOC_DMA_CH_CTRL);
-	}
-	save->interrupt_en = readl_relaxed(sdma->base + int_offset);
-
-	/* Disable clock */
-	sirfsoc_dma_runtime_suspend(dev);
-
-	return 0;
-}
-
-static int __maybe_unused sirfsoc_dma_pm_resume(struct device *dev)
-{
-	struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
-	struct sirfsoc_dma_regs *save = &sdma->regs_save;
-	struct sirfsoc_dma_desc *sdesc;
-	struct sirfsoc_dma_chan *schan;
-	int ch;
-	int ret;
-	int count;
-	u32 int_offset;
-	u32 width_offset;
-
-	/* Enable clock before accessing register */
-	ret = sirfsoc_dma_runtime_resume(dev);
-	if (ret < 0)
-		return ret;
-
-	if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
-		count = 1;
-		int_offset = SIRFSOC_DMA_INT_EN_ATLAS7;
-		width_offset = SIRFSOC_DMA_WIDTH_ATLAS7;
-	} else {
-		count = SIRFSOC_DMA_CHANNELS;
-		int_offset = SIRFSOC_DMA_INT_EN;
-		width_offset = SIRFSOC_DMA_WIDTH_0;
-	}
-
-	writel_relaxed(save->interrupt_en, sdma->base + int_offset);
-	for (ch = 0; ch < count; ch++) {
-		schan = &sdma->channels[ch];
-		if (list_empty(&schan->active))
-			continue;
-		sdesc = list_first_entry(&schan->active,
-			struct sirfsoc_dma_desc,
-			node);
-		writel_relaxed(sdesc->width,
-			sdma->base + width_offset + ch * 4);
-		writel_relaxed(sdesc->xlen,
-			sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_XLEN);
-		writel_relaxed(sdesc->ylen,
-			sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_YLEN);
-		writel_relaxed(save->ctrl[ch],
-			sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_CTRL);
-		if (sdma->type == SIRFSOC_DMA_VER_A7V2) {
-			writel_relaxed(sdesc->addr,
-				sdma->base + SIRFSOC_DMA_CH_ADDR);
-		} else {
-			writel_relaxed(sdesc->addr >> 2,
-				sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_ADDR);
-
-		}
-	}
-
-	/* if we were runtime-suspended before, suspend again */
-	if (pm_runtime_status_suspended(dev))
-		sirfsoc_dma_runtime_suspend(dev);
-
-	return 0;
-}
-
-static const struct dev_pm_ops sirfsoc_dma_pm_ops = {
-	SET_RUNTIME_PM_OPS(sirfsoc_dma_runtime_suspend, sirfsoc_dma_runtime_resume, NULL)
-	SET_SYSTEM_SLEEP_PM_OPS(sirfsoc_dma_pm_suspend, sirfsoc_dma_pm_resume)
-};
-
-static struct sirfsoc_dmadata sirfsoc_dmadata_a6 = {
-	.exec = sirfsoc_dma_execute_hw_a6,
-	.type = SIRFSOC_DMA_VER_A6,
-};
-
-static struct sirfsoc_dmadata sirfsoc_dmadata_a7v1 = {
-	.exec = sirfsoc_dma_execute_hw_a7v1,
-	.type = SIRFSOC_DMA_VER_A7V1,
-};
-
-static struct sirfsoc_dmadata sirfsoc_dmadata_a7v2 = {
-	.exec = sirfsoc_dma_execute_hw_a7v2,
-	.type = SIRFSOC_DMA_VER_A7V2,
-};
-
-static const struct of_device_id sirfsoc_dma_match[] = {
-	{ .compatible = "sirf,prima2-dmac", .data = &sirfsoc_dmadata_a6,},
-	{ .compatible = "sirf,atlas7-dmac", .data = &sirfsoc_dmadata_a7v1,},
-	{ .compatible = "sirf,atlas7-dmac-v2", .data = &sirfsoc_dmadata_a7v2,},
-	{},
-};
-MODULE_DEVICE_TABLE(of, sirfsoc_dma_match);
-
-static struct platform_driver sirfsoc_dma_driver = {
-	.probe		= sirfsoc_dma_probe,
-	.remove		= sirfsoc_dma_remove,
-	.driver = {
-		.name = DRV_NAME,
-		.pm = &sirfsoc_dma_pm_ops,
-		.of_match_table	= sirfsoc_dma_match,
-	},
-};
-
-static __init int sirfsoc_dma_init(void)
-{
-	return platform_driver_register(&sirfsoc_dma_driver);
-}
-
-static void __exit sirfsoc_dma_exit(void)
-{
-	platform_driver_unregister(&sirfsoc_dma_driver);
-}
-
-subsys_initcall(sirfsoc_dma_init);
-module_exit(sirfsoc_dma_exit);
-
-MODULE_AUTHOR("Rongjun Ying <rongjun.ying@csr.com>");
-MODULE_AUTHOR("Barry Song <baohua.song@csr.com>");
-MODULE_DESCRIPTION("SIRFSOC DMA control driver");
-MODULE_LICENSE("GPL v2");
diff --git a/include/linux/sirfsoc_dma.h b/include/linux/sirfsoc_dma.h
deleted file mode 100644
index 50161b6afb61..000000000000
--- a/include/linux/sirfsoc_dma.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _SIRFSOC_DMA_H_
-#define _SIRFSOC_DMA_H_
-
-bool sirfsoc_dma_filter_id(struct dma_chan *chan, void *chan_id);
-
-#endif
-- 
cgit v1.2.3


From a033a74e8b66336fc2ea379842be6bcf176cbfbc Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 20 Jan 2021 14:18:59 +0100
Subject: dmaengine: remove coh901318 driver

The ST-Ericsson U300 platform is getting removed, so this driver is no
longer needed.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210120131859.2056308-4-arnd@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../devicetree/bindings/dma/ste-coh901318.txt      |   32 -
 drivers/dma/Kconfig                                |    7 -
 drivers/dma/Makefile                               |    1 -
 drivers/dma/coh901318.c                            | 2808 --------------------
 drivers/dma/coh901318.h                            |  141 -
 drivers/dma/coh901318_lli.c                        |  313 ---
 include/linux/platform_data/dma-coh901318.h        |   72 -
 7 files changed, 3374 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/dma/ste-coh901318.txt
 delete mode 100644 drivers/dma/coh901318.c
 delete mode 100644 drivers/dma/coh901318.h
 delete mode 100644 drivers/dma/coh901318_lli.c
 delete mode 100644 include/linux/platform_data/dma-coh901318.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/dma/ste-coh901318.txt b/Documentation/devicetree/bindings/dma/ste-coh901318.txt
deleted file mode 100644
index 091ad057e9cf..000000000000
--- a/Documentation/devicetree/bindings/dma/ste-coh901318.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-ST-Ericsson COH 901 318 DMA Controller
-
-This is a DMA controller which has begun as a fork of the
-ARM PL08x PrimeCell VHDL code.
-
-Required properties:
-- compatible: should be "stericsson,coh901318"
-- reg: register locations and length
-- interrupts: the single DMA IRQ
-- #dma-cells: must be set to <1>, as the channels on the
-  COH 901 318 are simple and identified by a single number
-- dma-channels: the number of DMA channels handled
-
-Example:
-
-dmac: dma-controller@c00020000 {
-	compatible = "stericsson,coh901318";
-	reg = <0xc0020000 0x1000>;
-	interrupt-parent = <&vica>;
-	interrupts = <2>;
-	#dma-cells = <1>;
-	dma-channels = <40>;
-};
-
-Consumers example:
-
-uart0: serial@c0013000 {
-	compatible = "...";
-	(...)
-	dmas = <&dmac 17 &dmac 18>;
-	dma-names = "tx", "rx";
-};
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 4fd6a90cfadf..725cda4aa2a3 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -124,13 +124,6 @@ config BCM_SBA_RAID
 	  has the capability to offload memcpy, xor and pq computation
 	  for raid5/6.
 
-config COH901318
-	bool "ST-Ericsson COH901318 DMA support"
-	select DMA_ENGINE
-	depends on ARCH_U300 || COMPILE_TEST
-	help
-	  Enable support for ST-Ericsson COH 901 318 DMA.
-
 config DMA_BCM2835
 	tristate "BCM2835 DMA engine support"
 	depends on ARCH_BCM2835
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 4aed5ea59922..aa69094e3547 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -20,7 +20,6 @@ obj-$(CONFIG_AT_HDMAC) += at_hdmac.o
 obj-$(CONFIG_AT_XDMAC) += at_xdmac.o
 obj-$(CONFIG_AXI_DMAC) += dma-axi-dmac.o
 obj-$(CONFIG_BCM_SBA_RAID) += bcm-sba-raid.o
-obj-$(CONFIG_COH901318) += coh901318.o coh901318_lli.o
 obj-$(CONFIG_DMA_BCM2835) += bcm2835-dma.o
 obj-$(CONFIG_DMA_JZ4780) += dma-jz4780.o
 obj-$(CONFIG_DMA_SA11X0) += sa11x0-dma.o
diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c
deleted file mode 100644
index 95b9b2f5358e..000000000000
--- a/drivers/dma/coh901318.c
+++ /dev/null
@@ -1,2808 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * driver/dma/coh901318.c
- *
- * Copyright (C) 2007-2009 ST-Ericsson
- * DMA driver for COH 901 318
- * Author: Per Friden <per.friden@stericsson.com>
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kernel.h> /* printk() */
-#include <linux/fs.h> /* everything... */
-#include <linux/scatterlist.h>
-#include <linux/slab.h> /* kmalloc() */
-#include <linux/dmaengine.h>
-#include <linux/platform_device.h>
-#include <linux/device.h>
-#include <linux/irqreturn.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-#include <linux/debugfs.h>
-#include <linux/platform_data/dma-coh901318.h>
-#include <linux/of_dma.h>
-
-#include "coh901318.h"
-#include "dmaengine.h"
-
-#define COH901318_MOD32_MASK					(0x1F)
-#define COH901318_WORD_MASK					(0xFFFFFFFF)
-/* INT_STATUS - Interrupt Status Registers 32bit (R/-) */
-#define COH901318_INT_STATUS1					(0x0000)
-#define COH901318_INT_STATUS2					(0x0004)
-/* TC_INT_STATUS - Terminal Count Interrupt Status Registers 32bit (R/-) */
-#define COH901318_TC_INT_STATUS1				(0x0008)
-#define COH901318_TC_INT_STATUS2				(0x000C)
-/* TC_INT_CLEAR - Terminal Count Interrupt Clear Registers 32bit (-/W) */
-#define COH901318_TC_INT_CLEAR1					(0x0010)
-#define COH901318_TC_INT_CLEAR2					(0x0014)
-/* RAW_TC_INT_STATUS - Raw Term Count Interrupt Status Registers 32bit (R/-) */
-#define COH901318_RAW_TC_INT_STATUS1				(0x0018)
-#define COH901318_RAW_TC_INT_STATUS2				(0x001C)
-/* BE_INT_STATUS - Bus Error Interrupt Status Registers 32bit (R/-) */
-#define COH901318_BE_INT_STATUS1				(0x0020)
-#define COH901318_BE_INT_STATUS2				(0x0024)
-/* BE_INT_CLEAR - Bus Error Interrupt Clear Registers 32bit (-/W) */
-#define COH901318_BE_INT_CLEAR1					(0x0028)
-#define COH901318_BE_INT_CLEAR2					(0x002C)
-/* RAW_BE_INT_STATUS - Raw Term Count Interrupt Status Registers 32bit (R/-) */
-#define COH901318_RAW_BE_INT_STATUS1				(0x0030)
-#define COH901318_RAW_BE_INT_STATUS2				(0x0034)
-
-/*
- * CX_CFG - Channel Configuration Registers 32bit (R/W)
- */
-#define COH901318_CX_CFG					(0x0100)
-#define COH901318_CX_CFG_SPACING				(0x04)
-/* Channel enable activates tha dma job */
-#define COH901318_CX_CFG_CH_ENABLE				(0x00000001)
-#define COH901318_CX_CFG_CH_DISABLE				(0x00000000)
-/* Request Mode */
-#define COH901318_CX_CFG_RM_MASK				(0x00000006)
-#define COH901318_CX_CFG_RM_MEMORY_TO_MEMORY			(0x0 << 1)
-#define COH901318_CX_CFG_RM_PRIMARY_TO_MEMORY			(0x1 << 1)
-#define COH901318_CX_CFG_RM_MEMORY_TO_PRIMARY			(0x1 << 1)
-#define COH901318_CX_CFG_RM_PRIMARY_TO_SECONDARY		(0x3 << 1)
-#define COH901318_CX_CFG_RM_SECONDARY_TO_PRIMARY		(0x3 << 1)
-/* Linked channel request field. RM must == 11 */
-#define COH901318_CX_CFG_LCRF_SHIFT				3
-#define COH901318_CX_CFG_LCRF_MASK				(0x000001F8)
-#define COH901318_CX_CFG_LCR_DISABLE				(0x00000000)
-/* Terminal Counter Interrupt Request Mask */
-#define COH901318_CX_CFG_TC_IRQ_ENABLE				(0x00000200)
-#define COH901318_CX_CFG_TC_IRQ_DISABLE				(0x00000000)
-/* Bus Error interrupt Mask */
-#define COH901318_CX_CFG_BE_IRQ_ENABLE				(0x00000400)
-#define COH901318_CX_CFG_BE_IRQ_DISABLE				(0x00000000)
-
-/*
- * CX_STAT - Channel Status Registers 32bit (R/-)
- */
-#define COH901318_CX_STAT					(0x0200)
-#define COH901318_CX_STAT_SPACING				(0x04)
-#define COH901318_CX_STAT_RBE_IRQ_IND				(0x00000008)
-#define COH901318_CX_STAT_RTC_IRQ_IND				(0x00000004)
-#define COH901318_CX_STAT_ACTIVE				(0x00000002)
-#define COH901318_CX_STAT_ENABLED				(0x00000001)
-
-/*
- * CX_CTRL - Channel Control Registers 32bit (R/W)
- */
-#define COH901318_CX_CTRL					(0x0400)
-#define COH901318_CX_CTRL_SPACING				(0x10)
-/* Transfer Count Enable */
-#define COH901318_CX_CTRL_TC_ENABLE				(0x00001000)
-#define COH901318_CX_CTRL_TC_DISABLE				(0x00000000)
-/* Transfer Count Value 0 - 4095 */
-#define COH901318_CX_CTRL_TC_VALUE_MASK				(0x00000FFF)
-/* Burst count */
-#define COH901318_CX_CTRL_BURST_COUNT_MASK			(0x0000E000)
-#define COH901318_CX_CTRL_BURST_COUNT_64_BYTES			(0x7 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_48_BYTES			(0x6 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_32_BYTES			(0x5 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_16_BYTES			(0x4 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_8_BYTES			(0x3 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_4_BYTES			(0x2 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_2_BYTES			(0x1 << 13)
-#define COH901318_CX_CTRL_BURST_COUNT_1_BYTE			(0x0 << 13)
-/* Source bus size  */
-#define COH901318_CX_CTRL_SRC_BUS_SIZE_MASK			(0x00030000)
-#define COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS			(0x2 << 16)
-#define COH901318_CX_CTRL_SRC_BUS_SIZE_16_BITS			(0x1 << 16)
-#define COH901318_CX_CTRL_SRC_BUS_SIZE_8_BITS			(0x0 << 16)
-/* Source address increment */
-#define COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE			(0x00040000)
-#define COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE			(0x00000000)
-/* Destination Bus Size */
-#define COH901318_CX_CTRL_DST_BUS_SIZE_MASK			(0x00180000)
-#define COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS			(0x2 << 19)
-#define COH901318_CX_CTRL_DST_BUS_SIZE_16_BITS			(0x1 << 19)
-#define COH901318_CX_CTRL_DST_BUS_SIZE_8_BITS			(0x0 << 19)
-/* Destination address increment */
-#define COH901318_CX_CTRL_DST_ADDR_INC_ENABLE			(0x00200000)
-#define COH901318_CX_CTRL_DST_ADDR_INC_DISABLE			(0x00000000)
-/* Master Mode (Master2 is only connected to MSL) */
-#define COH901318_CX_CTRL_MASTER_MODE_MASK			(0x00C00000)
-#define COH901318_CX_CTRL_MASTER_MODE_M2R_M1W			(0x3 << 22)
-#define COH901318_CX_CTRL_MASTER_MODE_M1R_M2W			(0x2 << 22)
-#define COH901318_CX_CTRL_MASTER_MODE_M2RW			(0x1 << 22)
-#define COH901318_CX_CTRL_MASTER_MODE_M1RW			(0x0 << 22)
-/* Terminal Count flag to PER enable */
-#define COH901318_CX_CTRL_TCP_ENABLE				(0x01000000)
-#define COH901318_CX_CTRL_TCP_DISABLE				(0x00000000)
-/* Terminal Count flags to CPU enable */
-#define COH901318_CX_CTRL_TC_IRQ_ENABLE				(0x02000000)
-#define COH901318_CX_CTRL_TC_IRQ_DISABLE			(0x00000000)
-/* Hand shake to peripheral */
-#define COH901318_CX_CTRL_HSP_ENABLE				(0x04000000)
-#define COH901318_CX_CTRL_HSP_DISABLE				(0x00000000)
-#define COH901318_CX_CTRL_HSS_ENABLE				(0x08000000)
-#define COH901318_CX_CTRL_HSS_DISABLE				(0x00000000)
-/* DMA mode */
-#define COH901318_CX_CTRL_DDMA_MASK				(0x30000000)
-#define COH901318_CX_CTRL_DDMA_LEGACY				(0x0 << 28)
-#define COH901318_CX_CTRL_DDMA_DEMAND_DMA1			(0x1 << 28)
-#define COH901318_CX_CTRL_DDMA_DEMAND_DMA2			(0x2 << 28)
-/* Primary Request Data Destination */
-#define COH901318_CX_CTRL_PRDD_MASK				(0x40000000)
-#define COH901318_CX_CTRL_PRDD_DEST				(0x1 << 30)
-#define COH901318_CX_CTRL_PRDD_SOURCE				(0x0 << 30)
-
-/*
- * CX_SRC_ADDR - Channel Source Address Registers 32bit (R/W)
- */
-#define COH901318_CX_SRC_ADDR					(0x0404)
-#define COH901318_CX_SRC_ADDR_SPACING				(0x10)
-
-/*
- * CX_DST_ADDR - Channel Destination Address Registers 32bit R/W
- */
-#define COH901318_CX_DST_ADDR					(0x0408)
-#define COH901318_CX_DST_ADDR_SPACING				(0x10)
-
-/*
- * CX_LNK_ADDR - Channel Link Address Registers 32bit (R/W)
- */
-#define COH901318_CX_LNK_ADDR					(0x040C)
-#define COH901318_CX_LNK_ADDR_SPACING				(0x10)
-#define COH901318_CX_LNK_LINK_IMMEDIATE				(0x00000001)
-
-/**
- * struct coh901318_params - parameters for DMAC configuration
- * @config: DMA config register
- * @ctrl_lli_last: DMA control register for the last lli in the list
- * @ctrl_lli: DMA control register for an lli
- * @ctrl_lli_chained: DMA control register for a chained lli
- */
-struct coh901318_params {
-	u32 config;
-	u32 ctrl_lli_last;
-	u32 ctrl_lli;
-	u32 ctrl_lli_chained;
-};
-
-/**
- * struct coh_dma_channel - dma channel base
- * @name: ascii name of dma channel
- * @number: channel id number
- * @desc_nbr_max: number of preallocated descriptors
- * @priority_high: prio of channel, 0 low otherwise high.
- * @param: configuration parameters
- */
-struct coh_dma_channel {
-	const char name[32];
-	const int number;
-	const int desc_nbr_max;
-	const int priority_high;
-	const struct coh901318_params param;
-};
-
-/**
- * struct powersave - DMA power save structure
- * @lock: lock protecting data in this struct
- * @started_channels: bit mask indicating active dma channels
- */
-struct powersave {
-	spinlock_t lock;
-	u64 started_channels;
-};
-
-/* points out all dma slave channels.
- * Syntax is [A1, B1, A2, B2, .... ,-1,-1]
- * Select all channels from A to B, end of list is marked with -1,-1
- */
-static int dma_slave_channels[] = {
-	U300_DMA_MSL_TX_0, U300_DMA_SPI_RX,
-	U300_DMA_UART1_TX, U300_DMA_UART1_RX, -1, -1};
-
-/* points out all dma memcpy channels. */
-static int dma_memcpy_channels[] = {
-	U300_DMA_GENERAL_PURPOSE_0, U300_DMA_GENERAL_PURPOSE_8, -1, -1};
-
-#define flags_memcpy_config (COH901318_CX_CFG_CH_DISABLE | \
-			COH901318_CX_CFG_RM_MEMORY_TO_MEMORY | \
-			COH901318_CX_CFG_LCR_DISABLE | \
-			COH901318_CX_CFG_TC_IRQ_ENABLE | \
-			COH901318_CX_CFG_BE_IRQ_ENABLE)
-#define flags_memcpy_lli_chained (COH901318_CX_CTRL_TC_ENABLE | \
-			COH901318_CX_CTRL_BURST_COUNT_32_BYTES | \
-			COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_DST_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_MASTER_MODE_M1RW | \
-			COH901318_CX_CTRL_TCP_DISABLE | \
-			COH901318_CX_CTRL_TC_IRQ_DISABLE | \
-			COH901318_CX_CTRL_HSP_DISABLE | \
-			COH901318_CX_CTRL_HSS_DISABLE | \
-			COH901318_CX_CTRL_DDMA_LEGACY | \
-			COH901318_CX_CTRL_PRDD_SOURCE)
-#define flags_memcpy_lli (COH901318_CX_CTRL_TC_ENABLE | \
-			COH901318_CX_CTRL_BURST_COUNT_32_BYTES | \
-			COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_DST_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_MASTER_MODE_M1RW | \
-			COH901318_CX_CTRL_TCP_DISABLE | \
-			COH901318_CX_CTRL_TC_IRQ_DISABLE | \
-			COH901318_CX_CTRL_HSP_DISABLE | \
-			COH901318_CX_CTRL_HSS_DISABLE | \
-			COH901318_CX_CTRL_DDMA_LEGACY | \
-			COH901318_CX_CTRL_PRDD_SOURCE)
-#define flags_memcpy_lli_last (COH901318_CX_CTRL_TC_ENABLE | \
-			COH901318_CX_CTRL_BURST_COUNT_32_BYTES | \
-			COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS | \
-			COH901318_CX_CTRL_DST_ADDR_INC_ENABLE | \
-			COH901318_CX_CTRL_MASTER_MODE_M1RW | \
-			COH901318_CX_CTRL_TCP_DISABLE | \
-			COH901318_CX_CTRL_TC_IRQ_ENABLE | \
-			COH901318_CX_CTRL_HSP_DISABLE | \
-			COH901318_CX_CTRL_HSS_DISABLE | \
-			COH901318_CX_CTRL_DDMA_LEGACY | \
-			COH901318_CX_CTRL_PRDD_SOURCE)
-
-static const struct coh_dma_channel chan_config[U300_DMA_CHANNELS] = {
-	{
-		.number = U300_DMA_MSL_TX_0,
-		.name = "MSL TX 0",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_MSL_TX_1,
-		.name = "MSL TX 1",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-	},
-	{
-		.number = U300_DMA_MSL_TX_2,
-		.name = "MSL TX 2",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.desc_nbr_max = 10,
-	},
-	{
-		.number = U300_DMA_MSL_TX_3,
-		.name = "MSL TX 3",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-	},
-	{
-		.number = U300_DMA_MSL_TX_4,
-		.name = "MSL TX 4",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1R_M2W |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-	},
-	{
-		.number = U300_DMA_MSL_TX_5,
-		.name = "MSL TX 5",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_MSL_TX_6,
-		.name = "MSL TX 6",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_MSL_RX_0,
-		.name = "MSL RX 0",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_MSL_RX_1,
-		.name = "MSL RX 1",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_MSL_RX_2,
-		.name = "MSL RX 2",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_MSL_RX_3,
-		.name = "MSL RX 3",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_MSL_RX_4,
-		.name = "MSL RX 4",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_MSL_RX_5,
-		.name = "MSL RX 5",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_32_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M2R_M1W |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_DEMAND_DMA1 |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_MSL_RX_6,
-		.name = "MSL RX 6",
-		.priority_high = 0,
-	},
-	/*
-	 * Don't set up device address, burst count or size of src
-	 * or dst bus for this peripheral - handled by PrimeCell
-	 * DMA extension.
-	 */
-	{
-		.number = U300_DMA_MMCSD_RX_TX,
-		.name = "MMCSD RX TX",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-
-	},
-	{
-		.number = U300_DMA_MSPRO_TX,
-		.name = "MSPRO TX",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_MSPRO_RX,
-		.name = "MSPRO RX",
-		.priority_high = 0,
-	},
-	/*
-	 * Don't set up device address, burst count or size of src
-	 * or dst bus for this peripheral - handled by PrimeCell
-	 * DMA extension.
-	 */
-	{
-		.number = U300_DMA_UART0_TX,
-		.name = "UART0 TX",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-	},
-	{
-		.number = U300_DMA_UART0_RX,
-		.name = "UART0 RX",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-	},
-	{
-		.number = U300_DMA_APEX_TX,
-		.name = "APEX TX",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_APEX_RX,
-		.name = "APEX RX",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_PCM_I2S0_TX,
-		.name = "PCM I2S0 TX",
-		.priority_high = 1,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-	},
-	{
-		.number = U300_DMA_PCM_I2S0_RX,
-		.name = "PCM I2S0 RX",
-		.priority_high = 1,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_PCM_I2S1_TX,
-		.name = "PCM I2S1 TX",
-		.priority_high = 1,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_SOURCE,
-	},
-	{
-		.number = U300_DMA_PCM_I2S1_RX,
-		.name = "PCM I2S1 RX",
-		.priority_high = 1,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_BURST_COUNT_16_BYTES |
-				COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_SRC_ADDR_INC_DISABLE |
-				COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS |
-				COH901318_CX_CTRL_DST_ADDR_INC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_ENABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY |
-				COH901318_CX_CTRL_PRDD_DEST,
-	},
-	{
-		.number = U300_DMA_XGAM_CDI,
-		.name = "XGAM CDI",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_XGAM_PDI,
-		.name = "XGAM PDI",
-		.priority_high = 0,
-	},
-	/*
-	 * Don't set up device address, burst count or size of src
-	 * or dst bus for this peripheral - handled by PrimeCell
-	 * DMA extension.
-	 */
-	{
-		.number = U300_DMA_SPI_TX,
-		.name = "SPI TX",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-	},
-	{
-		.number = U300_DMA_SPI_RX,
-		.name = "SPI RX",
-		.priority_high = 0,
-		.param.config = COH901318_CX_CFG_CH_DISABLE |
-				COH901318_CX_CFG_LCR_DISABLE |
-				COH901318_CX_CFG_TC_IRQ_ENABLE |
-				COH901318_CX_CFG_BE_IRQ_ENABLE,
-		.param.ctrl_lli_chained = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_DISABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-		.param.ctrl_lli_last = 0 |
-				COH901318_CX_CTRL_TC_ENABLE |
-				COH901318_CX_CTRL_MASTER_MODE_M1RW |
-				COH901318_CX_CTRL_TCP_DISABLE |
-				COH901318_CX_CTRL_TC_IRQ_ENABLE |
-				COH901318_CX_CTRL_HSP_ENABLE |
-				COH901318_CX_CTRL_HSS_DISABLE |
-				COH901318_CX_CTRL_DDMA_LEGACY,
-
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_0,
-		.name = "GENERAL 00",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_1,
-		.name = "GENERAL 01",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_2,
-		.name = "GENERAL 02",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_3,
-		.name = "GENERAL 03",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_4,
-		.name = "GENERAL 04",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_5,
-		.name = "GENERAL 05",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_6,
-		.name = "GENERAL 06",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_7,
-		.name = "GENERAL 07",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_GENERAL_PURPOSE_8,
-		.name = "GENERAL 08",
-		.priority_high = 0,
-
-		.param.config = flags_memcpy_config,
-		.param.ctrl_lli_chained = flags_memcpy_lli_chained,
-		.param.ctrl_lli = flags_memcpy_lli,
-		.param.ctrl_lli_last = flags_memcpy_lli_last,
-	},
-	{
-		.number = U300_DMA_UART1_TX,
-		.name = "UART1 TX",
-		.priority_high = 0,
-	},
-	{
-		.number = U300_DMA_UART1_RX,
-		.name = "UART1 RX",
-		.priority_high = 0,
-	}
-};
-
-#define COHC_2_DEV(cohc) (&cohc->chan.dev->device)
-
-#ifdef VERBOSE_DEBUG
-#define COH_DBG(x) ({ if (1) x; 0; })
-#else
-#define COH_DBG(x) ({ if (0) x; 0; })
-#endif
-
-struct coh901318_desc {
-	struct dma_async_tx_descriptor desc;
-	struct list_head node;
-	struct scatterlist *sg;
-	unsigned int sg_len;
-	struct coh901318_lli *lli;
-	enum dma_transfer_direction dir;
-	unsigned long flags;
-	u32 head_config;
-	u32 head_ctrl;
-};
-
-struct coh901318_base {
-	struct device *dev;
-	void __iomem *virtbase;
-	unsigned int irq;
-	struct coh901318_pool pool;
-	struct powersave pm;
-	struct dma_device dma_slave;
-	struct dma_device dma_memcpy;
-	struct coh901318_chan *chans;
-};
-
-struct coh901318_chan {
-	spinlock_t lock;
-	int allocated;
-	int id;
-	int stopped;
-
-	struct work_struct free_work;
-	struct dma_chan chan;
-
-	struct tasklet_struct tasklet;
-
-	struct list_head active;
-	struct list_head queue;
-	struct list_head free;
-
-	unsigned long nbr_active_done;
-	unsigned long busy;
-
-	struct dma_slave_config config;
-	u32 addr;
-	u32 ctrl;
-
-	struct coh901318_base *base;
-};
-
-static void coh901318_list_print(struct coh901318_chan *cohc,
-				 struct coh901318_lli *lli)
-{
-	struct coh901318_lli *l = lli;
-	int i = 0;
-
-	while (l) {
-		dev_vdbg(COHC_2_DEV(cohc), "i %d, lli %p, ctrl 0x%x, src %pad"
-			 ", dst %pad, link %pad virt_link_addr 0x%p\n",
-			 i, l, l->control, &l->src_addr, &l->dst_addr,
-			 &l->link_addr, l->virt_link_addr);
-		i++;
-		l = l->virt_link_addr;
-	}
-}
-
-#ifdef CONFIG_DEBUG_FS
-
-#define COH901318_DEBUGFS_ASSIGN(x, y) (x = y)
-
-static struct coh901318_base *debugfs_dma_base;
-static struct dentry *dma_dentry;
-
-static ssize_t coh901318_debugfs_read(struct file *file, char __user *buf,
-				  size_t count, loff_t *f_pos)
-{
-	u64 started_channels = debugfs_dma_base->pm.started_channels;
-	int pool_count = debugfs_dma_base->pool.debugfs_pool_counter;
-	char *dev_buf;
-	char *tmp;
-	int ret;
-	int i;
-
-	dev_buf = kmalloc(4*1024, GFP_KERNEL);
-	if (dev_buf == NULL)
-		return -ENOMEM;
-	tmp = dev_buf;
-
-	tmp += sprintf(tmp, "DMA -- enabled dma channels\n");
-
-	for (i = 0; i < U300_DMA_CHANNELS; i++) {
-		if (started_channels & (1ULL << i))
-			tmp += sprintf(tmp, "channel %d\n", i);
-	}
-
-	tmp += sprintf(tmp, "Pool alloc nbr %d\n", pool_count);
-
-	ret = simple_read_from_buffer(buf, count, f_pos, dev_buf, 
-					tmp - dev_buf);
-	kfree(dev_buf);
-	return ret;
-}
-
-static const struct file_operations coh901318_debugfs_status_operations = {
-	.open		= simple_open,
-	.read		= coh901318_debugfs_read,
-	.llseek		= default_llseek,
-};
-
-
-static int __init init_coh901318_debugfs(void)
-{
-
-	dma_dentry = debugfs_create_dir("dma", NULL);
-
-	debugfs_create_file("status", S_IFREG | S_IRUGO, dma_dentry, NULL,
-			    &coh901318_debugfs_status_operations);
-	return 0;
-}
-
-static void __exit exit_coh901318_debugfs(void)
-{
-	debugfs_remove_recursive(dma_dentry);
-}
-
-module_init(init_coh901318_debugfs);
-module_exit(exit_coh901318_debugfs);
-#else
-
-#define COH901318_DEBUGFS_ASSIGN(x, y)
-
-#endif /* CONFIG_DEBUG_FS */
-
-static inline struct coh901318_chan *to_coh901318_chan(struct dma_chan *chan)
-{
-	return container_of(chan, struct coh901318_chan, chan);
-}
-
-static int coh901318_dma_set_runtimeconfig(struct dma_chan *chan,
-					   struct dma_slave_config *config,
-					   enum dma_transfer_direction direction);
-
-static inline const struct coh901318_params *
-cohc_chan_param(struct coh901318_chan *cohc)
-{
-	return &chan_config[cohc->id].param;
-}
-
-static inline const struct coh_dma_channel *
-cohc_chan_conf(struct coh901318_chan *cohc)
-{
-	return &chan_config[cohc->id];
-}
-
-static void enable_powersave(struct coh901318_chan *cohc)
-{
-	unsigned long flags;
-	struct powersave *pm = &cohc->base->pm;
-
-	spin_lock_irqsave(&pm->lock, flags);
-
-	pm->started_channels &= ~(1ULL << cohc->id);
-
-	spin_unlock_irqrestore(&pm->lock, flags);
-}
-static void disable_powersave(struct coh901318_chan *cohc)
-{
-	unsigned long flags;
-	struct powersave *pm = &cohc->base->pm;
-
-	spin_lock_irqsave(&pm->lock, flags);
-
-	pm->started_channels |= (1ULL << cohc->id);
-
-	spin_unlock_irqrestore(&pm->lock, flags);
-}
-
-static inline int coh901318_set_ctrl(struct coh901318_chan *cohc, u32 control)
-{
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	writel(control,
-	       virtbase + COH901318_CX_CTRL +
-	       COH901318_CX_CTRL_SPACING * channel);
-	return 0;
-}
-
-static inline int coh901318_set_conf(struct coh901318_chan *cohc, u32 conf)
-{
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	writel(conf,
-	       virtbase + COH901318_CX_CFG +
-	       COH901318_CX_CFG_SPACING*channel);
-	return 0;
-}
-
-
-static int coh901318_start(struct coh901318_chan *cohc)
-{
-	u32 val;
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	disable_powersave(cohc);
-
-	val = readl(virtbase + COH901318_CX_CFG +
-		    COH901318_CX_CFG_SPACING * channel);
-
-	/* Enable channel */
-	val |= COH901318_CX_CFG_CH_ENABLE;
-	writel(val, virtbase + COH901318_CX_CFG +
-	       COH901318_CX_CFG_SPACING * channel);
-
-	return 0;
-}
-
-static int coh901318_prep_linked_list(struct coh901318_chan *cohc,
-				      struct coh901318_lli *lli)
-{
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	BUG_ON(readl(virtbase + COH901318_CX_STAT +
-		     COH901318_CX_STAT_SPACING*channel) &
-	       COH901318_CX_STAT_ACTIVE);
-
-	writel(lli->src_addr,
-	       virtbase + COH901318_CX_SRC_ADDR +
-	       COH901318_CX_SRC_ADDR_SPACING * channel);
-
-	writel(lli->dst_addr, virtbase +
-	       COH901318_CX_DST_ADDR +
-	       COH901318_CX_DST_ADDR_SPACING * channel);
-
-	writel(lli->link_addr, virtbase + COH901318_CX_LNK_ADDR +
-	       COH901318_CX_LNK_ADDR_SPACING * channel);
-
-	writel(lli->control, virtbase + COH901318_CX_CTRL +
-	       COH901318_CX_CTRL_SPACING * channel);
-
-	return 0;
-}
-
-static struct coh901318_desc *
-coh901318_desc_get(struct coh901318_chan *cohc)
-{
-	struct coh901318_desc *desc;
-
-	if (list_empty(&cohc->free)) {
-		/* alloc new desc because we're out of used ones
-		 * TODO: alloc a pile of descs instead of just one,
-		 * avoid many small allocations.
-		 */
-		desc = kzalloc(sizeof(struct coh901318_desc), GFP_NOWAIT);
-		if (desc == NULL)
-			goto out;
-		INIT_LIST_HEAD(&desc->node);
-		dma_async_tx_descriptor_init(&desc->desc, &cohc->chan);
-	} else {
-		/* Reuse an old desc. */
-		desc = list_first_entry(&cohc->free,
-					struct coh901318_desc,
-					node);
-		list_del(&desc->node);
-		/* Initialize it a bit so it's not insane */
-		desc->sg = NULL;
-		desc->sg_len = 0;
-		desc->desc.callback = NULL;
-		desc->desc.callback_param = NULL;
-	}
-
- out:
-	return desc;
-}
-
-static void
-coh901318_desc_free(struct coh901318_chan *cohc, struct coh901318_desc *cohd)
-{
-	list_add_tail(&cohd->node, &cohc->free);
-}
-
-/* call with irq lock held */
-static void
-coh901318_desc_submit(struct coh901318_chan *cohc, struct coh901318_desc *desc)
-{
-	list_add_tail(&desc->node, &cohc->active);
-}
-
-static struct coh901318_desc *
-coh901318_first_active_get(struct coh901318_chan *cohc)
-{
-	return list_first_entry_or_null(&cohc->active, struct coh901318_desc,
-					node);
-}
-
-static void
-coh901318_desc_remove(struct coh901318_desc *cohd)
-{
-	list_del(&cohd->node);
-}
-
-static void
-coh901318_desc_queue(struct coh901318_chan *cohc, struct coh901318_desc *desc)
-{
-	list_add_tail(&desc->node, &cohc->queue);
-}
-
-static struct coh901318_desc *
-coh901318_first_queued(struct coh901318_chan *cohc)
-{
-	return list_first_entry_or_null(&cohc->queue, struct coh901318_desc,
-					node);
-}
-
-static inline u32 coh901318_get_bytes_in_lli(struct coh901318_lli *in_lli)
-{
-	struct coh901318_lli *lli = in_lli;
-	u32 bytes = 0;
-
-	while (lli) {
-		bytes += lli->control & COH901318_CX_CTRL_TC_VALUE_MASK;
-		lli = lli->virt_link_addr;
-	}
-	return bytes;
-}
-
-/*
- * Get the number of bytes left to transfer on this channel,
- * it is unwise to call this before stopping the channel for
- * absolute measures, but for a rough guess you can still call
- * it.
- */
-static u32 coh901318_get_bytes_left(struct dma_chan *chan)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	struct coh901318_desc *cohd;
-	struct list_head *pos;
-	unsigned long flags;
-	u32 left = 0;
-	int i = 0;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/*
-	 * If there are many queued jobs, we iterate and add the
-	 * size of them all. We take a special look on the first
-	 * job though, since it is probably active.
-	 */
-	list_for_each(pos, &cohc->active) {
-		/*
-		 * The first job in the list will be working on the
-		 * hardware. The job can be stopped but still active,
-		 * so that the transfer counter is somewhere inside
-		 * the buffer.
-		 */
-		cohd = list_entry(pos, struct coh901318_desc, node);
-
-		if (i == 0) {
-			struct coh901318_lli *lli;
-			dma_addr_t ladd;
-
-			/* Read current transfer count value */
-			left = readl(cohc->base->virtbase +
-				     COH901318_CX_CTRL +
-				     COH901318_CX_CTRL_SPACING * cohc->id) &
-				COH901318_CX_CTRL_TC_VALUE_MASK;
-
-			/* See if the transfer is linked... */
-			ladd = readl(cohc->base->virtbase +
-				     COH901318_CX_LNK_ADDR +
-				     COH901318_CX_LNK_ADDR_SPACING *
-				     cohc->id) &
-				~COH901318_CX_LNK_LINK_IMMEDIATE;
-			/* Single transaction */
-			if (!ladd)
-				continue;
-
-			/*
-			 * Linked transaction, follow the lli, find the
-			 * currently processing lli, and proceed to the next
-			 */
-			lli = cohd->lli;
-			while (lli && lli->link_addr != ladd)
-				lli = lli->virt_link_addr;
-
-			if (lli)
-				lli = lli->virt_link_addr;
-
-			/*
-			 * Follow remaining lli links around to count the total
-			 * number of bytes left
-			 */
-			left += coh901318_get_bytes_in_lli(lli);
-		} else {
-			left += coh901318_get_bytes_in_lli(cohd->lli);
-		}
-		i++;
-	}
-
-	/* Also count bytes in the queued jobs */
-	list_for_each(pos, &cohc->queue) {
-		cohd = list_entry(pos, struct coh901318_desc, node);
-		left += coh901318_get_bytes_in_lli(cohd->lli);
-	}
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	return left;
-}
-
-/*
- * Pauses a transfer without losing data. Enables power save.
- * Use this function in conjunction with coh901318_resume.
- */
-static int coh901318_pause(struct dma_chan *chan)
-{
-	u32 val;
-	unsigned long flags;
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/* Disable channel in HW */
-	val = readl(virtbase + COH901318_CX_CFG +
-		    COH901318_CX_CFG_SPACING * channel);
-
-	/* Stopping infinite transfer */
-	if ((val & COH901318_CX_CTRL_TC_ENABLE) == 0 &&
-	    (val & COH901318_CX_CFG_CH_ENABLE))
-		cohc->stopped = 1;
-
-
-	val &= ~COH901318_CX_CFG_CH_ENABLE;
-	/* Enable twice, HW bug work around */
-	writel(val, virtbase + COH901318_CX_CFG +
-	       COH901318_CX_CFG_SPACING * channel);
-	writel(val, virtbase + COH901318_CX_CFG +
-	       COH901318_CX_CFG_SPACING * channel);
-
-	/* Spin-wait for it to actually go inactive */
-	while (readl(virtbase + COH901318_CX_STAT+COH901318_CX_STAT_SPACING *
-		     channel) & COH901318_CX_STAT_ACTIVE)
-		cpu_relax();
-
-	/* Check if we stopped an active job */
-	if ((readl(virtbase + COH901318_CX_CTRL+COH901318_CX_CTRL_SPACING *
-		   channel) & COH901318_CX_CTRL_TC_VALUE_MASK) > 0)
-		cohc->stopped = 1;
-
-	enable_powersave(cohc);
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-	return 0;
-}
-
-/* Resumes a transfer that has been stopped via 300_dma_stop(..).
-   Power save is handled.
-*/
-static int coh901318_resume(struct dma_chan *chan)
-{
-	u32 val;
-	unsigned long flags;
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	int channel = cohc->id;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	disable_powersave(cohc);
-
-	if (cohc->stopped) {
-		/* Enable channel in HW */
-		val = readl(cohc->base->virtbase + COH901318_CX_CFG +
-			    COH901318_CX_CFG_SPACING * channel);
-
-		val |= COH901318_CX_CFG_CH_ENABLE;
-
-		writel(val, cohc->base->virtbase + COH901318_CX_CFG +
-		       COH901318_CX_CFG_SPACING*channel);
-
-		cohc->stopped = 0;
-	}
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-	return 0;
-}
-
-bool coh901318_filter_id(struct dma_chan *chan, void *chan_id)
-{
-	unsigned long ch_nr = (unsigned long) chan_id;
-
-	if (ch_nr == to_coh901318_chan(chan)->id)
-		return true;
-
-	return false;
-}
-EXPORT_SYMBOL(coh901318_filter_id);
-
-struct coh901318_filter_args {
-	struct coh901318_base *base;
-	unsigned int ch_nr;
-};
-
-static bool coh901318_filter_base_and_id(struct dma_chan *chan, void *data)
-{
-	struct coh901318_filter_args *args = data;
-
-	if (&args->base->dma_slave == chan->device &&
-	    args->ch_nr == to_coh901318_chan(chan)->id)
-		return true;
-
-	return false;
-}
-
-static struct dma_chan *coh901318_xlate(struct of_phandle_args *dma_spec,
-					struct of_dma *ofdma)
-{
-	struct coh901318_filter_args args = {
-		.base = ofdma->of_dma_data,
-		.ch_nr = dma_spec->args[0],
-	};
-	dma_cap_mask_t cap;
-	dma_cap_zero(cap);
-	dma_cap_set(DMA_SLAVE, cap);
-
-	return dma_request_channel(cap, coh901318_filter_base_and_id, &args);
-}
-/*
- * DMA channel allocation
- */
-static int coh901318_config(struct coh901318_chan *cohc,
-			    struct coh901318_params *param)
-{
-	const struct coh901318_params *p;
-	int channel = cohc->id;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	if (param)
-		p = param;
-	else
-		p = cohc_chan_param(cohc);
-
-	/* Clear any pending BE or TC interrupt */
-	if (channel < 32) {
-		writel(1 << channel, virtbase + COH901318_BE_INT_CLEAR1);
-		writel(1 << channel, virtbase + COH901318_TC_INT_CLEAR1);
-	} else {
-		writel(1 << (channel - 32), virtbase +
-		       COH901318_BE_INT_CLEAR2);
-		writel(1 << (channel - 32), virtbase +
-		       COH901318_TC_INT_CLEAR2);
-	}
-
-	coh901318_set_conf(cohc, p->config);
-	coh901318_set_ctrl(cohc, p->ctrl_lli_last);
-
-	return 0;
-}
-
-/* must lock when calling this function
- * start queued jobs, if any
- * TODO: start all queued jobs in one go
- *
- * Returns descriptor if queued job is started otherwise NULL.
- * If the queue is empty NULL is returned.
- */
-static struct coh901318_desc *coh901318_queue_start(struct coh901318_chan *cohc)
-{
-	struct coh901318_desc *cohd;
-
-	/*
-	 * start queued jobs, if any
-	 * TODO: transmit all queued jobs in one go
-	 */
-	cohd = coh901318_first_queued(cohc);
-
-	if (cohd != NULL) {
-		/* Remove from queue */
-		coh901318_desc_remove(cohd);
-		/* initiate DMA job */
-		cohc->busy = 1;
-
-		coh901318_desc_submit(cohc, cohd);
-
-		/* Program the transaction head */
-		coh901318_set_conf(cohc, cohd->head_config);
-		coh901318_set_ctrl(cohc, cohd->head_ctrl);
-		coh901318_prep_linked_list(cohc, cohd->lli);
-
-		/* start dma job on this channel */
-		coh901318_start(cohc);
-
-	}
-
-	return cohd;
-}
-
-/*
- * This tasklet is called from the interrupt handler to
- * handle each descriptor (DMA job) that is sent to a channel.
- */
-static void dma_tasklet(struct tasklet_struct *t)
-{
-	struct coh901318_chan *cohc = from_tasklet(cohc, t, tasklet);
-	struct coh901318_desc *cohd_fin;
-	unsigned long flags;
-	struct dmaengine_desc_callback cb;
-
-	dev_vdbg(COHC_2_DEV(cohc), "[%s] chan_id %d"
-		 " nbr_active_done %ld\n", __func__,
-		 cohc->id, cohc->nbr_active_done);
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/* get first active descriptor entry from list */
-	cohd_fin = coh901318_first_active_get(cohc);
-
-	if (cohd_fin == NULL)
-		goto err;
-
-	/* locate callback to client */
-	dmaengine_desc_get_callback(&cohd_fin->desc, &cb);
-
-	/* sign this job as completed on the channel */
-	dma_cookie_complete(&cohd_fin->desc);
-
-	/* release the lli allocation and remove the descriptor */
-	coh901318_lli_free(&cohc->base->pool, &cohd_fin->lli);
-
-	/* return desc to free-list */
-	coh901318_desc_remove(cohd_fin);
-	coh901318_desc_free(cohc, cohd_fin);
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	/* Call the callback when we're done */
-	dmaengine_desc_callback_invoke(&cb, NULL);
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/*
-	 * If another interrupt fired while the tasklet was scheduling,
-	 * we don't get called twice, so we have this number of active
-	 * counter that keep track of the number of IRQs expected to
-	 * be handled for this channel. If there happen to be more than
-	 * one IRQ to be ack:ed, we simply schedule this tasklet again.
-	 */
-	cohc->nbr_active_done--;
-	if (cohc->nbr_active_done) {
-		dev_dbg(COHC_2_DEV(cohc), "scheduling tasklet again, new IRQs "
-			"came in while we were scheduling this tasklet\n");
-		if (cohc_chan_conf(cohc)->priority_high)
-			tasklet_hi_schedule(&cohc->tasklet);
-		else
-			tasklet_schedule(&cohc->tasklet);
-	}
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	return;
-
- err:
-	spin_unlock_irqrestore(&cohc->lock, flags);
-	dev_err(COHC_2_DEV(cohc), "[%s] No active dma desc\n", __func__);
-}
-
-
-/* called from interrupt context */
-static void dma_tc_handle(struct coh901318_chan *cohc)
-{
-	/*
-	 * If the channel is not allocated, then we shouldn't have
-	 * any TC interrupts on it.
-	 */
-	if (!cohc->allocated) {
-		dev_err(COHC_2_DEV(cohc), "spurious interrupt from "
-			"unallocated channel\n");
-		return;
-	}
-
-	/*
-	 * When we reach this point, at least one queue item
-	 * should have been moved over from cohc->queue to
-	 * cohc->active and run to completion, that is why we're
-	 * getting a terminal count interrupt is it not?
-	 * If you get this BUG() the most probable cause is that
-	 * the individual nodes in the lli chain have IRQ enabled,
-	 * so check your platform config for lli chain ctrl.
-	 */
-	BUG_ON(list_empty(&cohc->active));
-
-	cohc->nbr_active_done++;
-
-	/*
-	 * This attempt to take a job from cohc->queue, put it
-	 * into cohc->active and start it.
-	 */
-	if (coh901318_queue_start(cohc) == NULL)
-		cohc->busy = 0;
-
-	/*
-	 * This tasklet will remove items from cohc->active
-	 * and thus terminates them.
-	 */
-	if (cohc_chan_conf(cohc)->priority_high)
-		tasklet_hi_schedule(&cohc->tasklet);
-	else
-		tasklet_schedule(&cohc->tasklet);
-}
-
-
-static irqreturn_t dma_irq_handler(int irq, void *dev_id)
-{
-	u32 status1;
-	u32 status2;
-	int i;
-	int ch;
-	struct coh901318_base *base  = dev_id;
-	struct coh901318_chan *cohc;
-	void __iomem *virtbase = base->virtbase;
-
-	status1 = readl(virtbase + COH901318_INT_STATUS1);
-	status2 = readl(virtbase + COH901318_INT_STATUS2);
-
-	if (unlikely(status1 == 0 && status2 == 0)) {
-		dev_warn(base->dev, "spurious DMA IRQ from no channel!\n");
-		return IRQ_HANDLED;
-	}
-
-	/* TODO: consider handle IRQ in tasklet here to
-	 *       minimize interrupt latency */
-
-	/* Check the first 32 DMA channels for IRQ */
-	while (status1) {
-		/* Find first bit set, return as a number. */
-		i = ffs(status1) - 1;
-		ch = i;
-
-		cohc = &base->chans[ch];
-		spin_lock(&cohc->lock);
-
-		/* Mask off this bit */
-		status1 &= ~(1 << i);
-		/* Check the individual channel bits */
-		if (test_bit(i, virtbase + COH901318_BE_INT_STATUS1)) {
-			dev_crit(COHC_2_DEV(cohc),
-				 "DMA bus error on channel %d!\n", ch);
-			BUG_ON(1);
-			/* Clear BE interrupt */
-			__set_bit(i, virtbase + COH901318_BE_INT_CLEAR1);
-		} else {
-			/* Caused by TC, really? */
-			if (unlikely(!test_bit(i, virtbase +
-					       COH901318_TC_INT_STATUS1))) {
-				dev_warn(COHC_2_DEV(cohc),
-					 "ignoring interrupt not caused by terminal count on channel %d\n", ch);
-				/* Clear TC interrupt */
-				BUG_ON(1);
-				__set_bit(i, virtbase + COH901318_TC_INT_CLEAR1);
-			} else {
-				/* Enable powersave if transfer has finished */
-				if (!(readl(virtbase + COH901318_CX_STAT +
-					    COH901318_CX_STAT_SPACING*ch) &
-				      COH901318_CX_STAT_ENABLED)) {
-					enable_powersave(cohc);
-				}
-
-				/* Must clear TC interrupt before calling
-				 * dma_tc_handle
-				 * in case tc_handle initiate a new dma job
-				 */
-				__set_bit(i, virtbase + COH901318_TC_INT_CLEAR1);
-
-				dma_tc_handle(cohc);
-			}
-		}
-		spin_unlock(&cohc->lock);
-	}
-
-	/* Check the remaining 32 DMA channels for IRQ */
-	while (status2) {
-		/* Find first bit set, return as a number. */
-		i = ffs(status2) - 1;
-		ch = i + 32;
-		cohc = &base->chans[ch];
-		spin_lock(&cohc->lock);
-
-		/* Mask off this bit */
-		status2 &= ~(1 << i);
-		/* Check the individual channel bits */
-		if (test_bit(i, virtbase + COH901318_BE_INT_STATUS2)) {
-			dev_crit(COHC_2_DEV(cohc),
-				 "DMA bus error on channel %d!\n", ch);
-			/* Clear BE interrupt */
-			BUG_ON(1);
-			__set_bit(i, virtbase + COH901318_BE_INT_CLEAR2);
-		} else {
-			/* Caused by TC, really? */
-			if (unlikely(!test_bit(i, virtbase +
-					       COH901318_TC_INT_STATUS2))) {
-				dev_warn(COHC_2_DEV(cohc),
-					 "ignoring interrupt not caused by terminal count on channel %d\n", ch);
-				/* Clear TC interrupt */
-				__set_bit(i, virtbase + COH901318_TC_INT_CLEAR2);
-				BUG_ON(1);
-			} else {
-				/* Enable powersave if transfer has finished */
-				if (!(readl(virtbase + COH901318_CX_STAT +
-					    COH901318_CX_STAT_SPACING*ch) &
-				      COH901318_CX_STAT_ENABLED)) {
-					enable_powersave(cohc);
-				}
-				/* Must clear TC interrupt before calling
-				 * dma_tc_handle
-				 * in case tc_handle initiate a new dma job
-				 */
-				__set_bit(i, virtbase + COH901318_TC_INT_CLEAR2);
-
-				dma_tc_handle(cohc);
-			}
-		}
-		spin_unlock(&cohc->lock);
-	}
-
-	return IRQ_HANDLED;
-}
-
-static int coh901318_terminate_all(struct dma_chan *chan)
-{
-	unsigned long flags;
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	struct coh901318_desc *cohd;
-	void __iomem *virtbase = cohc->base->virtbase;
-
-	/* The remainder of this function terminates the transfer */
-	coh901318_pause(chan);
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/* Clear any pending BE or TC interrupt */
-	if (cohc->id < 32) {
-		writel(1 << cohc->id, virtbase + COH901318_BE_INT_CLEAR1);
-		writel(1 << cohc->id, virtbase + COH901318_TC_INT_CLEAR1);
-	} else {
-		writel(1 << (cohc->id - 32), virtbase +
-		       COH901318_BE_INT_CLEAR2);
-		writel(1 << (cohc->id - 32), virtbase +
-		       COH901318_TC_INT_CLEAR2);
-	}
-
-	enable_powersave(cohc);
-
-	while ((cohd = coh901318_first_active_get(cohc))) {
-		/* release the lli allocation*/
-		coh901318_lli_free(&cohc->base->pool, &cohd->lli);
-
-		/* return desc to free-list */
-		coh901318_desc_remove(cohd);
-		coh901318_desc_free(cohc, cohd);
-	}
-
-	while ((cohd = coh901318_first_queued(cohc))) {
-		/* release the lli allocation*/
-		coh901318_lli_free(&cohc->base->pool, &cohd->lli);
-
-		/* return desc to free-list */
-		coh901318_desc_remove(cohd);
-		coh901318_desc_free(cohc, cohd);
-	}
-
-
-	cohc->nbr_active_done = 0;
-	cohc->busy = 0;
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	return 0;
-}
-
-static int coh901318_alloc_chan_resources(struct dma_chan *chan)
-{
-	struct coh901318_chan	*cohc = to_coh901318_chan(chan);
-	unsigned long flags;
-
-	dev_vdbg(COHC_2_DEV(cohc), "[%s] DMA channel %d\n",
-		 __func__, cohc->id);
-
-	if (chan->client_count > 1)
-		return -EBUSY;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	coh901318_config(cohc, NULL);
-
-	cohc->allocated = 1;
-	dma_cookie_init(chan);
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	return 1;
-}
-
-static void
-coh901318_free_chan_resources(struct dma_chan *chan)
-{
-	struct coh901318_chan	*cohc = to_coh901318_chan(chan);
-	int channel = cohc->id;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/* Disable HW */
-	writel(0x00000000U, cohc->base->virtbase + COH901318_CX_CFG +
-	       COH901318_CX_CFG_SPACING*channel);
-	writel(0x00000000U, cohc->base->virtbase + COH901318_CX_CTRL +
-	       COH901318_CX_CTRL_SPACING*channel);
-
-	cohc->allocated = 0;
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	coh901318_terminate_all(chan);
-}
-
-
-static dma_cookie_t
-coh901318_tx_submit(struct dma_async_tx_descriptor *tx)
-{
-	struct coh901318_desc *cohd = container_of(tx, struct coh901318_desc,
-						   desc);
-	struct coh901318_chan *cohc = to_coh901318_chan(tx->chan);
-	unsigned long flags;
-	dma_cookie_t cookie;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-	cookie = dma_cookie_assign(tx);
-
-	coh901318_desc_queue(cohc, cohd);
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-
-	return cookie;
-}
-
-static struct dma_async_tx_descriptor *
-coh901318_prep_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
-		      size_t size, unsigned long flags)
-{
-	struct coh901318_lli *lli;
-	struct coh901318_desc *cohd;
-	unsigned long flg;
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	int lli_len;
-	u32 ctrl_last = cohc_chan_param(cohc)->ctrl_lli_last;
-	int ret;
-
-	spin_lock_irqsave(&cohc->lock, flg);
-
-	dev_vdbg(COHC_2_DEV(cohc),
-		 "[%s] channel %d src %pad dest %pad size %zu\n",
-		 __func__, cohc->id, &src, &dest, size);
-
-	if (flags & DMA_PREP_INTERRUPT)
-		/* Trigger interrupt after last lli */
-		ctrl_last |= COH901318_CX_CTRL_TC_IRQ_ENABLE;
-
-	lli_len = size >> MAX_DMA_PACKET_SIZE_SHIFT;
-	if ((lli_len << MAX_DMA_PACKET_SIZE_SHIFT) < size)
-		lli_len++;
-
-	lli = coh901318_lli_alloc(&cohc->base->pool, lli_len);
-
-	if (lli == NULL)
-		goto err;
-
-	ret = coh901318_lli_fill_memcpy(
-		&cohc->base->pool, lli, src, size, dest,
-		cohc_chan_param(cohc)->ctrl_lli_chained,
-		ctrl_last);
-	if (ret)
-		goto err;
-
-	COH_DBG(coh901318_list_print(cohc, lli));
-
-	/* Pick a descriptor to handle this transfer */
-	cohd = coh901318_desc_get(cohc);
-	cohd->lli = lli;
-	cohd->flags = flags;
-	cohd->desc.tx_submit = coh901318_tx_submit;
-
-	spin_unlock_irqrestore(&cohc->lock, flg);
-
-	return &cohd->desc;
- err:
-	spin_unlock_irqrestore(&cohc->lock, flg);
-	return NULL;
-}
-
-static struct dma_async_tx_descriptor *
-coh901318_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
-			unsigned int sg_len, enum dma_transfer_direction direction,
-			unsigned long flags, void *context)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	struct coh901318_lli *lli;
-	struct coh901318_desc *cohd;
-	const struct coh901318_params *params;
-	struct scatterlist *sg;
-	int len = 0;
-	int size;
-	int i;
-	u32 ctrl_chained = cohc_chan_param(cohc)->ctrl_lli_chained;
-	u32 ctrl = cohc_chan_param(cohc)->ctrl_lli;
-	u32 ctrl_last = cohc_chan_param(cohc)->ctrl_lli_last;
-	u32 config;
-	unsigned long flg;
-	int ret;
-
-	if (!sgl)
-		goto out;
-	if (sg_dma_len(sgl) == 0)
-		goto out;
-
-	spin_lock_irqsave(&cohc->lock, flg);
-
-	dev_vdbg(COHC_2_DEV(cohc), "[%s] sg_len %d dir %d\n",
-		 __func__, sg_len, direction);
-
-	if (flags & DMA_PREP_INTERRUPT)
-		/* Trigger interrupt after last lli */
-		ctrl_last |= COH901318_CX_CTRL_TC_IRQ_ENABLE;
-
-	params = cohc_chan_param(cohc);
-	config = params->config;
-	/*
-	 * Add runtime-specific control on top, make
-	 * sure the bits you set per peripheral channel are
-	 * cleared in the default config from the platform.
-	 */
-	ctrl_chained |= cohc->ctrl;
-	ctrl_last |= cohc->ctrl;
-	ctrl |= cohc->ctrl;
-
-	if (direction == DMA_MEM_TO_DEV) {
-		u32 tx_flags = COH901318_CX_CTRL_PRDD_SOURCE |
-			COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE;
-
-		config |= COH901318_CX_CFG_RM_MEMORY_TO_PRIMARY;
-		ctrl_chained |= tx_flags;
-		ctrl_last |= tx_flags;
-		ctrl |= tx_flags;
-	} else if (direction == DMA_DEV_TO_MEM) {
-		u32 rx_flags = COH901318_CX_CTRL_PRDD_DEST |
-			COH901318_CX_CTRL_DST_ADDR_INC_ENABLE;
-
-		config |= COH901318_CX_CFG_RM_PRIMARY_TO_MEMORY;
-		ctrl_chained |= rx_flags;
-		ctrl_last |= rx_flags;
-		ctrl |= rx_flags;
-	} else
-		goto err_direction;
-
-	/* The dma only supports transmitting packages up to
-	 * MAX_DMA_PACKET_SIZE. Calculate to total number of
-	 * dma elemts required to send the entire sg list
-	 */
-	for_each_sg(sgl, sg, sg_len, i) {
-		unsigned int factor;
-		size = sg_dma_len(sg);
-
-		if (size <= MAX_DMA_PACKET_SIZE) {
-			len++;
-			continue;
-		}
-
-		factor = size >> MAX_DMA_PACKET_SIZE_SHIFT;
-		if ((factor << MAX_DMA_PACKET_SIZE_SHIFT) < size)
-			factor++;
-
-		len += factor;
-	}
-
-	pr_debug("Allocate %d lli:s for this transfer\n", len);
-	lli = coh901318_lli_alloc(&cohc->base->pool, len);
-
-	if (lli == NULL)
-		goto err_dma_alloc;
-
-	coh901318_dma_set_runtimeconfig(chan, &cohc->config, direction);
-
-	/* initiate allocated lli list */
-	ret = coh901318_lli_fill_sg(&cohc->base->pool, lli, sgl, sg_len,
-				    cohc->addr,
-				    ctrl_chained,
-				    ctrl,
-				    ctrl_last,
-				    direction, COH901318_CX_CTRL_TC_IRQ_ENABLE);
-	if (ret)
-		goto err_lli_fill;
-
-
-	COH_DBG(coh901318_list_print(cohc, lli));
-
-	/* Pick a descriptor to handle this transfer */
-	cohd = coh901318_desc_get(cohc);
-	cohd->head_config = config;
-	/*
-	 * Set the default head ctrl for the channel to the one from the
-	 * lli, things may have changed due to odd buffer alignment
-	 * etc.
-	 */
-	cohd->head_ctrl = lli->control;
-	cohd->dir = direction;
-	cohd->flags = flags;
-	cohd->desc.tx_submit = coh901318_tx_submit;
-	cohd->lli = lli;
-
-	spin_unlock_irqrestore(&cohc->lock, flg);
-
-	return &cohd->desc;
- err_lli_fill:
- err_dma_alloc:
- err_direction:
-	spin_unlock_irqrestore(&cohc->lock, flg);
- out:
-	return NULL;
-}
-
-static enum dma_status
-coh901318_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
-		 struct dma_tx_state *txstate)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	enum dma_status ret;
-
-	ret = dma_cookie_status(chan, cookie, txstate);
-	if (ret == DMA_COMPLETE || !txstate)
-		return ret;
-
-	dma_set_residue(txstate, coh901318_get_bytes_left(chan));
-
-	if (ret == DMA_IN_PROGRESS && cohc->stopped)
-		ret = DMA_PAUSED;
-
-	return ret;
-}
-
-static void
-coh901318_issue_pending(struct dma_chan *chan)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	unsigned long flags;
-
-	spin_lock_irqsave(&cohc->lock, flags);
-
-	/*
-	 * Busy means that pending jobs are already being processed,
-	 * and then there is no point in starting the queue: the
-	 * terminal count interrupt on the channel will take the next
-	 * job on the queue and execute it anyway.
-	 */
-	if (!cohc->busy)
-		coh901318_queue_start(cohc);
-
-	spin_unlock_irqrestore(&cohc->lock, flags);
-}
-
-/*
- * Here we wrap in the runtime dma control interface
- */
-struct burst_table {
-	int burst_8bit;
-	int burst_16bit;
-	int burst_32bit;
-	u32 reg;
-};
-
-static const struct burst_table burst_sizes[] = {
-	{
-		.burst_8bit = 64,
-		.burst_16bit = 32,
-		.burst_32bit = 16,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_64_BYTES,
-	},
-	{
-		.burst_8bit = 48,
-		.burst_16bit = 24,
-		.burst_32bit = 12,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_48_BYTES,
-	},
-	{
-		.burst_8bit = 32,
-		.burst_16bit = 16,
-		.burst_32bit = 8,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_32_BYTES,
-	},
-	{
-		.burst_8bit = 16,
-		.burst_16bit = 8,
-		.burst_32bit = 4,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_16_BYTES,
-	},
-	{
-		.burst_8bit = 8,
-		.burst_16bit = 4,
-		.burst_32bit = 2,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_8_BYTES,
-	},
-	{
-		.burst_8bit = 4,
-		.burst_16bit = 2,
-		.burst_32bit = 1,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_4_BYTES,
-	},
-	{
-		.burst_8bit = 2,
-		.burst_16bit = 1,
-		.burst_32bit = 0,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_2_BYTES,
-	},
-	{
-		.burst_8bit = 1,
-		.burst_16bit = 0,
-		.burst_32bit = 0,
-		.reg = COH901318_CX_CTRL_BURST_COUNT_1_BYTE,
-	},
-};
-
-static int coh901318_dma_set_runtimeconfig(struct dma_chan *chan,
-					   struct dma_slave_config *config,
-					   enum dma_transfer_direction direction)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-	dma_addr_t addr;
-	enum dma_slave_buswidth addr_width;
-	u32 maxburst;
-	u32 ctrl = 0;
-	int i = 0;
-
-	/* We only support mem to per or per to mem transfers */
-	if (direction == DMA_DEV_TO_MEM) {
-		addr = config->src_addr;
-		addr_width = config->src_addr_width;
-		maxburst = config->src_maxburst;
-	} else if (direction == DMA_MEM_TO_DEV) {
-		addr = config->dst_addr;
-		addr_width = config->dst_addr_width;
-		maxburst = config->dst_maxburst;
-	} else {
-		dev_err(COHC_2_DEV(cohc), "illegal channel mode\n");
-		return -EINVAL;
-	}
-
-	dev_dbg(COHC_2_DEV(cohc), "configure channel for %d byte transfers\n",
-		addr_width);
-	switch (addr_width)  {
-	case DMA_SLAVE_BUSWIDTH_1_BYTE:
-		ctrl |=
-			COH901318_CX_CTRL_SRC_BUS_SIZE_8_BITS |
-			COH901318_CX_CTRL_DST_BUS_SIZE_8_BITS;
-
-		while (i < ARRAY_SIZE(burst_sizes)) {
-			if (burst_sizes[i].burst_8bit <= maxburst)
-				break;
-			i++;
-		}
-
-		break;
-	case DMA_SLAVE_BUSWIDTH_2_BYTES:
-		ctrl |=
-			COH901318_CX_CTRL_SRC_BUS_SIZE_16_BITS |
-			COH901318_CX_CTRL_DST_BUS_SIZE_16_BITS;
-
-		while (i < ARRAY_SIZE(burst_sizes)) {
-			if (burst_sizes[i].burst_16bit <= maxburst)
-				break;
-			i++;
-		}
-
-		break;
-	case DMA_SLAVE_BUSWIDTH_4_BYTES:
-		/* Direction doesn't matter here, it's 32/32 bits */
-		ctrl |=
-			COH901318_CX_CTRL_SRC_BUS_SIZE_32_BITS |
-			COH901318_CX_CTRL_DST_BUS_SIZE_32_BITS;
-
-		while (i < ARRAY_SIZE(burst_sizes)) {
-			if (burst_sizes[i].burst_32bit <= maxburst)
-				break;
-			i++;
-		}
-
-		break;
-	default:
-		dev_err(COHC_2_DEV(cohc),
-			"bad runtimeconfig: alien address width\n");
-		return -EINVAL;
-	}
-
-	ctrl |= burst_sizes[i].reg;
-	dev_dbg(COHC_2_DEV(cohc),
-		"selected burst size %d bytes for address width %d bytes, maxburst %d\n",
-		burst_sizes[i].burst_8bit, addr_width, maxburst);
-
-	cohc->addr = addr;
-	cohc->ctrl = ctrl;
-
-	return 0;
-}
-
-static int coh901318_dma_slave_config(struct dma_chan *chan,
-					   struct dma_slave_config *config)
-{
-	struct coh901318_chan *cohc = to_coh901318_chan(chan);
-
-	memcpy(&cohc->config, config, sizeof(*config));
-
-	return 0;
-}
-
-static void coh901318_base_init(struct dma_device *dma, const int *pick_chans,
-				struct coh901318_base *base)
-{
-	int chans_i;
-	int i = 0;
-	struct coh901318_chan *cohc;
-
-	INIT_LIST_HEAD(&dma->channels);
-
-	for (chans_i = 0; pick_chans[chans_i] != -1; chans_i += 2) {
-		for (i = pick_chans[chans_i]; i <= pick_chans[chans_i+1]; i++) {
-			cohc = &base->chans[i];
-
-			cohc->base = base;
-			cohc->chan.device = dma;
-			cohc->id = i;
-
-			/* TODO: do we really need this lock if only one
-			 * client is connected to each channel?
-			 */
-
-			spin_lock_init(&cohc->lock);
-
-			cohc->nbr_active_done = 0;
-			cohc->busy = 0;
-			INIT_LIST_HEAD(&cohc->free);
-			INIT_LIST_HEAD(&cohc->active);
-			INIT_LIST_HEAD(&cohc->queue);
-
-			tasklet_setup(&cohc->tasklet, dma_tasklet);
-
-			list_add_tail(&cohc->chan.device_node,
-				      &dma->channels);
-		}
-	}
-}
-
-static int __init coh901318_probe(struct platform_device *pdev)
-{
-	int err = 0;
-	struct coh901318_base *base;
-	int irq;
-	struct resource *io;
-
-	io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!io)
-		return -ENODEV;
-
-	/* Map DMA controller registers to virtual memory */
-	if (devm_request_mem_region(&pdev->dev,
-				    io->start,
-				    resource_size(io),
-				    pdev->dev.driver->name) == NULL)
-		return -ENOMEM;
-
-	base = devm_kzalloc(&pdev->dev,
-			    ALIGN(sizeof(struct coh901318_base), 4) +
-			    U300_DMA_CHANNELS *
-			    sizeof(struct coh901318_chan),
-			    GFP_KERNEL);
-	if (!base)
-		return -ENOMEM;
-
-	base->chans = ((void *)base) + ALIGN(sizeof(struct coh901318_base), 4);
-
-	base->virtbase = devm_ioremap(&pdev->dev, io->start, resource_size(io));
-	if (!base->virtbase)
-		return -ENOMEM;
-
-	base->dev = &pdev->dev;
-	spin_lock_init(&base->pm.lock);
-	base->pm.started_channels = 0;
-
-	COH901318_DEBUGFS_ASSIGN(debugfs_dma_base, base);
-
-	irq = platform_get_irq(pdev, 0);
-	if (irq < 0)
-		return irq;
-
-	err = devm_request_irq(&pdev->dev, irq, dma_irq_handler, 0,
-			       "coh901318", base);
-	if (err)
-		return err;
-
-	base->irq = irq;
-
-	err = coh901318_pool_create(&base->pool, &pdev->dev,
-				    sizeof(struct coh901318_lli),
-				    32);
-	if (err)
-		return err;
-
-	/* init channels for device transfers */
-	coh901318_base_init(&base->dma_slave, dma_slave_channels,
-			    base);
-
-	dma_cap_zero(base->dma_slave.cap_mask);
-	dma_cap_set(DMA_SLAVE, base->dma_slave.cap_mask);
-
-	base->dma_slave.device_alloc_chan_resources = coh901318_alloc_chan_resources;
-	base->dma_slave.device_free_chan_resources = coh901318_free_chan_resources;
-	base->dma_slave.device_prep_slave_sg = coh901318_prep_slave_sg;
-	base->dma_slave.device_tx_status = coh901318_tx_status;
-	base->dma_slave.device_issue_pending = coh901318_issue_pending;
-	base->dma_slave.device_config = coh901318_dma_slave_config;
-	base->dma_slave.device_pause = coh901318_pause;
-	base->dma_slave.device_resume = coh901318_resume;
-	base->dma_slave.device_terminate_all = coh901318_terminate_all;
-	base->dma_slave.dev = &pdev->dev;
-
-	err = dma_async_device_register(&base->dma_slave);
-
-	if (err)
-		goto err_register_slave;
-
-	/* init channels for memcpy */
-	coh901318_base_init(&base->dma_memcpy, dma_memcpy_channels,
-			    base);
-
-	dma_cap_zero(base->dma_memcpy.cap_mask);
-	dma_cap_set(DMA_MEMCPY, base->dma_memcpy.cap_mask);
-
-	base->dma_memcpy.device_alloc_chan_resources = coh901318_alloc_chan_resources;
-	base->dma_memcpy.device_free_chan_resources = coh901318_free_chan_resources;
-	base->dma_memcpy.device_prep_dma_memcpy = coh901318_prep_memcpy;
-	base->dma_memcpy.device_tx_status = coh901318_tx_status;
-	base->dma_memcpy.device_issue_pending = coh901318_issue_pending;
-	base->dma_memcpy.device_config = coh901318_dma_slave_config;
-	base->dma_memcpy.device_pause = coh901318_pause;
-	base->dma_memcpy.device_resume = coh901318_resume;
-	base->dma_memcpy.device_terminate_all = coh901318_terminate_all;
-	base->dma_memcpy.dev = &pdev->dev;
-	/*
-	 * This controller can only access address at even 32bit boundaries,
-	 * i.e. 2^2
-	 */
-	base->dma_memcpy.copy_align = DMAENGINE_ALIGN_4_BYTES;
-	err = dma_async_device_register(&base->dma_memcpy);
-
-	if (err)
-		goto err_register_memcpy;
-
-	err = of_dma_controller_register(pdev->dev.of_node, coh901318_xlate,
-					 base);
-	if (err)
-		goto err_register_of_dma;
-
-	platform_set_drvdata(pdev, base);
-	dev_info(&pdev->dev, "Initialized COH901318 DMA on virtual base 0x%p\n",
-		base->virtbase);
-
-	return err;
-
- err_register_of_dma:
-	dma_async_device_unregister(&base->dma_memcpy);
- err_register_memcpy:
-	dma_async_device_unregister(&base->dma_slave);
- err_register_slave:
-	coh901318_pool_destroy(&base->pool);
-	return err;
-}
-static void coh901318_base_remove(struct coh901318_base *base, const int *pick_chans)
-{
-	int chans_i;
-	int i = 0;
-	struct coh901318_chan *cohc;
-
-	for (chans_i = 0; pick_chans[chans_i] != -1; chans_i += 2) {
-		for (i = pick_chans[chans_i]; i <= pick_chans[chans_i+1]; i++) {
-			cohc = &base->chans[i];
-
-			tasklet_kill(&cohc->tasklet);
-		}
-	}
-
-}
-
-static int coh901318_remove(struct platform_device *pdev)
-{
-	struct coh901318_base *base = platform_get_drvdata(pdev);
-
-	devm_free_irq(&pdev->dev, base->irq, base);
-
-	coh901318_base_remove(base, dma_slave_channels);
-	coh901318_base_remove(base, dma_memcpy_channels);
-
-	of_dma_controller_free(pdev->dev.of_node);
-	dma_async_device_unregister(&base->dma_memcpy);
-	dma_async_device_unregister(&base->dma_slave);
-	coh901318_pool_destroy(&base->pool);
-	return 0;
-}
-
-static const struct of_device_id coh901318_dt_match[] = {
-	{ .compatible = "stericsson,coh901318" },
-	{},
-};
-
-static struct platform_driver coh901318_driver = {
-	.remove = coh901318_remove,
-	.driver = {
-		.name	= "coh901318",
-		.of_match_table = coh901318_dt_match,
-	},
-};
-
-static int __init coh901318_init(void)
-{
-	return platform_driver_probe(&coh901318_driver, coh901318_probe);
-}
-subsys_initcall(coh901318_init);
-
-static void __exit coh901318_exit(void)
-{
-	platform_driver_unregister(&coh901318_driver);
-}
-module_exit(coh901318_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Per Friden");
diff --git a/drivers/dma/coh901318.h b/drivers/dma/coh901318.h
deleted file mode 100644
index bbf533600558..000000000000
--- a/drivers/dma/coh901318.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2007-2013 ST-Ericsson
- * DMA driver for COH 901 318
- * Author: Per Friden <per.friden@stericsson.com>
- */
-
-#ifndef COH901318_H
-#define COH901318_H
-
-#define MAX_DMA_PACKET_SIZE_SHIFT 11
-#define MAX_DMA_PACKET_SIZE (1 << MAX_DMA_PACKET_SIZE_SHIFT)
-
-struct device;
-
-struct coh901318_pool {
-	spinlock_t lock;
-	struct dma_pool *dmapool;
-	struct device *dev;
-
-#ifdef CONFIG_DEBUG_FS
-	int debugfs_pool_counter;
-#endif
-};
-
-/**
- * struct coh901318_lli - linked list item for DMAC
- * @control: control settings for DMAC
- * @src_addr: transfer source address
- * @dst_addr: transfer destination address
- * @link_addr:  physical address to next lli
- * @virt_link_addr: virtual address of next lli (only used by pool_free)
- * @phy_this: physical address of current lli (only used by pool_free)
- */
-struct coh901318_lli {
-	u32 control;
-	dma_addr_t src_addr;
-	dma_addr_t dst_addr;
-	dma_addr_t link_addr;
-
-	void *virt_link_addr;
-	dma_addr_t phy_this;
-};
-
-/**
- * coh901318_pool_create() - Creates an dma pool for lli:s
- * @pool: pool handle
- * @dev: dma device
- * @lli_nbr: number of lli:s in the pool
- * @algin: address alignemtn of lli:s
- * returns 0 on success otherwise none zero
- */
-int coh901318_pool_create(struct coh901318_pool *pool,
-			  struct device *dev,
-			  size_t lli_nbr, size_t align);
-
-/**
- * coh901318_pool_destroy() - Destroys the dma pool
- * @pool: pool handle
- * returns 0 on success otherwise none zero
- */
-int coh901318_pool_destroy(struct coh901318_pool *pool);
-
-/**
- * coh901318_lli_alloc() - Allocates a linked list
- *
- * @pool: pool handle
- * @len: length to list
- * return: none NULL if success otherwise NULL
- */
-struct coh901318_lli *
-coh901318_lli_alloc(struct coh901318_pool *pool,
-		    unsigned int len);
-
-/**
- * coh901318_lli_free() - Returns the linked list items to the pool
- * @pool: pool handle
- * @lli: reference to lli pointer to be freed
- */
-void coh901318_lli_free(struct coh901318_pool *pool,
-			struct coh901318_lli **lli);
-
-/**
- * coh901318_lli_fill_memcpy() - Prepares the lli:s for dma memcpy
- * @pool: pool handle
- * @lli: allocated lli
- * @src: src address
- * @size: transfer size
- * @dst: destination address
- * @ctrl_chained: ctrl for chained lli
- * @ctrl_last: ctrl for the last lli
- * returns number of CPU interrupts for the lli, negative on error.
- */
-int
-coh901318_lli_fill_memcpy(struct coh901318_pool *pool,
-			  struct coh901318_lli *lli,
-			  dma_addr_t src, unsigned int size,
-			  dma_addr_t dst, u32 ctrl_chained, u32 ctrl_last);
-
-/**
- * coh901318_lli_fill_single() - Prepares the lli:s for dma single transfer
- * @pool: pool handle
- * @lli: allocated lli
- * @buf: transfer buffer
- * @size: transfer size
- * @dev_addr: address of periphal
- * @ctrl_chained: ctrl for chained lli
- * @ctrl_last: ctrl for the last lli
- * @dir: direction of transfer (to or from device)
- * returns number of CPU interrupts for the lli, negative on error.
- */
-int
-coh901318_lli_fill_single(struct coh901318_pool *pool,
-			  struct coh901318_lli *lli,
-			  dma_addr_t buf, unsigned int size,
-			  dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl_last,
-			  enum dma_transfer_direction dir);
-
-/**
- * coh901318_lli_fill_single() - Prepares the lli:s for dma scatter list transfer
- * @pool: pool handle
- * @lli: allocated lli
- * @sg: scatter gather list
- * @nents: number of entries in sg
- * @dev_addr: address of periphal
- * @ctrl_chained: ctrl for chained lli
- * @ctrl: ctrl of middle lli
- * @ctrl_last: ctrl for the last lli
- * @dir: direction of transfer (to or from device)
- * @ctrl_irq_mask: ctrl mask for CPU interrupt
- * returns number of CPU interrupts for the lli, negative on error.
- */
-int
-coh901318_lli_fill_sg(struct coh901318_pool *pool,
-		      struct coh901318_lli *lli,
-		      struct scatterlist *sg, unsigned int nents,
-		      dma_addr_t dev_addr, u32 ctrl_chained,
-		      u32 ctrl, u32 ctrl_last,
-		      enum dma_transfer_direction dir, u32 ctrl_irq_mask);
-
-#endif /* COH901318_H */
diff --git a/drivers/dma/coh901318_lli.c b/drivers/dma/coh901318_lli.c
deleted file mode 100644
index 6b6c2fd0865a..000000000000
--- a/drivers/dma/coh901318_lli.c
+++ /dev/null
@@ -1,313 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * driver/dma/coh901318_lli.c
- *
- * Copyright (C) 2007-2009 ST-Ericsson
- * Support functions for handling lli for dma
- * Author: Per Friden <per.friden@stericsson.com>
- */
-
-#include <linux/spinlock.h>
-#include <linux/memory.h>
-#include <linux/gfp.h>
-#include <linux/dmapool.h>
-#include <linux/dmaengine.h>
-
-#include "coh901318.h"
-
-#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_U300_DEBUG))
-#define DEBUGFS_POOL_COUNTER_RESET(pool) (pool->debugfs_pool_counter = 0)
-#define DEBUGFS_POOL_COUNTER_ADD(pool, add) (pool->debugfs_pool_counter += add)
-#else
-#define DEBUGFS_POOL_COUNTER_RESET(pool)
-#define DEBUGFS_POOL_COUNTER_ADD(pool, add)
-#endif
-
-static struct coh901318_lli *
-coh901318_lli_next(struct coh901318_lli *data)
-{
-	if (data == NULL || data->link_addr == 0)
-		return NULL;
-
-	return (struct coh901318_lli *) data->virt_link_addr;
-}
-
-int coh901318_pool_create(struct coh901318_pool *pool,
-			  struct device *dev,
-			  size_t size, size_t align)
-{
-	spin_lock_init(&pool->lock);
-	pool->dev = dev;
-	pool->dmapool = dma_pool_create("lli_pool", dev, size, align, 0);
-
-	DEBUGFS_POOL_COUNTER_RESET(pool);
-	return 0;
-}
-
-int coh901318_pool_destroy(struct coh901318_pool *pool)
-{
-
-	dma_pool_destroy(pool->dmapool);
-	return 0;
-}
-
-struct coh901318_lli *
-coh901318_lli_alloc(struct coh901318_pool *pool, unsigned int len)
-{
-	int i;
-	struct coh901318_lli *head;
-	struct coh901318_lli *lli;
-	struct coh901318_lli *lli_prev;
-	dma_addr_t phy;
-
-	if (len == 0)
-		return NULL;
-
-	spin_lock(&pool->lock);
-
-	head = dma_pool_alloc(pool->dmapool, GFP_NOWAIT, &phy);
-
-	if (head == NULL)
-		goto err;
-
-	DEBUGFS_POOL_COUNTER_ADD(pool, 1);
-
-	lli = head;
-	lli->phy_this = phy;
-	lli->link_addr = 0x00000000;
-	lli->virt_link_addr = NULL;
-
-	for (i = 1; i < len; i++) {
-		lli_prev = lli;
-
-		lli = dma_pool_alloc(pool->dmapool, GFP_NOWAIT, &phy);
-
-		if (lli == NULL)
-			goto err_clean_up;
-
-		DEBUGFS_POOL_COUNTER_ADD(pool, 1);
-		lli->phy_this = phy;
-		lli->link_addr = 0x00000000;
-		lli->virt_link_addr = NULL;
-
-		lli_prev->link_addr = phy;
-		lli_prev->virt_link_addr = lli;
-	}
-
-	spin_unlock(&pool->lock);
-
-	return head;
-
- err:
-	spin_unlock(&pool->lock);
-	return NULL;
-
- err_clean_up:
-	lli_prev->link_addr = 0x00000000U;
-	spin_unlock(&pool->lock);
-	coh901318_lli_free(pool, &head);
-	return NULL;
-}
-
-void coh901318_lli_free(struct coh901318_pool *pool,
-			struct coh901318_lli **lli)
-{
-	struct coh901318_lli *l;
-	struct coh901318_lli *next;
-
-	if (lli == NULL)
-		return;
-
-	l = *lli;
-
-	if (l == NULL)
-		return;
-
-	spin_lock(&pool->lock);
-
-	while (l->link_addr) {
-		next = l->virt_link_addr;
-		dma_pool_free(pool->dmapool, l, l->phy_this);
-		DEBUGFS_POOL_COUNTER_ADD(pool, -1);
-		l = next;
-	}
-	dma_pool_free(pool->dmapool, l, l->phy_this);
-	DEBUGFS_POOL_COUNTER_ADD(pool, -1);
-
-	spin_unlock(&pool->lock);
-	*lli = NULL;
-}
-
-int
-coh901318_lli_fill_memcpy(struct coh901318_pool *pool,
-			  struct coh901318_lli *lli,
-			  dma_addr_t source, unsigned int size,
-			  dma_addr_t destination, u32 ctrl_chained,
-			  u32 ctrl_eom)
-{
-	int s = size;
-	dma_addr_t src = source;
-	dma_addr_t dst = destination;
-
-	lli->src_addr = src;
-	lli->dst_addr = dst;
-
-	while (lli->link_addr) {
-		lli->control = ctrl_chained | MAX_DMA_PACKET_SIZE;
-		lli->src_addr = src;
-		lli->dst_addr = dst;
-
-		s -= MAX_DMA_PACKET_SIZE;
-		lli = coh901318_lli_next(lli);
-
-		src += MAX_DMA_PACKET_SIZE;
-		dst += MAX_DMA_PACKET_SIZE;
-	}
-
-	lli->control = ctrl_eom | s;
-	lli->src_addr = src;
-	lli->dst_addr = dst;
-
-	return 0;
-}
-
-int
-coh901318_lli_fill_single(struct coh901318_pool *pool,
-			  struct coh901318_lli *lli,
-			  dma_addr_t buf, unsigned int size,
-			  dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl_eom,
-			  enum dma_transfer_direction dir)
-{
-	int s = size;
-	dma_addr_t src;
-	dma_addr_t dst;
-
-
-	if (dir == DMA_MEM_TO_DEV) {
-		src = buf;
-		dst = dev_addr;
-
-	} else if (dir == DMA_DEV_TO_MEM) {
-
-		src = dev_addr;
-		dst = buf;
-	} else {
-		return -EINVAL;
-	}
-
-	while (lli->link_addr) {
-		size_t block_size = MAX_DMA_PACKET_SIZE;
-		lli->control = ctrl_chained | MAX_DMA_PACKET_SIZE;
-
-		/* If we are on the next-to-final block and there will
-		 * be less than half a DMA packet left for the last
-		 * block, then we want to make this block a little
-		 * smaller to balance the sizes. This is meant to
-		 * avoid too small transfers if the buffer size is
-		 * (MAX_DMA_PACKET_SIZE*N + 1) */
-		if (s < (MAX_DMA_PACKET_SIZE + MAX_DMA_PACKET_SIZE/2))
-			block_size = MAX_DMA_PACKET_SIZE/2;
-
-		s -= block_size;
-		lli->src_addr = src;
-		lli->dst_addr = dst;
-
-		lli = coh901318_lli_next(lli);
-
-		if (dir == DMA_MEM_TO_DEV)
-			src += block_size;
-		else if (dir == DMA_DEV_TO_MEM)
-			dst += block_size;
-	}
-
-	lli->control = ctrl_eom | s;
-	lli->src_addr = src;
-	lli->dst_addr = dst;
-
-	return 0;
-}
-
-int
-coh901318_lli_fill_sg(struct coh901318_pool *pool,
-		      struct coh901318_lli *lli,
-		      struct scatterlist *sgl, unsigned int nents,
-		      dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl,
-		      u32 ctrl_last,
-		      enum dma_transfer_direction dir, u32 ctrl_irq_mask)
-{
-	int i;
-	struct scatterlist *sg;
-	u32 ctrl_sg;
-	dma_addr_t src = 0;
-	dma_addr_t dst = 0;
-	u32 bytes_to_transfer;
-	u32 elem_size;
-
-	if (lli == NULL)
-		goto err;
-
-	spin_lock(&pool->lock);
-
-	if (dir == DMA_MEM_TO_DEV)
-		dst = dev_addr;
-	else if (dir == DMA_DEV_TO_MEM)
-		src = dev_addr;
-	else
-		goto err;
-
-	for_each_sg(sgl, sg, nents, i) {
-		if (sg_is_chain(sg)) {
-			/* sg continues to the next sg-element don't
-			 * send ctrl_finish until the last
-			 * sg-element in the chain
-			 */
-			ctrl_sg = ctrl_chained;
-		} else if (i == nents - 1)
-			ctrl_sg = ctrl_last;
-		else
-			ctrl_sg = ctrl ? ctrl : ctrl_last;
-
-
-		if (dir == DMA_MEM_TO_DEV)
-			/* increment source address */
-			src = sg_dma_address(sg);
-		else
-			/* increment destination address */
-			dst = sg_dma_address(sg);
-
-		bytes_to_transfer = sg_dma_len(sg);
-
-		while (bytes_to_transfer) {
-			u32 val;
-
-			if (bytes_to_transfer > MAX_DMA_PACKET_SIZE) {
-				elem_size = MAX_DMA_PACKET_SIZE;
-				val = ctrl_chained;
-			} else {
-				elem_size = bytes_to_transfer;
-				val = ctrl_sg;
-			}
-
-			lli->control = val | elem_size;
-			lli->src_addr = src;
-			lli->dst_addr = dst;
-
-			if (dir == DMA_DEV_TO_MEM)
-				dst += elem_size;
-			else
-				src += elem_size;
-
-			BUG_ON(lli->link_addr & 3);
-
-			bytes_to_transfer -= elem_size;
-			lli = coh901318_lli_next(lli);
-		}
-
-	}
-	spin_unlock(&pool->lock);
-
-	return 0;
- err:
-	spin_unlock(&pool->lock);
-	return -EINVAL;
-}
diff --git a/include/linux/platform_data/dma-coh901318.h b/include/linux/platform_data/dma-coh901318.h
deleted file mode 100644
index 4cca529f8d56..000000000000
--- a/include/linux/platform_data/dma-coh901318.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Platform data for the COH901318 DMA controller
- * Copyright (C) 2007-2013 ST-Ericsson
- */
-
-#ifndef PLAT_COH901318_H
-#define PLAT_COH901318_H
-
-#ifdef CONFIG_COH901318
-
-/* We only support the U300 DMA channels */
-#define U300_DMA_MSL_TX_0		0
-#define U300_DMA_MSL_TX_1		1
-#define U300_DMA_MSL_TX_2		2
-#define U300_DMA_MSL_TX_3		3
-#define U300_DMA_MSL_TX_4		4
-#define U300_DMA_MSL_TX_5		5
-#define U300_DMA_MSL_TX_6		6
-#define U300_DMA_MSL_RX_0		7
-#define U300_DMA_MSL_RX_1		8
-#define U300_DMA_MSL_RX_2		9
-#define U300_DMA_MSL_RX_3		10
-#define U300_DMA_MSL_RX_4		11
-#define U300_DMA_MSL_RX_5		12
-#define U300_DMA_MSL_RX_6		13
-#define U300_DMA_MMCSD_RX_TX		14
-#define U300_DMA_MSPRO_TX		15
-#define U300_DMA_MSPRO_RX		16
-#define U300_DMA_UART0_TX		17
-#define U300_DMA_UART0_RX		18
-#define U300_DMA_APEX_TX		19
-#define U300_DMA_APEX_RX		20
-#define U300_DMA_PCM_I2S0_TX		21
-#define U300_DMA_PCM_I2S0_RX		22
-#define U300_DMA_PCM_I2S1_TX		23
-#define U300_DMA_PCM_I2S1_RX		24
-#define U300_DMA_XGAM_CDI		25
-#define U300_DMA_XGAM_PDI		26
-#define U300_DMA_SPI_TX			27
-#define U300_DMA_SPI_RX			28
-#define U300_DMA_GENERAL_PURPOSE_0	29
-#define U300_DMA_GENERAL_PURPOSE_1	30
-#define U300_DMA_GENERAL_PURPOSE_2	31
-#define U300_DMA_GENERAL_PURPOSE_3	32
-#define U300_DMA_GENERAL_PURPOSE_4	33
-#define U300_DMA_GENERAL_PURPOSE_5	34
-#define U300_DMA_GENERAL_PURPOSE_6	35
-#define U300_DMA_GENERAL_PURPOSE_7	36
-#define U300_DMA_GENERAL_PURPOSE_8	37
-#define U300_DMA_UART1_TX		38
-#define U300_DMA_UART1_RX		39
-
-#define U300_DMA_DEVICE_CHANNELS	32
-#define U300_DMA_CHANNELS		40
-
-/**
- * coh901318_filter_id() - DMA channel filter function
- * @chan: dma channel handle
- * @chan_id: id of dma channel to be filter out
- *
- * In dma_request_channel() it specifies what channel id to be requested
- */
-bool coh901318_filter_id(struct dma_chan *chan, void *chan_id);
-#else
-static inline bool coh901318_filter_id(struct dma_chan *chan, void *chan_id)
-{
-	return false;
-}
-#endif
-
-#endif /* PLAT_COH901318_H */
-- 
cgit v1.2.3


From e247f85a9bf6af9ce6bc36d86ac242f782ae0947 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Thu, 21 Jan 2021 12:03:54 +0100
Subject: dmaengine: mmp_pdma: Remove mmp_pdma_filter_fn()

It's not used anywhere -- drop it.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Link: https://lore.kernel.org/r/20210121110356.1768635-2-lkundrak@v3.sk
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/mmp_pdma.c       | 14 --------------
 include/linux/dma/mmp-pdma.h | 16 ----------------
 2 files changed, 30 deletions(-)
 delete mode 100644 include/linux/dma/mmp-pdma.h

(limited to 'include/linux')

diff --git a/drivers/dma/mmp_pdma.c b/drivers/dma/mmp_pdma.c
index b84303be8edf..89f1814ff27a 100644
--- a/drivers/dma/mmp_pdma.c
+++ b/drivers/dma/mmp_pdma.c
@@ -18,7 +18,6 @@
 #include <linux/of_device.h>
 #include <linux/of_dma.h>
 #include <linux/of.h>
-#include <linux/dma/mmp-pdma.h>
 
 #include "dmaengine.h"
 
@@ -1148,19 +1147,6 @@ static struct platform_driver mmp_pdma_driver = {
 	.remove		= mmp_pdma_remove,
 };
 
-bool mmp_pdma_filter_fn(struct dma_chan *chan, void *param)
-{
-	struct mmp_pdma_chan *c = to_mmp_pdma_chan(chan);
-
-	if (chan->device->dev->driver != &mmp_pdma_driver.driver)
-		return false;
-
-	c->drcmr = *(unsigned int *)param;
-
-	return true;
-}
-EXPORT_SYMBOL_GPL(mmp_pdma_filter_fn);
-
 module_platform_driver(mmp_pdma_driver);
 
 MODULE_DESCRIPTION("MARVELL MMP Peripheral DMA Driver");
diff --git a/include/linux/dma/mmp-pdma.h b/include/linux/dma/mmp-pdma.h
deleted file mode 100644
index 25cab62a28c4..000000000000
--- a/include/linux/dma/mmp-pdma.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _MMP_PDMA_H_
-#define _MMP_PDMA_H_
-
-struct dma_chan;
-
-#ifdef CONFIG_MMP_PDMA
-bool mmp_pdma_filter_fn(struct dma_chan *chan, void *param);
-#else
-static inline bool mmp_pdma_filter_fn(struct dma_chan *chan, void *param)
-{
-	return false;
-}
-#endif
-
-#endif /* _MMP_PDMA_H_ */
-- 
cgit v1.2.3


From fd765da06066215b5e75c5c2a0db1543a81aa14d Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 21 Jan 2021 08:18:12 +0100
Subject: staging: hikey9xx: hi6421v600-regulator: fix delay logic

The original driver, which can be seen at
commit 42f24d9d446a ("staging: regulator: add a regulator driver for HiSilicon 6421v600 SPMI PMIC")
had a complex logic to ensure that there won't be multiple power
enable/disable commands running at the same time. At the original
logic, it were ensured that:

- a next power up/down would wait for at least the on/off period;
- an extra delay would be granted. It turns that such extra delay
  has a value of zero, but it was relying on gettimeofday()
  call, which can take some time.

This was later simplified, but there are still some possible
issues. In order to avoid that, let's simply add a delay
to wait for the power up line to stabilize after powering up
a device.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/6733dac9813ba6688def404142cb7b964accf758.1611212783.git.mchehab+huawei@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/hikey9xx/hi6421v600-regulator.c | 5 +++--
 include/linux/mfd/hi6421-spmi-pmic.h            | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/hikey9xx/hi6421v600-regulator.c b/drivers/staging/hikey9xx/hi6421v600-regulator.c
index 5e78eebfc1f3..e5a492ee7121 100644
--- a/drivers/staging/hikey9xx/hi6421v600-regulator.c
+++ b/drivers/staging/hikey9xx/hi6421v600-regulator.c
@@ -113,14 +113,15 @@ static int hi6421_spmi_regulator_enable(struct regulator_dev *rdev)
 
 	/* cannot enable more than one regulator at one time */
 	mutex_lock(&sreg->enable_mutex);
-	usleep_range(HISI_REGS_ENA_PROTECT_TIME,
-		     HISI_REGS_ENA_PROTECT_TIME + 1000);
 
 	/* set enable register */
 	ret = hi6421_spmi_pmic_rmw(pmic, rdev->desc->enable_reg,
 				   rdev->desc->enable_mask,
 				   rdev->desc->enable_mask);
 
+	/* Avoid powering up multiple devices at the same time */
+	usleep_range(rdev->desc->off_on_delay, rdev->desc->off_on_delay + 60);
+
 	mutex_unlock(&sreg->enable_mutex);
 
 	return ret;
diff --git a/include/linux/mfd/hi6421-spmi-pmic.h b/include/linux/mfd/hi6421-spmi-pmic.h
index 2c8896fd852e..0c2214612c4e 100644
--- a/include/linux/mfd/hi6421-spmi-pmic.h
+++ b/include/linux/mfd/hi6421-spmi-pmic.h
@@ -13,7 +13,6 @@
 
 #include <linux/irqdomain.h>
 
-#define HISI_REGS_ENA_PROTECT_TIME	(0)	/* in microseconds */
 #define HISI_ECO_MODE_ENABLE		(1)
 #define HISI_ECO_MODE_DISABLE		(0)
 
-- 
cgit v1.2.3


From 7eecea89e44f64e60f1410483e79a0cab18ad580 Mon Sep 17 00:00:00 2001
From: Jorgen Hansen <jhansen@vmware.com>
Date: Wed, 20 Jan 2021 08:33:40 -0800
Subject: VMCI: Enforce queuepair max size for IOCTL_VMCI_QUEUEPAIR_ALLOC

When create the VMCI queue pair tracking data structures on the host
side, the IOCTL for creating the VMCI queue pair didn't validate
the queue pair size parameters. This change adds checks for this.

This avoids a memory allocation issue in qp_host_alloc_queue, as
reported by nslusarek@gmx.net. The check in qp_host_alloc_queue
has also been updated to enforce the maximum queue pair size
as defined by VMCI_MAX_GUEST_QP_MEMORY.

The fix has been verified using sample code supplied by
nslusarek@gmx.net.

Reported-by: nslusarek@gmx.net
Reviewed-by: Vishnu Dasa <vdasa@vmware.com>
Signed-off-by: Jorgen Hansen <jhansen@vmware.com>
Link: https://lore.kernel.org/r/1611160420-30573-1-git-send-email-jhansen@vmware.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/vmw_vmci/vmci_queue_pair.c | 12 ++++++++----
 include/linux/vmw_vmci_defs.h           |  4 ++--
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index 525ef96d3a07..d787ddecee77 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -237,7 +237,9 @@ static struct qp_list qp_guest_endpoints = {
 #define QPE_NUM_PAGES(_QPE) ((u32) \
 			     (DIV_ROUND_UP(_QPE.produce_size, PAGE_SIZE) + \
 			      DIV_ROUND_UP(_QPE.consume_size, PAGE_SIZE) + 2))
-
+#define QP_SIZES_ARE_VALID(_prod_qsize, _cons_qsize) \
+	((_prod_qsize) + (_cons_qsize) >= max(_prod_qsize, _cons_qsize) && \
+	 (_prod_qsize) + (_cons_qsize) <= VMCI_MAX_GUEST_QP_MEMORY)
 
 /*
  * Frees kernel VA space for a given queue and its queue header, and
@@ -528,7 +530,7 @@ static struct vmci_queue *qp_host_alloc_queue(u64 size)
 	u64 num_pages;
 	const size_t queue_size = sizeof(*queue) + sizeof(*(queue->kernel_if));
 
-	if (size > SIZE_MAX - PAGE_SIZE)
+	if (size > min_t(size_t, VMCI_MAX_GUEST_QP_MEMORY, SIZE_MAX - PAGE_SIZE))
 		return NULL;
 	num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1;
 	if (num_pages > (SIZE_MAX - queue_size) /
@@ -1929,6 +1931,9 @@ int vmci_qp_broker_alloc(struct vmci_handle handle,
 			 struct vmci_qp_page_store *page_store,
 			 struct vmci_ctx *context)
 {
+	if (!QP_SIZES_ARE_VALID(produce_size, consume_size))
+		return VMCI_ERROR_NO_RESOURCES;
+
 	return qp_broker_alloc(handle, peer, flags, priv_flags,
 			       produce_size, consume_size,
 			       page_store, context, NULL, NULL, NULL, NULL);
@@ -2685,8 +2690,7 @@ int vmci_qpair_alloc(struct vmci_qp **qpair,
 	 * used by the device is NO_RESOURCES, so use that here too.
 	 */
 
-	if (produce_qsize + consume_qsize < max(produce_qsize, consume_qsize) ||
-	    produce_qsize + consume_qsize > VMCI_MAX_GUEST_QP_MEMORY)
+	if (!QP_SIZES_ARE_VALID(produce_qsize, consume_qsize))
 		return VMCI_ERROR_NO_RESOURCES;
 
 	retval = vmci_route(&src, &dst, false, &route);
diff --git a/include/linux/vmw_vmci_defs.h b/include/linux/vmw_vmci_defs.h
index be0afe6f379b..e36cb114c188 100644
--- a/include/linux/vmw_vmci_defs.h
+++ b/include/linux/vmw_vmci_defs.h
@@ -66,7 +66,7 @@ enum {
  * consists of at least two pages, the memory limit also dictates the
  * number of queue pairs a guest can create.
  */
-#define VMCI_MAX_GUEST_QP_MEMORY (128 * 1024 * 1024)
+#define VMCI_MAX_GUEST_QP_MEMORY ((size_t)(128 * 1024 * 1024))
 #define VMCI_MAX_GUEST_QP_COUNT  (VMCI_MAX_GUEST_QP_MEMORY / PAGE_SIZE / 2)
 
 /*
@@ -80,7 +80,7 @@ enum {
  * too much kernel memory (especially on vmkernel).  We limit a queuepair to
  * 32 KB, or 16 KB per queue for symmetrical pairs.
  */
-#define VMCI_MAX_PINNED_QP_MEMORY (32 * 1024)
+#define VMCI_MAX_PINNED_QP_MEMORY ((size_t)(32 * 1024))
 
 /*
  * We have a fixed set of resource IDs available in the VMX.
-- 
cgit v1.2.3


From e9103f47bf1a1bbf0ab0ea90eda3e208653a5f57 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 25 Jan 2021 17:02:38 +0200
Subject: serial: ifx6x60: Remove driver for deprecated platform

Intel Moorestown and Medfield are quite old Intel Atom based
32-bit platforms, which were in limited use in some Android phones,
tablets and consumer electronics more than eight years ago.

There are no bugs or problems ever reported outside from Intel
for breaking any of that platforms for years. It seems no real
users exists who run more or less fresh kernel on it. The commit
05f4434bc130 ("ASoC: Intel: remove mfld_machine") also in align
with this theory.

Due to above and to reduce a burden of supporting outdated drivers
we remove the support of outdated platforms completely.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210125150238.16980-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/Kconfig    |    7 -
 drivers/tty/serial/Makefile   |    1 -
 drivers/tty/serial/ifx6x60.c  | 1387 -----------------------------------------
 drivers/tty/serial/ifx6x60.h  |  118 ----
 include/linux/spi/ifx_modem.h |   15 -
 5 files changed, 1528 deletions(-)
 delete mode 100644 drivers/tty/serial/ifx6x60.c
 delete mode 100644 drivers/tty/serial/ifx6x60.h
 delete mode 100644 include/linux/spi/ifx_modem.h

(limited to 'include/linux')

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index 22e86ba4f827..0c4cd4a348f4 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1182,13 +1182,6 @@ config SERIAL_ALTERA_UART_CONSOLE
 	help
 	  Enable a Altera UART port to be the system console.
 
-config SERIAL_IFX6X60
-	tristate "SPI protocol driver for Infineon 6x60 modem (EXPERIMENTAL)"
-	depends on GPIOLIB || COMPILE_TEST
-	depends on SPI && HAS_DMA
-	help
-	  Support for the IFX6x60 modem devices on Intel MID platforms.
-
 config SERIAL_PCH_UART
 	tristate "Intel EG20T PCH/LAPIS Semicon IOH(ML7213/ML7223/ML7831) UART"
 	depends on PCI && (X86_32 || MIPS ||  COMPILE_TEST)
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index 931dc1d6885e..7da0856cd198 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -64,7 +64,6 @@ obj-$(CONFIG_SERIAL_TIMBERDALE)	+= timbuart.o
 obj-$(CONFIG_SERIAL_GRLIB_GAISLER_APBUART) += apbuart.o
 obj-$(CONFIG_SERIAL_ALTERA_JTAGUART) += altera_jtaguart.o
 obj-$(CONFIG_SERIAL_VT8500) += vt8500_serial.o
-obj-$(CONFIG_SERIAL_IFX6X60)  	+= ifx6x60.o
 obj-$(CONFIG_SERIAL_PCH_UART)	+= pch_uart.o
 obj-$(CONFIG_SERIAL_MXS_AUART) += mxs-auart.o
 obj-$(CONFIG_SERIAL_LANTIQ)	+= lantiq.o
diff --git a/drivers/tty/serial/ifx6x60.c b/drivers/tty/serial/ifx6x60.c
deleted file mode 100644
index d4ef88ee22d0..000000000000
--- a/drivers/tty/serial/ifx6x60.c
+++ /dev/null
@@ -1,1387 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/****************************************************************************
- *
- * Driver for the IFX 6x60 spi modem.
- *
- * Copyright (C) 2008 Option International
- * Copyright (C) 2008 Filip Aben <f.aben@option.com>
- *		      Denis Joseph Barrow <d.barow@option.com>
- *		      Jan Dumon <j.dumon@option.com>
- *
- * Copyright (C) 2009, 2010 Intel Corp
- * Russ Gorby <russ.gorby@intel.com>
- *
- * Driver modified by Intel from Option gtm501l_spi.c
- *
- * Notes
- * o	The driver currently assumes a single device only. If you need to
- *	change this then look for saved_ifx_dev and add a device lookup
- * o	The driver is intended to be big-endian safe but has never been
- *	tested that way (no suitable hardware). There are a couple of FIXME
- *	notes by areas that may need addressing
- * o	Some of the GPIO naming/setup assumptions may need revisiting if
- *	you need to use this driver for another platform.
- *
- *****************************************************************************/
-#include <linux/dma-mapping.h>
-#include <linux/module.h>
-#include <linux/termios.h>
-#include <linux/tty.h>
-#include <linux/device.h>
-#include <linux/spi/spi.h>
-#include <linux/kfifo.h>
-#include <linux/tty_flip.h>
-#include <linux/timer.h>
-#include <linux/serial.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/rfkill.h>
-#include <linux/fs.h>
-#include <linux/ip.h>
-#include <linux/dmapool.h>
-#include <linux/gpio/consumer.h>
-#include <linux/sched.h>
-#include <linux/time.h>
-#include <linux/wait.h>
-#include <linux/pm.h>
-#include <linux/pm_runtime.h>
-#include <linux/spi/ifx_modem.h>
-#include <linux/delay.h>
-#include <linux/reboot.h>
-
-#include "ifx6x60.h"
-
-#define IFX_SPI_MORE_MASK		0x10
-#define IFX_SPI_MORE_BIT		4	/* bit position in u8 */
-#define IFX_SPI_CTS_BIT			6	/* bit position in u8 */
-#define IFX_SPI_MODE			SPI_MODE_1
-#define IFX_SPI_TTY_ID			0
-#define IFX_SPI_TIMEOUT_SEC		2
-#define IFX_SPI_HEADER_0		(-1)
-#define IFX_SPI_HEADER_F		(-2)
-
-#define PO_POST_DELAY		200
-
-/* forward reference */
-static void ifx_spi_handle_srdy(struct ifx_spi_device *ifx_dev);
-static int ifx_modem_reboot_callback(struct notifier_block *nfb,
-				unsigned long event, void *data);
-static int ifx_modem_power_off(struct ifx_spi_device *ifx_dev);
-
-/* local variables */
-static int spi_bpw = 16;		/* 8, 16 or 32 bit word length */
-static struct tty_driver *tty_drv;
-static struct ifx_spi_device *saved_ifx_dev;
-static struct lock_class_key ifx_spi_key;
-
-static struct notifier_block ifx_modem_reboot_notifier_block = {
-	.notifier_call = ifx_modem_reboot_callback,
-};
-
-static int ifx_modem_power_off(struct ifx_spi_device *ifx_dev)
-{
-	gpiod_set_value(ifx_dev->gpio.pmu_reset, 1);
-	msleep(PO_POST_DELAY);
-
-	return 0;
-}
-
-static int ifx_modem_reboot_callback(struct notifier_block *nfb,
-				 unsigned long event, void *data)
-{
-	if (saved_ifx_dev)
-		ifx_modem_power_off(saved_ifx_dev);
-	else
-		pr_warn("no ifx modem active;\n");
-
-	return NOTIFY_OK;
-}
-
-/* GPIO/GPE settings */
-
-/**
- *	mrdy_set_high		-	set MRDY GPIO
- *	@ifx: device we are controlling
- *
- */
-static inline void mrdy_set_high(struct ifx_spi_device *ifx)
-{
-	gpiod_set_value(ifx->gpio.mrdy, 1);
-}
-
-/**
- *	mrdy_set_low		-	clear MRDY GPIO
- *	@ifx: device we are controlling
- *
- */
-static inline void mrdy_set_low(struct ifx_spi_device *ifx)
-{
-	gpiod_set_value(ifx->gpio.mrdy, 0);
-}
-
-/**
- *	ifx_spi_power_state_set
- *	@ifx_dev: our SPI device
- *	@val: bits to set
- *
- *	Set bit in power status and signal power system if status becomes non-0
- */
-static void
-ifx_spi_power_state_set(struct ifx_spi_device *ifx_dev, unsigned char val)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ifx_dev->power_lock, flags);
-
-	/*
-	 * if power status is already non-0, just update, else
-	 * tell power system
-	 */
-	if (!ifx_dev->power_status)
-		pm_runtime_get(&ifx_dev->spi_dev->dev);
-	ifx_dev->power_status |= val;
-
-	spin_unlock_irqrestore(&ifx_dev->power_lock, flags);
-}
-
-/**
- *	ifx_spi_power_state_clear	-	clear power bit
- *	@ifx_dev: our SPI device
- *	@val: bits to clear
- *
- *	clear bit in power status and signal power system if status becomes 0
- */
-static void
-ifx_spi_power_state_clear(struct ifx_spi_device *ifx_dev, unsigned char val)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ifx_dev->power_lock, flags);
-
-	if (ifx_dev->power_status) {
-		ifx_dev->power_status &= ~val;
-		if (!ifx_dev->power_status)
-			pm_runtime_put(&ifx_dev->spi_dev->dev);
-	}
-
-	spin_unlock_irqrestore(&ifx_dev->power_lock, flags);
-}
-
-/**
- *	swap_buf_8
- *	@buf: our buffer
- *	@len : number of bytes (not words) in the buffer
- *	@end: end of buffer
- *
- *	Swap the contents of a buffer into big endian format
- */
-static inline void swap_buf_8(unsigned char *buf, int len, void *end)
-{
-	/* don't swap buffer if SPI word width is 8 bits */
-	return;
-}
-
-/**
- *	swap_buf_16
- *	@buf: our buffer
- *	@len : number of bytes (not words) in the buffer
- *	@end: end of buffer
- *
- *	Swap the contents of a buffer into big endian format
- */
-static inline void swap_buf_16(unsigned char *buf, int len, void *end)
-{
-	int n;
-
-	u16 *buf_16 = (u16 *)buf;
-	len = ((len + 1) >> 1);
-	if ((void *)&buf_16[len] > end) {
-		pr_err("swap_buf_16: swap exceeds boundary (%p > %p)!",
-		       &buf_16[len], end);
-		return;
-	}
-	for (n = 0; n < len; n++) {
-		*buf_16 = cpu_to_be16(*buf_16);
-		buf_16++;
-	}
-}
-
-/**
- *	swap_buf_32
- *	@buf: our buffer
- *	@len : number of bytes (not words) in the buffer
- *	@end: end of buffer
- *
- *	Swap the contents of a buffer into big endian format
- */
-static inline void swap_buf_32(unsigned char *buf, int len, void *end)
-{
-	int n;
-
-	u32 *buf_32 = (u32 *)buf;
-	len = (len + 3) >> 2;
-
-	if ((void *)&buf_32[len] > end) {
-		pr_err("swap_buf_32: swap exceeds boundary (%p > %p)!\n",
-		       &buf_32[len], end);
-		return;
-	}
-	for (n = 0; n < len; n++) {
-		*buf_32 = cpu_to_be32(*buf_32);
-		buf_32++;
-	}
-}
-
-/**
- *	mrdy_assert		-	assert MRDY line
- *	@ifx_dev: our SPI device
- *
- *	Assert mrdy and set timer to wait for SRDY interrupt, if SRDY is low
- *	now.
- *
- *	FIXME: Can SRDY even go high as we are running this code ?
- */
-static void mrdy_assert(struct ifx_spi_device *ifx_dev)
-{
-	int val = gpiod_get_value(ifx_dev->gpio.srdy);
-	if (!val) {
-		if (!test_and_set_bit(IFX_SPI_STATE_TIMER_PENDING,
-				      &ifx_dev->flags)) {
-			mod_timer(&ifx_dev->spi_timer,jiffies + IFX_SPI_TIMEOUT_SEC*HZ);
-
-		}
-	}
-	ifx_spi_power_state_set(ifx_dev, IFX_SPI_POWER_DATA_PENDING);
-	mrdy_set_high(ifx_dev);
-}
-
-/**
- *	ifx_spi_timeout		-	SPI timeout
- *	@t: timer in our SPI device
- *
- *	The SPI has timed out: hang up the tty. Users will then see a hangup
- *	and error events.
- */
-static void ifx_spi_timeout(struct timer_list *t)
-{
-	struct ifx_spi_device *ifx_dev = from_timer(ifx_dev, t, spi_timer);
-
-	dev_warn(&ifx_dev->spi_dev->dev, "*** SPI Timeout ***");
-	tty_port_tty_hangup(&ifx_dev->tty_port, false);
-	mrdy_set_low(ifx_dev);
-	clear_bit(IFX_SPI_STATE_TIMER_PENDING, &ifx_dev->flags);
-}
-
-/* char/tty operations */
-
-/**
- *	ifx_spi_tiocmget	-	get modem lines
- *	@tty: our tty device
- *
- *	Map the signal state into Linux modem flags and report the value
- *	in Linux terms
- */
-static int ifx_spi_tiocmget(struct tty_struct *tty)
-{
-	unsigned int value;
-	struct ifx_spi_device *ifx_dev = tty->driver_data;
-
-	value =
-	(test_bit(IFX_SPI_RTS, &ifx_dev->signal_state) ? TIOCM_RTS : 0) |
-	(test_bit(IFX_SPI_DTR, &ifx_dev->signal_state) ? TIOCM_DTR : 0) |
-	(test_bit(IFX_SPI_CTS, &ifx_dev->signal_state) ? TIOCM_CTS : 0) |
-	(test_bit(IFX_SPI_DSR, &ifx_dev->signal_state) ? TIOCM_DSR : 0) |
-	(test_bit(IFX_SPI_DCD, &ifx_dev->signal_state) ? TIOCM_CAR : 0) |
-	(test_bit(IFX_SPI_RI, &ifx_dev->signal_state) ? TIOCM_RNG : 0);
-	return value;
-}
-
-/**
- *	ifx_spi_tiocmset	-	set modem bits
- *	@tty: the tty structure
- *	@set: bits to set
- *	@clear: bits to clear
- *
- *	The IFX6x60 only supports DTR and RTS. Set them accordingly
- *	and flag that an update to the modem is needed.
- *
- *	FIXME: do we need to kick the tranfers when we do this ?
- */
-static int ifx_spi_tiocmset(struct tty_struct *tty,
-			    unsigned int set, unsigned int clear)
-{
-	struct ifx_spi_device *ifx_dev = tty->driver_data;
-
-	if (set & TIOCM_RTS)
-		set_bit(IFX_SPI_RTS, &ifx_dev->signal_state);
-	if (set & TIOCM_DTR)
-		set_bit(IFX_SPI_DTR, &ifx_dev->signal_state);
-	if (clear & TIOCM_RTS)
-		clear_bit(IFX_SPI_RTS, &ifx_dev->signal_state);
-	if (clear & TIOCM_DTR)
-		clear_bit(IFX_SPI_DTR, &ifx_dev->signal_state);
-
-	set_bit(IFX_SPI_UPDATE, &ifx_dev->signal_state);
-	return 0;
-}
-
-/**
- *	ifx_spi_open	-	called on tty open
- *	@tty: our tty device
- *	@filp: file handle being associated with the tty
- *
- *	Open the tty interface. We let the tty_port layer do all the work
- *	for us.
- *
- *	FIXME: Remove single device assumption and saved_ifx_dev
- */
-static int ifx_spi_open(struct tty_struct *tty, struct file *filp)
-{
-	return tty_port_open(&saved_ifx_dev->tty_port, tty, filp);
-}
-
-/**
- *	ifx_spi_close	-	called when our tty closes
- *	@tty: the tty being closed
- *	@filp: the file handle being closed
- *
- *	Perform the close of the tty. We use the tty_port layer to do all
- *	our hard work.
- */
-static void ifx_spi_close(struct tty_struct *tty, struct file *filp)
-{
-	struct ifx_spi_device *ifx_dev = tty->driver_data;
-	tty_port_close(&ifx_dev->tty_port, tty, filp);
-	/* FIXME: should we do an ifx_spi_reset here ? */
-}
-
-/**
- *	ifx_decode_spi_header	-	decode received header
- *	@buffer: the received data
- *	@length: decoded length
- *	@more: decoded more flag
- *	@received_cts: status of cts we received
- *
- *	Note how received_cts is handled -- if header is all F it is left
- *	the same as it was, if header is all 0 it is set to 0 otherwise it is
- *	taken from the incoming header.
- *
- *	FIXME: endianness
- */
-static int ifx_spi_decode_spi_header(unsigned char *buffer, int *length,
-			unsigned char *more, unsigned char *received_cts)
-{
-	u16 h1;
-	u16 h2;
-	u16 *in_buffer = (u16 *)buffer;
-
-	h1 = *in_buffer;
-	h2 = *(in_buffer+1);
-
-	if (h1 == 0 && h2 == 0) {
-		*received_cts = 0;
-		*more = 0;
-		return IFX_SPI_HEADER_0;
-	} else if (h1 == 0xffff && h2 == 0xffff) {
-		*more = 0;
-		/* spi_slave_cts remains as it was */
-		return IFX_SPI_HEADER_F;
-	}
-
-	*length = h1 & 0xfff;	/* upper bits of byte are flags */
-	*more = (buffer[1] >> IFX_SPI_MORE_BIT) & 1;
-	*received_cts = (buffer[3] >> IFX_SPI_CTS_BIT) & 1;
-	return 0;
-}
-
-/**
- *	ifx_setup_spi_header	-	set header fields
- *	@txbuffer: pointer to start of SPI buffer
- *	@tx_count: bytes
- *	@more: indicate if more to follow
- *
- *	Format up an SPI header for a transfer
- *
- *	FIXME: endianness?
- */
-static void ifx_spi_setup_spi_header(unsigned char *txbuffer, int tx_count,
-					unsigned char more)
-{
-	*(u16 *)(txbuffer) = tx_count;
-	*(u16 *)(txbuffer+2) = IFX_SPI_PAYLOAD_SIZE;
-	txbuffer[1] |= (more << IFX_SPI_MORE_BIT) & IFX_SPI_MORE_MASK;
-}
-
-/**
- *	ifx_spi_prepare_tx_buffer	-	prepare transmit frame
- *	@ifx_dev: our SPI device
- *
- *	The transmit buffr needs a header and various other bits of
- *	information followed by as much data as we can pull from the FIFO
- *	and transfer. This function formats up a suitable buffer in the
- *	ifx_dev->tx_buffer
- *
- *	FIXME: performance - should we wake the tty when the queue is half
- *			     empty ?
- */
-static int ifx_spi_prepare_tx_buffer(struct ifx_spi_device *ifx_dev)
-{
-	int temp_count;
-	int queue_length;
-	int tx_count;
-	unsigned char *tx_buffer;
-
-	tx_buffer = ifx_dev->tx_buffer;
-
-	/* make room for required SPI header */
-	tx_buffer += IFX_SPI_HEADER_OVERHEAD;
-	tx_count = IFX_SPI_HEADER_OVERHEAD;
-
-	/* clear to signal no more data if this turns out to be the
-	 * last buffer sent in a sequence */
-	ifx_dev->spi_more = 0;
-
-	/* if modem cts is set, just send empty buffer */
-	if (!ifx_dev->spi_slave_cts) {
-		/* see if there's tx data */
-		queue_length = kfifo_len(&ifx_dev->tx_fifo);
-		if (queue_length != 0) {
-			/* data to mux -- see if there's room for it */
-			temp_count = min(queue_length, IFX_SPI_PAYLOAD_SIZE);
-			temp_count = kfifo_out_locked(&ifx_dev->tx_fifo,
-					tx_buffer, temp_count,
-					&ifx_dev->fifo_lock);
-
-			/* update buffer pointer and data count in message */
-			tx_buffer += temp_count;
-			tx_count += temp_count;
-			if (temp_count == queue_length)
-				/* poke port to get more data */
-				tty_port_tty_wakeup(&ifx_dev->tty_port);
-			else /* more data in port, use next SPI message */
-				ifx_dev->spi_more = 1;
-		}
-	}
-	/* have data and info for header -- set up SPI header in buffer */
-	/* spi header needs payload size, not entire buffer size */
-	ifx_spi_setup_spi_header(ifx_dev->tx_buffer,
-					tx_count-IFX_SPI_HEADER_OVERHEAD,
-					ifx_dev->spi_more);
-	/* swap actual data in the buffer */
-	ifx_dev->swap_buf((ifx_dev->tx_buffer), tx_count,
-		&ifx_dev->tx_buffer[IFX_SPI_TRANSFER_SIZE]);
-	return tx_count;
-}
-
-/**
- *	ifx_spi_write		-	line discipline write
- *	@tty: our tty device
- *	@buf: pointer to buffer to write (kernel space)
- *	@count: size of buffer
- *
- *	Write the characters we have been given into the FIFO. If the device
- *	is not active then activate it, when the SRDY line is asserted back
- *	this will commence I/O
- */
-static int ifx_spi_write(struct tty_struct *tty, const unsigned char *buf,
-			 int count)
-{
-	struct ifx_spi_device *ifx_dev = tty->driver_data;
-	unsigned char *tmp_buf = (unsigned char *)buf;
-	unsigned long flags;
-	bool is_fifo_empty;
-	int tx_count;
-
-	spin_lock_irqsave(&ifx_dev->fifo_lock, flags);
-	is_fifo_empty = kfifo_is_empty(&ifx_dev->tx_fifo);
-	tx_count = kfifo_in(&ifx_dev->tx_fifo, tmp_buf, count);
-	spin_unlock_irqrestore(&ifx_dev->fifo_lock, flags);
-	if (is_fifo_empty)
-		mrdy_assert(ifx_dev);
-
-	return tx_count;
-}
-
-/**
- *	ifx_spi_chars_in_buffer	-	line discipline helper
- *	@tty: our tty device
- *
- *	Report how much data we can accept before we drop bytes. As we use
- *	a simple FIFO this is nice and easy.
- */
-static int ifx_spi_write_room(struct tty_struct *tty)
-{
-	struct ifx_spi_device *ifx_dev = tty->driver_data;
-	return IFX_SPI_FIFO_SIZE - kfifo_len(&ifx_dev->tx_fifo);
-}
-
-/**
- *	ifx_spi_chars_in_buffer	-	line discipline helper
- *	@tty: our tty device
- *
- *	Report how many characters we have buffered. In our case this is the
- *	number of bytes sitting in our transmit FIFO.
- */
-static int ifx_spi_chars_in_buffer(struct tty_struct *tty)
-{
-	struct ifx_spi_device *ifx_dev = tty->driver_data;
-	return kfifo_len(&ifx_dev->tx_fifo);
-}
-
-/**
- *	ifx_port_hangup
- *	@tty: our tty
- *
- *	tty port hang up. Called when tty_hangup processing is invoked either
- *	by loss of carrier, or by software (eg vhangup). Serialized against
- *	activate/shutdown by the tty layer.
- */
-static void ifx_spi_hangup(struct tty_struct *tty)
-{
-	struct ifx_spi_device *ifx_dev = tty->driver_data;
-	tty_port_hangup(&ifx_dev->tty_port);
-}
-
-/**
- *	ifx_port_activate
- *	@port: our tty port
- *	@tty: our tty device
- *
- *	tty port activate method - called for first open. Serialized
- *	with hangup and shutdown by the tty layer.
- */
-static int ifx_port_activate(struct tty_port *port, struct tty_struct *tty)
-{
-	struct ifx_spi_device *ifx_dev =
-		container_of(port, struct ifx_spi_device, tty_port);
-
-	/* clear any old data; can't do this in 'close' */
-	kfifo_reset(&ifx_dev->tx_fifo);
-
-	/* clear any flag which may be set in port shutdown procedure */
-	clear_bit(IFX_SPI_STATE_IO_IN_PROGRESS, &ifx_dev->flags);
-	clear_bit(IFX_SPI_STATE_IO_READY, &ifx_dev->flags);
-
-	/* put port data into this tty */
-	tty->driver_data = ifx_dev;
-
-	/* set flag to allows data transfer */
-	set_bit(IFX_SPI_STATE_IO_AVAILABLE, &ifx_dev->flags);
-
-	return 0;
-}
-
-/**
- *	ifx_port_shutdown
- *	@port: our tty port
- *
- *	tty port shutdown method - called for last port close. Serialized
- *	with hangup and activate by the tty layer.
- */
-static void ifx_port_shutdown(struct tty_port *port)
-{
-	struct ifx_spi_device *ifx_dev =
-		container_of(port, struct ifx_spi_device, tty_port);
-
-	clear_bit(IFX_SPI_STATE_IO_AVAILABLE, &ifx_dev->flags);
-	mrdy_set_low(ifx_dev);
-	del_timer(&ifx_dev->spi_timer);
-	clear_bit(IFX_SPI_STATE_TIMER_PENDING, &ifx_dev->flags);
-	tasklet_kill(&ifx_dev->io_work_tasklet);
-}
-
-static const struct tty_port_operations ifx_tty_port_ops = {
-	.activate = ifx_port_activate,
-	.shutdown = ifx_port_shutdown,
-};
-
-static const struct tty_operations ifx_spi_serial_ops = {
-	.open = ifx_spi_open,
-	.close = ifx_spi_close,
-	.write = ifx_spi_write,
-	.hangup = ifx_spi_hangup,
-	.write_room = ifx_spi_write_room,
-	.chars_in_buffer = ifx_spi_chars_in_buffer,
-	.tiocmget = ifx_spi_tiocmget,
-	.tiocmset = ifx_spi_tiocmset,
-};
-
-/**
- *	ifx_spi_insert_fip_string	-	queue received data
- *	@ifx_dev: our SPI device
- *	@chars: buffer we have received
- *	@size: number of chars reeived
- *
- *	Queue bytes to the tty assuming the tty side is currently open. If
- *	not the discard the data.
- */
-static void ifx_spi_insert_flip_string(struct ifx_spi_device *ifx_dev,
-				    unsigned char *chars, size_t size)
-{
-	tty_insert_flip_string(&ifx_dev->tty_port, chars, size);
-	tty_flip_buffer_push(&ifx_dev->tty_port);
-}
-
-/**
- *	ifx_spi_complete	-	SPI transfer completed
- *	@ctx: our SPI device
- *
- *	An SPI transfer has completed. Process any received data and kick off
- *	any further transmits we can commence.
- */
-static void ifx_spi_complete(void *ctx)
-{
-	struct ifx_spi_device *ifx_dev = ctx;
-	int length;
-	int actual_length;
-	unsigned char more = 0;
-	unsigned char cts;
-	int local_write_pending = 0;
-	int queue_length;
-	int srdy;
-	int decode_result;
-
-	mrdy_set_low(ifx_dev);
-
-	if (!ifx_dev->spi_msg.status) {
-		/* check header validity, get comm flags */
-		ifx_dev->swap_buf(ifx_dev->rx_buffer, IFX_SPI_HEADER_OVERHEAD,
-			&ifx_dev->rx_buffer[IFX_SPI_HEADER_OVERHEAD]);
-		decode_result = ifx_spi_decode_spi_header(ifx_dev->rx_buffer,
-				&length, &more, &cts);
-		if (decode_result == IFX_SPI_HEADER_0) {
-			dev_dbg(&ifx_dev->spi_dev->dev,
-				"ignore input: invalid header 0");
-			ifx_dev->spi_slave_cts = 0;
-			goto complete_exit;
-		} else if (decode_result == IFX_SPI_HEADER_F) {
-			dev_dbg(&ifx_dev->spi_dev->dev,
-				"ignore input: invalid header F");
-			goto complete_exit;
-		}
-
-		ifx_dev->spi_slave_cts = cts;
-
-		actual_length = min((unsigned int)length,
-					ifx_dev->spi_msg.actual_length);
-		ifx_dev->swap_buf(
-			(ifx_dev->rx_buffer + IFX_SPI_HEADER_OVERHEAD),
-			 actual_length,
-			 &ifx_dev->rx_buffer[IFX_SPI_TRANSFER_SIZE]);
-		ifx_spi_insert_flip_string(
-			ifx_dev,
-			ifx_dev->rx_buffer + IFX_SPI_HEADER_OVERHEAD,
-			(size_t)actual_length);
-	} else {
-		more = 0;
-		dev_dbg(&ifx_dev->spi_dev->dev, "SPI transfer error %d",
-		       ifx_dev->spi_msg.status);
-	}
-
-complete_exit:
-	if (ifx_dev->write_pending) {
-		ifx_dev->write_pending = 0;
-		local_write_pending = 1;
-	}
-
-	clear_bit(IFX_SPI_STATE_IO_IN_PROGRESS, &(ifx_dev->flags));
-
-	queue_length = kfifo_len(&ifx_dev->tx_fifo);
-	srdy = gpiod_get_value(ifx_dev->gpio.srdy);
-	if (!srdy)
-		ifx_spi_power_state_clear(ifx_dev, IFX_SPI_POWER_SRDY);
-
-	/* schedule output if there is more to do */
-	if (test_and_clear_bit(IFX_SPI_STATE_IO_READY, &ifx_dev->flags))
-		tasklet_schedule(&ifx_dev->io_work_tasklet);
-	else {
-		if (more || ifx_dev->spi_more || queue_length > 0 ||
-			local_write_pending) {
-			if (ifx_dev->spi_slave_cts) {
-				if (more)
-					mrdy_assert(ifx_dev);
-			} else
-				mrdy_assert(ifx_dev);
-		} else {
-			/*
-			 * poke line discipline driver if any for more data
-			 * may or may not get more data to write
-			 * for now, say not busy
-			 */
-			ifx_spi_power_state_clear(ifx_dev,
-						  IFX_SPI_POWER_DATA_PENDING);
-			tty_port_tty_wakeup(&ifx_dev->tty_port);
-		}
-	}
-}
-
-/**
- *	ifx_spio_io		-	I/O tasklet
- *	@t: tasklet construct used to fetch the SPI device
- *
- *	Queue data for transmission if possible and then kick off the
- *	transfer.
- */
-static void ifx_spi_io(struct tasklet_struct *t)
-{
-	int retval;
-	struct ifx_spi_device *ifx_dev = from_tasklet(ifx_dev, t,
-						      io_work_tasklet);
-
-	if (!test_and_set_bit(IFX_SPI_STATE_IO_IN_PROGRESS, &ifx_dev->flags) &&
-		test_bit(IFX_SPI_STATE_IO_AVAILABLE, &ifx_dev->flags)) {
-		if (ifx_dev->gpio.unack_srdy_int_nb > 0)
-			ifx_dev->gpio.unack_srdy_int_nb--;
-
-		ifx_spi_prepare_tx_buffer(ifx_dev);
-
-		spi_message_init(&ifx_dev->spi_msg);
-		INIT_LIST_HEAD(&ifx_dev->spi_msg.queue);
-
-		ifx_dev->spi_msg.context = ifx_dev;
-		ifx_dev->spi_msg.complete = ifx_spi_complete;
-
-		/* set up our spi transfer */
-		/* note len is BYTES, not transfers */
-		ifx_dev->spi_xfer.len = IFX_SPI_TRANSFER_SIZE;
-		ifx_dev->spi_xfer.cs_change = 0;
-		ifx_dev->spi_xfer.speed_hz = ifx_dev->spi_dev->max_speed_hz;
-		/* ifx_dev->spi_xfer.speed_hz = 390625; */
-		ifx_dev->spi_xfer.bits_per_word =
-			ifx_dev->spi_dev->bits_per_word;
-
-		ifx_dev->spi_xfer.tx_buf = ifx_dev->tx_buffer;
-		ifx_dev->spi_xfer.rx_buf = ifx_dev->rx_buffer;
-
-		/*
-		 * setup dma pointers
-		 */
-		if (ifx_dev->use_dma) {
-			ifx_dev->spi_msg.is_dma_mapped = 1;
-			ifx_dev->tx_dma = ifx_dev->tx_bus;
-			ifx_dev->rx_dma = ifx_dev->rx_bus;
-			ifx_dev->spi_xfer.tx_dma = ifx_dev->tx_dma;
-			ifx_dev->spi_xfer.rx_dma = ifx_dev->rx_dma;
-		} else {
-			ifx_dev->spi_msg.is_dma_mapped = 0;
-			ifx_dev->tx_dma = (dma_addr_t)0;
-			ifx_dev->rx_dma = (dma_addr_t)0;
-			ifx_dev->spi_xfer.tx_dma = (dma_addr_t)0;
-			ifx_dev->spi_xfer.rx_dma = (dma_addr_t)0;
-		}
-
-		spi_message_add_tail(&ifx_dev->spi_xfer, &ifx_dev->spi_msg);
-
-		/* Assert MRDY. This may have already been done by the write
-		 * routine.
-		 */
-		mrdy_assert(ifx_dev);
-
-		retval = spi_async(ifx_dev->spi_dev, &ifx_dev->spi_msg);
-		if (retval) {
-			clear_bit(IFX_SPI_STATE_IO_IN_PROGRESS,
-				  &ifx_dev->flags);
-			tasklet_schedule(&ifx_dev->io_work_tasklet);
-			return;
-		}
-	} else
-		ifx_dev->write_pending = 1;
-}
-
-/**
- *	ifx_spi_free_port	-	free up the tty side
- *	@ifx_dev: IFX device going away
- *
- *	Unregister and free up a port when the device goes away
- */
-static void ifx_spi_free_port(struct ifx_spi_device *ifx_dev)
-{
-	if (ifx_dev->tty_dev)
-		tty_unregister_device(tty_drv, ifx_dev->minor);
-	tty_port_destroy(&ifx_dev->tty_port);
-	kfifo_free(&ifx_dev->tx_fifo);
-}
-
-/**
- *	ifx_spi_create_port	-	create a new port
- *	@ifx_dev: our spi device
- *
- *	Allocate and initialise the tty port that goes with this interface
- *	and add it to the tty layer so that it can be opened.
- */
-static int ifx_spi_create_port(struct ifx_spi_device *ifx_dev)
-{
-	int ret = 0;
-	struct tty_port *pport = &ifx_dev->tty_port;
-
-	spin_lock_init(&ifx_dev->fifo_lock);
-	lockdep_set_class_and_subclass(&ifx_dev->fifo_lock,
-		&ifx_spi_key, 0);
-
-	if (kfifo_alloc(&ifx_dev->tx_fifo, IFX_SPI_FIFO_SIZE, GFP_KERNEL)) {
-		ret = -ENOMEM;
-		goto error_ret;
-	}
-
-	tty_port_init(pport);
-	pport->ops = &ifx_tty_port_ops;
-	ifx_dev->minor = IFX_SPI_TTY_ID;
-	ifx_dev->tty_dev = tty_port_register_device(pport, tty_drv,
-			ifx_dev->minor, &ifx_dev->spi_dev->dev);
-	if (IS_ERR(ifx_dev->tty_dev)) {
-		dev_dbg(&ifx_dev->spi_dev->dev,
-			"%s: registering tty device failed", __func__);
-		ret = PTR_ERR(ifx_dev->tty_dev);
-		goto error_port;
-	}
-	return 0;
-
-error_port:
-	tty_port_destroy(pport);
-error_ret:
-	ifx_spi_free_port(ifx_dev);
-	return ret;
-}
-
-/**
- *	ifx_spi_handle_srdy		-	handle SRDY
- *	@ifx_dev: device asserting SRDY
- *
- *	Check our device state and see what we need to kick off when SRDY
- *	is asserted. This usually means killing the timer and firing off the
- *	I/O processing.
- */
-static void ifx_spi_handle_srdy(struct ifx_spi_device *ifx_dev)
-{
-	if (test_bit(IFX_SPI_STATE_TIMER_PENDING, &ifx_dev->flags)) {
-		del_timer(&ifx_dev->spi_timer);
-		clear_bit(IFX_SPI_STATE_TIMER_PENDING, &ifx_dev->flags);
-	}
-
-	ifx_spi_power_state_set(ifx_dev, IFX_SPI_POWER_SRDY);
-
-	if (!test_bit(IFX_SPI_STATE_IO_IN_PROGRESS, &ifx_dev->flags))
-		tasklet_schedule(&ifx_dev->io_work_tasklet);
-	else
-		set_bit(IFX_SPI_STATE_IO_READY, &ifx_dev->flags);
-}
-
-/**
- *	ifx_spi_srdy_interrupt	-	SRDY asserted
- *	@irq: our IRQ number
- *	@dev: our ifx device
- *
- *	The modem asserted SRDY. Handle the srdy event
- */
-static irqreturn_t ifx_spi_srdy_interrupt(int irq, void *dev)
-{
-	struct ifx_spi_device *ifx_dev = dev;
-	ifx_dev->gpio.unack_srdy_int_nb++;
-	ifx_spi_handle_srdy(ifx_dev);
-	return IRQ_HANDLED;
-}
-
-/**
- *	ifx_spi_reset_interrupt	-	Modem has changed reset state
- *	@irq: interrupt number
- *	@dev: our device pointer
- *
- *	The modem has either entered or left reset state. Check the GPIO
- *	line to see which.
- *
- *	FIXME: review locking on MR_INPROGRESS versus
- *	parallel unsolicited reset/solicited reset
- */
-static irqreturn_t ifx_spi_reset_interrupt(int irq, void *dev)
-{
-	struct ifx_spi_device *ifx_dev = dev;
-	int val = gpiod_get_value(ifx_dev->gpio.reset_out);
-	int solreset = test_bit(MR_START, &ifx_dev->mdm_reset_state);
-
-	if (val == 0) {
-		/* entered reset */
-		set_bit(MR_INPROGRESS, &ifx_dev->mdm_reset_state);
-		if (!solreset) {
-			/* unsolicited reset  */
-			tty_port_tty_hangup(&ifx_dev->tty_port, false);
-		}
-	} else {
-		/* exited reset */
-		clear_bit(MR_INPROGRESS, &ifx_dev->mdm_reset_state);
-		if (solreset) {
-			set_bit(MR_COMPLETE, &ifx_dev->mdm_reset_state);
-			wake_up(&ifx_dev->mdm_reset_wait);
-		}
-	}
-	return IRQ_HANDLED;
-}
-
-/**
- *	ifx_spi_free_device - free device
- *	@ifx_dev: device to free
- *
- *	Free the IFX device
- */
-static void ifx_spi_free_device(struct ifx_spi_device *ifx_dev)
-{
-	ifx_spi_free_port(ifx_dev);
-	dma_free_coherent(&ifx_dev->spi_dev->dev,
-				IFX_SPI_TRANSFER_SIZE,
-				ifx_dev->tx_buffer,
-				ifx_dev->tx_bus);
-	dma_free_coherent(&ifx_dev->spi_dev->dev,
-				IFX_SPI_TRANSFER_SIZE,
-				ifx_dev->rx_buffer,
-				ifx_dev->rx_bus);
-}
-
-/**
- *	ifx_spi_reset	-	reset modem
- *	@ifx_dev: modem to reset
- *
- *	Perform a reset on the modem
- */
-static int ifx_spi_reset(struct ifx_spi_device *ifx_dev)
-{
-	int ret;
-	/*
-	 * set up modem power, reset
-	 *
-	 * delays are required on some platforms for the modem
-	 * to reset properly
-	 */
-	set_bit(MR_START, &ifx_dev->mdm_reset_state);
-	gpiod_set_value(ifx_dev->gpio.po, 0);
-	gpiod_set_value(ifx_dev->gpio.reset, 0);
-	msleep(25);
-	gpiod_set_value(ifx_dev->gpio.reset, 1);
-	msleep(1);
-	gpiod_set_value(ifx_dev->gpio.po, 1);
-	msleep(1);
-	gpiod_set_value(ifx_dev->gpio.po, 0);
-	ret = wait_event_timeout(ifx_dev->mdm_reset_wait,
-				 test_bit(MR_COMPLETE,
-					  &ifx_dev->mdm_reset_state),
-				 IFX_RESET_TIMEOUT);
-	if (!ret)
-		dev_warn(&ifx_dev->spi_dev->dev, "Modem reset timeout: (state:%lx)",
-			 ifx_dev->mdm_reset_state);
-
-	ifx_dev->mdm_reset_state = 0;
-	return ret;
-}
-
-/**
- *	ifx_spi_spi_probe	-	probe callback
- *	@spi: our possible matching SPI device
- *
- *	Probe for a 6x60 modem on SPI bus. Perform any needed device and
- *	GPIO setup.
- *
- *	FIXME:
- *	-	Support for multiple devices
- *	-	Split out MID specific GPIO handling eventually
- */
-
-static int ifx_spi_spi_probe(struct spi_device *spi)
-{
-	int ret;
-	int srdy;
-	struct ifx_modem_platform_data *pl_data;
-	struct ifx_spi_device *ifx_dev;
-	struct device *dev = &spi->dev;
-
-	if (saved_ifx_dev) {
-		dev_dbg(dev, "ignoring subsequent detection");
-		return -ENODEV;
-	}
-
-	pl_data = dev_get_platdata(dev);
-	if (!pl_data) {
-		dev_err(dev, "missing platform data!");
-		return -ENODEV;
-	}
-
-	/* initialize structure to hold our device variables */
-	ifx_dev = kzalloc(sizeof(struct ifx_spi_device), GFP_KERNEL);
-	if (!ifx_dev) {
-		dev_err(dev, "spi device allocation failed");
-		return -ENOMEM;
-	}
-	saved_ifx_dev = ifx_dev;
-	ifx_dev->spi_dev = spi;
-	clear_bit(IFX_SPI_STATE_IO_IN_PROGRESS, &ifx_dev->flags);
-	spin_lock_init(&ifx_dev->write_lock);
-	spin_lock_init(&ifx_dev->power_lock);
-	ifx_dev->power_status = 0;
-	timer_setup(&ifx_dev->spi_timer, ifx_spi_timeout, 0);
-	ifx_dev->modem = pl_data->modem_type;
-	ifx_dev->use_dma = pl_data->use_dma;
-	ifx_dev->max_hz = pl_data->max_hz;
-	/* initialize spi mode, etc */
-	spi->max_speed_hz = ifx_dev->max_hz;
-	spi->mode = IFX_SPI_MODE | (SPI_LOOP & spi->mode);
-	spi->bits_per_word = spi_bpw;
-	ret = spi_setup(spi);
-	if (ret) {
-		dev_err(dev, "SPI setup wasn't successful %d", ret);
-		kfree(ifx_dev);
-		return -ENODEV;
-	}
-
-	/* init swap_buf function according to word width configuration */
-	if (spi->bits_per_word == 32)
-		ifx_dev->swap_buf = swap_buf_32;
-	else if (spi->bits_per_word == 16)
-		ifx_dev->swap_buf = swap_buf_16;
-	else
-		ifx_dev->swap_buf = swap_buf_8;
-
-	/* ensure SPI protocol flags are initialized to enable transfer */
-	ifx_dev->spi_more = 0;
-	ifx_dev->spi_slave_cts = 0;
-
-	/*initialize transfer and dma buffers */
-	ifx_dev->tx_buffer = dma_alloc_coherent(ifx_dev->spi_dev->dev.parent,
-				IFX_SPI_TRANSFER_SIZE,
-				&ifx_dev->tx_bus,
-				GFP_KERNEL);
-	if (!ifx_dev->tx_buffer) {
-		dev_err(dev, "DMA-TX buffer allocation failed");
-		ret = -ENOMEM;
-		goto error_ret;
-	}
-	ifx_dev->rx_buffer = dma_alloc_coherent(ifx_dev->spi_dev->dev.parent,
-				IFX_SPI_TRANSFER_SIZE,
-				&ifx_dev->rx_bus,
-				GFP_KERNEL);
-	if (!ifx_dev->rx_buffer) {
-		dev_err(dev, "DMA-RX buffer allocation failed");
-		ret = -ENOMEM;
-		goto error_ret;
-	}
-
-	/* initialize waitq for modem reset */
-	init_waitqueue_head(&ifx_dev->mdm_reset_wait);
-
-	spi_set_drvdata(spi, ifx_dev);
-	tasklet_setup(&ifx_dev->io_work_tasklet, ifx_spi_io);
-
-	set_bit(IFX_SPI_STATE_PRESENT, &ifx_dev->flags);
-
-	/* create our tty port */
-	ret = ifx_spi_create_port(ifx_dev);
-	if (ret != 0) {
-		dev_err(dev, "create default tty port failed");
-		goto error_ret;
-	}
-
-	ifx_dev->gpio.reset = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW);
-	if (IS_ERR(ifx_dev->gpio.reset)) {
-		dev_err(dev, "could not obtain reset GPIO\n");
-		ret = PTR_ERR(ifx_dev->gpio.reset);
-		goto error_ret;
-	}
-	gpiod_set_consumer_name(ifx_dev->gpio.reset, "ifxModem reset");
-	ifx_dev->gpio.po = devm_gpiod_get(dev, "power", GPIOD_OUT_LOW);
-	if (IS_ERR(ifx_dev->gpio.po)) {
-		dev_err(dev, "could not obtain power GPIO\n");
-		ret = PTR_ERR(ifx_dev->gpio.po);
-		goto error_ret;
-	}
-	gpiod_set_consumer_name(ifx_dev->gpio.po, "ifxModem power");
-	ifx_dev->gpio.mrdy = devm_gpiod_get(dev, "mrdy", GPIOD_OUT_LOW);
-	if (IS_ERR(ifx_dev->gpio.mrdy)) {
-		dev_err(dev, "could not obtain mrdy GPIO\n");
-		ret = PTR_ERR(ifx_dev->gpio.mrdy);
-		goto error_ret;
-	}
-	gpiod_set_consumer_name(ifx_dev->gpio.mrdy, "ifxModem mrdy");
-	ifx_dev->gpio.srdy = devm_gpiod_get(dev, "srdy", GPIOD_IN);
-	if (IS_ERR(ifx_dev->gpio.srdy)) {
-		dev_err(dev, "could not obtain srdy GPIO\n");
-		ret = PTR_ERR(ifx_dev->gpio.srdy);
-		goto error_ret;
-	}
-	gpiod_set_consumer_name(ifx_dev->gpio.srdy, "ifxModem srdy");
-	ifx_dev->gpio.reset_out = devm_gpiod_get(dev, "rst_out", GPIOD_IN);
-	if (IS_ERR(ifx_dev->gpio.reset_out)) {
-		dev_err(dev, "could not obtain rst_out GPIO\n");
-		ret = PTR_ERR(ifx_dev->gpio.reset_out);
-		goto error_ret;
-	}
-	gpiod_set_consumer_name(ifx_dev->gpio.reset_out, "ifxModem reset out");
-	ifx_dev->gpio.pmu_reset = devm_gpiod_get(dev, "pmu_reset", GPIOD_ASIS);
-	if (IS_ERR(ifx_dev->gpio.pmu_reset)) {
-		dev_err(dev, "could not obtain pmu_reset GPIO\n");
-		ret = PTR_ERR(ifx_dev->gpio.pmu_reset);
-		goto error_ret;
-	}
-	gpiod_set_consumer_name(ifx_dev->gpio.pmu_reset, "ifxModem PMU reset");
-
-	ret = request_irq(gpiod_to_irq(ifx_dev->gpio.reset_out),
-			  ifx_spi_reset_interrupt,
-			  IRQF_TRIGGER_RISING|IRQF_TRIGGER_FALLING, DRVNAME,
-			  ifx_dev);
-	if (ret) {
-		dev_err(dev, "Unable to get irq %x\n",
-			gpiod_to_irq(ifx_dev->gpio.reset_out));
-		goto error_ret;
-	}
-
-	ret = ifx_spi_reset(ifx_dev);
-
-	ret = request_irq(gpiod_to_irq(ifx_dev->gpio.srdy),
-			  ifx_spi_srdy_interrupt, IRQF_TRIGGER_RISING, DRVNAME,
-			  ifx_dev);
-	if (ret) {
-		dev_err(dev, "Unable to get irq %x",
-			gpiod_to_irq(ifx_dev->gpio.srdy));
-		goto error_ret2;
-	}
-
-	/* set pm runtime power state and register with power system */
-	pm_runtime_set_active(dev);
-	pm_runtime_enable(dev);
-
-	/* handle case that modem is already signaling SRDY */
-	/* no outgoing tty open at this point, this just satisfies the
-	 * modem's read and should reset communication properly
-	 */
-	srdy = gpiod_get_value(ifx_dev->gpio.srdy);
-
-	if (srdy) {
-		mrdy_assert(ifx_dev);
-		ifx_spi_handle_srdy(ifx_dev);
-	} else
-		mrdy_set_low(ifx_dev);
-	return 0;
-
-error_ret2:
-	free_irq(gpiod_to_irq(ifx_dev->gpio.reset_out), ifx_dev);
-error_ret:
-	ifx_spi_free_device(ifx_dev);
-	saved_ifx_dev = NULL;
-	return ret;
-}
-
-/**
- *	ifx_spi_spi_remove	-	SPI device was removed
- *	@spi: SPI device
- *
- *	FIXME: We should be shutting the device down here not in
- *	the module unload path.
- */
-
-static int ifx_spi_spi_remove(struct spi_device *spi)
-{
-	struct ifx_spi_device *ifx_dev = spi_get_drvdata(spi);
-	/* stop activity */
-	tasklet_kill(&ifx_dev->io_work_tasklet);
-
-	pm_runtime_disable(&spi->dev);
-
-	/* free irq */
-	free_irq(gpiod_to_irq(ifx_dev->gpio.reset_out), ifx_dev);
-	free_irq(gpiod_to_irq(ifx_dev->gpio.srdy), ifx_dev);
-
-	/* free allocations */
-	ifx_spi_free_device(ifx_dev);
-
-	saved_ifx_dev = NULL;
-	return 0;
-}
-
-/**
- *	ifx_spi_spi_shutdown	-	called on SPI shutdown
- *	@spi: SPI device
- *
- *	No action needs to be taken here
- */
-
-static void ifx_spi_spi_shutdown(struct spi_device *spi)
-{
-	struct ifx_spi_device *ifx_dev = spi_get_drvdata(spi);
-
-	ifx_modem_power_off(ifx_dev);
-}
-
-/*
- * various suspends and resumes have nothing to do
- * no hardware to save state for
- */
-
-/**
- *	ifx_spi_pm_suspend	-	suspend modem on system suspend
- *	@dev: device being suspended
- *
- *	Suspend the modem. No action needed on Intel MID platforms, may
- *	need extending for other systems.
- */
-static int ifx_spi_pm_suspend(struct device *dev)
-{
-	return 0;
-}
-
-/**
- *	ifx_spi_pm_resume	-	resume modem on system resume
- *	@dev: device being suspended
- *
- *	Allow the modem to resume. No action needed.
- *
- *	FIXME: do we need to reset anything here ?
- */
-static int ifx_spi_pm_resume(struct device *dev)
-{
-	return 0;
-}
-
-/**
- *	ifx_spi_pm_runtime_resume	-	suspend modem
- *	@dev: device being suspended
- *
- *	Allow the modem to resume. No action needed.
- */
-static int ifx_spi_pm_runtime_resume(struct device *dev)
-{
-	return 0;
-}
-
-/**
- *	ifx_spi_pm_runtime_suspend	-	suspend modem
- *	@dev: device being suspended
- *
- *	Allow the modem to suspend and thus suspend to continue up the
- *	device tree.
- */
-static int ifx_spi_pm_runtime_suspend(struct device *dev)
-{
-	return 0;
-}
-
-/**
- *	ifx_spi_pm_runtime_idle		-	check if modem idle
- *	@dev: our device
- *
- *	Check conditions and queue runtime suspend if idle.
- */
-static int ifx_spi_pm_runtime_idle(struct device *dev)
-{
-	struct spi_device *spi = to_spi_device(dev);
-	struct ifx_spi_device *ifx_dev = spi_get_drvdata(spi);
-
-	if (!ifx_dev->power_status)
-		pm_runtime_suspend(dev);
-
-	return 0;
-}
-
-static const struct dev_pm_ops ifx_spi_pm = {
-	.resume = ifx_spi_pm_resume,
-	.suspend = ifx_spi_pm_suspend,
-	.runtime_resume = ifx_spi_pm_runtime_resume,
-	.runtime_suspend = ifx_spi_pm_runtime_suspend,
-	.runtime_idle = ifx_spi_pm_runtime_idle
-};
-
-static const struct spi_device_id ifx_id_table[] = {
-	{"ifx6160", 0},
-	{"ifx6260", 0},
-	{ }
-};
-MODULE_DEVICE_TABLE(spi, ifx_id_table);
-
-/* spi operations */
-static struct spi_driver ifx_spi_driver = {
-	.driver = {
-		.name = DRVNAME,
-		.pm = &ifx_spi_pm,
-	},
-	.probe = ifx_spi_spi_probe,
-	.shutdown = ifx_spi_spi_shutdown,
-	.remove = ifx_spi_spi_remove,
-	.id_table = ifx_id_table
-};
-
-/**
- *	ifx_spi_exit	-	module exit
- *
- *	Unload the module.
- */
-
-static void __exit ifx_spi_exit(void)
-{
-	/* unregister */
-	spi_unregister_driver(&ifx_spi_driver);
-	tty_unregister_driver(tty_drv);
-	put_tty_driver(tty_drv);
-	unregister_reboot_notifier(&ifx_modem_reboot_notifier_block);
-}
-
-/**
- *	ifx_spi_init		-	module entry point
- *
- *	Initialise the SPI and tty interfaces for the IFX SPI driver
- *	We need to initialize upper-edge spi driver after the tty
- *	driver because otherwise the spi probe will race
- */
-
-static int __init ifx_spi_init(void)
-{
-	int result;
-
-	tty_drv = alloc_tty_driver(1);
-	if (!tty_drv) {
-		pr_err("%s: alloc_tty_driver failed", DRVNAME);
-		return -ENOMEM;
-	}
-
-	tty_drv->driver_name = DRVNAME;
-	tty_drv->name = TTYNAME;
-	tty_drv->minor_start = IFX_SPI_TTY_ID;
-	tty_drv->type = TTY_DRIVER_TYPE_SERIAL;
-	tty_drv->subtype = SERIAL_TYPE_NORMAL;
-	tty_drv->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
-	tty_drv->init_termios = tty_std_termios;
-
-	tty_set_operations(tty_drv, &ifx_spi_serial_ops);
-
-	result = tty_register_driver(tty_drv);
-	if (result) {
-		pr_err("%s: tty_register_driver failed(%d)",
-			DRVNAME, result);
-		goto err_free_tty;
-	}
-
-	result = spi_register_driver(&ifx_spi_driver);
-	if (result) {
-		pr_err("%s: spi_register_driver failed(%d)",
-			DRVNAME, result);
-		goto err_unreg_tty;
-	}
-
-	result = register_reboot_notifier(&ifx_modem_reboot_notifier_block);
-	if (result) {
-		pr_err("%s: register ifx modem reboot notifier failed(%d)",
-			DRVNAME, result);
-		goto err_unreg_spi;
-	}
-
-	return 0;
-err_unreg_spi:
-	spi_unregister_driver(&ifx_spi_driver);
-err_unreg_tty:
-	tty_unregister_driver(tty_drv);
-err_free_tty:
-	put_tty_driver(tty_drv);
-
-	return result;
-}
-
-module_init(ifx_spi_init);
-module_exit(ifx_spi_exit);
-
-MODULE_AUTHOR("Intel");
-MODULE_DESCRIPTION("IFX6x60 spi driver");
-MODULE_LICENSE("GPL");
-MODULE_INFO(Version, "0.1-IFX6x60");
diff --git a/drivers/tty/serial/ifx6x60.h b/drivers/tty/serial/ifx6x60.h
deleted file mode 100644
index ecb841d928a7..000000000000
--- a/drivers/tty/serial/ifx6x60.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/****************************************************************************
- *
- * Driver for the IFX spi modem.
- *
- * Copyright (C) 2009, 2010 Intel Corp
- * Jim Stanley <jim.stanley@intel.com>
- *
- *****************************************************************************/
-#ifndef _IFX6X60_H
-#define _IFX6X60_H
-
-struct gpio_desc;
-
-#define DRVNAME				"ifx6x60"
-#define TTYNAME				"ttyIFX"
-
-#define IFX_SPI_MAX_MINORS		1
-#define IFX_SPI_TRANSFER_SIZE		2048
-#define IFX_SPI_FIFO_SIZE		4096
-
-#define IFX_SPI_HEADER_OVERHEAD		4
-#define IFX_RESET_TIMEOUT		msecs_to_jiffies(50)
-
-/* device flags bitfield definitions */
-#define IFX_SPI_STATE_PRESENT		0
-#define IFX_SPI_STATE_IO_IN_PROGRESS	1
-#define IFX_SPI_STATE_IO_READY		2
-#define IFX_SPI_STATE_TIMER_PENDING	3
-#define IFX_SPI_STATE_IO_AVAILABLE	4
-
-/* flow control bitfields */
-#define IFX_SPI_DCD			0
-#define IFX_SPI_CTS			1
-#define IFX_SPI_DSR			2
-#define IFX_SPI_RI			3
-#define IFX_SPI_DTR			4
-#define IFX_SPI_RTS			5
-#define IFX_SPI_TX_FC			6
-#define IFX_SPI_RX_FC			7
-#define IFX_SPI_UPDATE			8
-
-#define IFX_SPI_PAYLOAD_SIZE		(IFX_SPI_TRANSFER_SIZE - \
-						IFX_SPI_HEADER_OVERHEAD)
-
-#define IFX_SPI_IRQ_TYPE		DETECT_EDGE_RISING
-#define IFX_SPI_GPIO_TARGET		0
-#define IFX_SPI_GPIO0			0x105
-
-#define IFX_SPI_STATUS_TIMEOUT		(2000*HZ)
-
-/* values for bits in power status byte */
-#define IFX_SPI_POWER_DATA_PENDING	1
-#define IFX_SPI_POWER_SRDY		2
-
-struct ifx_spi_device {
-	/* Our SPI device */
-	struct spi_device *spi_dev;
-
-	/* Port specific data */
-	struct kfifo tx_fifo;
-	spinlock_t fifo_lock;
-	unsigned long signal_state;
-
-	/* TTY Layer logic */
-	struct tty_port tty_port;
-	struct device *tty_dev;
-	int minor;
-
-	/* Low level I/O work */
-	struct tasklet_struct io_work_tasklet;
-	unsigned long flags;
-	dma_addr_t rx_dma;
-	dma_addr_t tx_dma;
-
-	int modem;		/* Modem type */
-	int use_dma;		/* provide dma-able addrs in SPI msg */
-	long max_hz;		/* max SPI frequency */
-
-	spinlock_t write_lock;
-	int write_pending;
-	spinlock_t power_lock;
-	unsigned char power_status;
-
-	unsigned char *rx_buffer;
-	unsigned char *tx_buffer;
-	dma_addr_t rx_bus;
-	dma_addr_t tx_bus;
-	unsigned char spi_more;
-	unsigned char spi_slave_cts;
-
-	struct timer_list spi_timer;
-
-	struct spi_message spi_msg;
-	struct spi_transfer spi_xfer;
-
-	struct {
-		/* gpio lines */
-		struct gpio_desc *srdy;		/* slave-ready gpio */
-		struct gpio_desc *mrdy;		/* master-ready gpio */
-		struct gpio_desc *reset;	/* modem-reset gpio */
-		struct gpio_desc *po;		/* modem-on gpio */
-		struct gpio_desc *reset_out;	/* modem-in-reset gpio */
-		struct gpio_desc *pmu_reset;	/* PMU reset gpio */
-		/* state/stats */
-		int unack_srdy_int_nb;
-	} gpio;
-
-	/* modem reset */
-	unsigned long mdm_reset_state;
-#define MR_START	0
-#define MR_INPROGRESS	1
-#define MR_COMPLETE	2
-	wait_queue_head_t mdm_reset_wait;
-	void (*swap_buf)(unsigned char *buf, int len, void *end);
-};
-
-#endif /* _IFX6X60_H */
diff --git a/include/linux/spi/ifx_modem.h b/include/linux/spi/ifx_modem.h
deleted file mode 100644
index 6d19b09139d0..000000000000
--- a/include/linux/spi/ifx_modem.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LINUX_IFX_MODEM_H
-#define LINUX_IFX_MODEM_H
-
-struct ifx_modem_platform_data {
-	unsigned short tx_pwr;		/* modem power threshold */
-	unsigned char modem_type;	/* Modem type */
-	unsigned long max_hz;		/* max SPI frequency */
-	unsigned short use_dma:1;	/* spi protocol driver supplies
-					   dma-able addrs */
-};
-#define IFX_MODEM_6160	1
-#define IFX_MODEM_6260	2
-
-#endif
-- 
cgit v1.2.3


From 529b56a854c5a8bd5e6f045cdcbbcd21a1616fbd Mon Sep 17 00:00:00 2001
From: Daniel Scally <djrscally@gmail.com>
Date: Thu, 7 Jan 2021 14:28:30 +0100
Subject: media: device property: Define format macros for ports and endpoints

OF, ACPI and software_nodes all implement graphs including nodes for ports
and endpoints. These are all intended to be named with a common schema,
as "port@n" and "endpoint@n" where n is an unsigned int representing the
index of the node. To ensure commonality across the subsystems, provide a
set of macros to define the format.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Daniel Scally <djrscally@gmail.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/linux/fwnode.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index fde4ad97564c..77414e431e89 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -50,6 +50,13 @@ struct fwnode_endpoint {
 	const struct fwnode_handle *local_fwnode;
 };
 
+/*
+ * ports and endpoints defined as software_nodes should all follow a common
+ * naming scheme; use these macros to ensure commonality.
+ */
+#define SWNODE_GRAPH_PORT_NAME_FMT		"port@%u"
+#define SWNODE_GRAPH_ENDPOINT_NAME_FMT		"endpoint@%u"
+
 #define NR_FWNODE_REFERENCE_ARGS	8
 
 /**
-- 
cgit v1.2.3


From 0fc99422bc034de018607ef6b70f92d4bc4a236d Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@xilinx.com>
Date: Mon, 18 Jan 2021 09:48:56 +0100
Subject: firmware: xilinx: Remove PM_API_MAX value

There is no reason to keep PM_API_MAX around. The commit acfdd18591ea
("firmware: xilinx: Use hash-table for api feature check") removed its
usage that's why it is not used anywhere now.

Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Link: https://lore.kernel.org/r/86e94593a29a1b5b1958c539a1bfabdd08c0948e.1610959734.git.michal.simek@xilinx.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 2a0da841c942..0a483249ff30 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -85,7 +85,6 @@ enum pm_api_id {
 	PM_CLOCK_GETPARENT,
 	PM_SECURE_AES = 47,
 	PM_FEATURE_CHECK = 63,
-	PM_API_MAX,
 };
 
 /* PMU-FW return status codes */
-- 
cgit v1.2.3


From 2c2b9fd6b496b3616e9b9537ea0258b3040914f3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 9 Jan 2021 12:13:32 +0100
Subject: block: unexport truncate_bdev_range

truncate_bdev_range is only used in always built-in block layer code,
so remove the export and the !CONFIG_BLOCK stub.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c         | 1 -
 include/linux/blkdev.h | 9 ++-------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 289c3dd923a4..c1fe29dac485 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -126,7 +126,6 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
 		bd_abort_claiming(bdev, truncate_bdev_range);
 	return 0;
 }
-EXPORT_SYMBOL(truncate_bdev_range);
 
 static void set_init_blocksize(struct block_device *bdev)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 20f3706b6b2e..2491e17b61c4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1999,21 +1999,16 @@ void bdev_add(struct block_device *bdev, dev_t dev);
 struct block_device *I_BDEV(struct inode *inode);
 struct block_device *bdgrab(struct block_device *bdev);
 void bdput(struct block_device *);
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
+		loff_t lend);
 
 #ifdef CONFIG_BLOCK
 void invalidate_bdev(struct block_device *bdev);
-int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
-			loff_t lend);
 int sync_blockdev(struct block_device *bdev);
 #else
 static inline void invalidate_bdev(struct block_device *bdev)
 {
 }
-static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
-				      loff_t lstart, loff_t lend)
-{
-	return 0;
-}
 static inline int sync_blockdev(struct block_device *bdev)
 {
 	return 0;
-- 
cgit v1.2.3


From 6b2e04bc240fe9be9e690059f710e9f95346d34d Mon Sep 17 00:00:00 2001
From: Praveen Chaudhary <praveen5582@gmail.com>
Date: Mon, 25 Jan 2021 13:44:30 -0800
Subject: net: allow user to set metric on default route learned via Router
 Advertisement

For IPv4, default route is learned via DHCPv4 and user is allowed to change
metric using config etc/network/interfaces. But for IPv6, default route can
be learned via RA, for which, currently a fixed metric value 1024 is used.

Ideally, user should be able to configure metric on default route for IPv6
similar to IPv4. This patch adds sysctl for the same.

Logs:

For IPv4:

Config in etc/network/interfaces:
auto eth0
iface eth0 inet dhcp
    metric 4261413864

IPv4 Kernel Route Table:
$ ip route list
default via 172.21.47.1 dev eth0 metric 4261413864

FRR Table, if a static route is configured:
[In real scenario, it is useful to prefer BGP learned default route over DHCPv4 default route.]
Codes: K - kernel route, C - connected, S - static, R - RIP,
       O - OSPF, I - IS-IS, B - BGP, P - PIM, E - EIGRP, N - NHRP,
       T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP,
       > - selected route, * - FIB route

S>* 0.0.0.0/0 [20/0] is directly connected, eth0, 00:00:03
K   0.0.0.0/0 [254/1000] via 172.21.47.1, eth0, 6d08h51m

i.e. User can prefer Default Router learned via Routing Protocol in IPv4.
Similar behavior is not possible for IPv6, without this fix.

After fix [for IPv6]:
sudo sysctl -w net.ipv6.conf.eth0.net.ipv6.conf.eth0.ra_defrtr_metric=1996489705

IP monitor: [When IPv6 RA is received]
default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705  pref high

Kernel IPv6 routing table
$ ip -6 route list
default via fe80::be16:65ff:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 21sec hoplimit 64 pref high

FRR Table, if a static route is configured:
[In real scenario, it is useful to prefer BGP learned default route over IPv6 RA default route.]
Codes: K - kernel route, C - connected, S - static, R - RIPng,
       O - OSPFv3, I - IS-IS, B - BGP, N - NHRP, T - Table,
       v - VNC, V - VNC-Direct, A - Babel, D - SHARP,
       > - selected route, * - FIB route

S>* ::/0 [20/0] is directly connected, eth0, 00:00:06
K   ::/0 [119/1001] via fe80::xx16:xxxx:feb3:ce8e, eth0, 6d07h43m

If the metric is changed later, the effect will be seen only when next IPv6
RA is received, because the default route must be fully controlled by RA msg.
Below metric is changed from 1996489705 to 1996489704.

$ sudo sysctl -w net.ipv6.conf.eth0.ra_defrtr_metric=1996489704
net.ipv6.conf.eth0.ra_defrtr_metric = 1996489704

IP monitor:
[On next IPv6 RA msg, Kernel deletes prev route and installs new route with updated metric]

Deleted default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 3sec hoplimit 64 pref high
default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489704 pref high

Signed-off-by: Praveen Chaudhary <pchaudhary@linkedin.com>
Signed-off-by: Zhenggen Xu <zxu@linkedin.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20210125214430.24079-1-pchaudhary@linkedin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst | 10 ++++++++++
 include/linux/ipv6.h                   |  1 +
 include/net/ip6_route.h                |  3 ++-
 include/uapi/linux/ipv6.h              |  1 +
 include/uapi/linux/sysctl.h            |  1 +
 net/ipv6/addrconf.c                    | 11 +++++++++++
 net/ipv6/ndisc.c                       | 12 ++++++++----
 net/ipv6/route.c                       |  5 +++--
 8 files changed, 37 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index dd2b12a32b73..0e51ddd9a2f1 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1871,6 +1871,16 @@ accept_ra_defrtr - BOOLEAN
 		- enabled if accept_ra is enabled.
 		- disabled if accept_ra is disabled.
 
+ra_defrtr_metric - UNSIGNED INTEGER
+	Route metric for default route learned in Router Advertisement. This value
+	will be assigned as metric for the default route learned via IPv6 Router
+	Advertisement. Takes affect only if accept_ra_defrtr is enabled.
+
+	Possible values:
+		1 to 0xFFFFFFFF
+
+		Default: IP6_RT_PRIO_USER i.e. 1024.
+
 accept_ra_from_local - BOOLEAN
 	Accept RA with source-address that is found on local machine
 	if the RA is otherwise proper and able to be accepted.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index dda61d150a13..9d1f29f0c512 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -31,6 +31,7 @@ struct ipv6_devconf {
 	__s32		max_desync_factor;
 	__s32		max_addresses;
 	__s32		accept_ra_defrtr;
+	__u32		ra_defrtr_metric;
 	__s32		accept_ra_min_hop_limit;
 	__s32		accept_ra_pinfo;
 	__s32		ignore_routes_with_linkdown;
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 2a5277758379..f51a118bfce8 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -174,7 +174,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
 				     struct net_device *dev);
 struct fib6_info *rt6_add_dflt_router(struct net *net,
 				     const struct in6_addr *gwaddr,
-				     struct net_device *dev, unsigned int pref);
+				     struct net_device *dev, unsigned int pref,
+				     u32 defrtr_usr_metric);
 
 void rt6_purge_dflt_routers(struct net *net);
 
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 13e8751bf24a..70603775fe91 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -189,6 +189,7 @@ enum {
 	DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN,
 	DEVCONF_NDISC_TCLASS,
 	DEVCONF_RPL_SEG_ENABLED,
+	DEVCONF_RA_DEFRTR_METRIC,
 	DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index 458179df9b27..1e05d3caa712 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -571,6 +571,7 @@ enum {
 	NET_IPV6_ACCEPT_SOURCE_ROUTE=25,
 	NET_IPV6_ACCEPT_RA_FROM_LOCAL=26,
 	NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27,
+	NET_IPV6_RA_DEFRTR_METRIC=28,
 	__NET_IPV6_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 9edc5bb2d531..f2337fb756ac 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -205,6 +205,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.max_desync_factor	= MAX_DESYNC_FACTOR,
 	.max_addresses		= IPV6_MAX_ADDRESSES,
 	.accept_ra_defrtr	= 1,
+	.ra_defrtr_metric	= IP6_RT_PRIO_USER,
 	.accept_ra_from_local	= 0,
 	.accept_ra_min_hop_limit= 1,
 	.accept_ra_pinfo	= 1,
@@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.max_desync_factor	= MAX_DESYNC_FACTOR,
 	.max_addresses		= IPV6_MAX_ADDRESSES,
 	.accept_ra_defrtr	= 1,
+	.ra_defrtr_metric	= IP6_RT_PRIO_USER,
 	.accept_ra_from_local	= 0,
 	.accept_ra_min_hop_limit= 1,
 	.accept_ra_pinfo	= 1,
@@ -5476,6 +5478,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
 	array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
 	array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
+	array[DEVCONF_RA_DEFRTR_METRIC] = cnf->ra_defrtr_metric;
 	array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit;
 	array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
 #ifdef CONFIG_IPV6_ROUTER_PREF
@@ -6668,6 +6671,14 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "ra_defrtr_metric",
+		.data		= &ipv6_devconf.ra_defrtr_metric,
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= (void *)SYSCTL_ONE,
+	},
 	{
 		.procname	= "accept_ra_min_hop_limit",
 		.data		= &ipv6_devconf.accept_ra_min_hop_limit,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 76717478f173..c467c6419893 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1173,6 +1173,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	struct neighbour *neigh = NULL;
 	struct inet6_dev *in6_dev;
 	struct fib6_info *rt = NULL;
+	u32 defrtr_usr_metric;
 	struct net *net;
 	int lifetime;
 	struct ndisc_options ndopts;
@@ -1303,18 +1304,21 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 			return;
 		}
 	}
-	if (rt && lifetime == 0) {
+	/* Set default route metric as specified by user */
+	defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric;
+	/* delete the route if lifetime is 0 or if metric needs change */
+	if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) {
 		ip6_del_rt(net, rt, false);
 		rt = NULL;
 	}
 
-	ND_PRINTK(3, info, "RA: rt: %p  lifetime: %d, for dev: %s\n",
-		  rt, lifetime, skb->dev->name);
+	ND_PRINTK(3, info, "RA: rt: %p  lifetime: %d, metric: %d, for dev: %s\n",
+		  rt, lifetime, defrtr_usr_metric, skb->dev->name);
 	if (!rt && lifetime) {
 		ND_PRINTK(3, info, "RA: adding default router\n");
 
 		rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
-					 skb->dev, pref);
+					 skb->dev, pref, defrtr_usr_metric);
 		if (!rt) {
 			ND_PRINTK(0, err,
 				  "RA: %s failed to add default route\n",
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 188e114b29b4..41d8f801b75f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4252,11 +4252,12 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
 struct fib6_info *rt6_add_dflt_router(struct net *net,
 				     const struct in6_addr *gwaddr,
 				     struct net_device *dev,
-				     unsigned int pref)
+				     unsigned int pref,
+				     u32 defrtr_usr_metric)
 {
 	struct fib6_config cfg = {
 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
-		.fc_metric	= IP6_RT_PRIO_USER,
+		.fc_metric	= defrtr_usr_metric,
 		.fc_ifindex	= dev->ifindex,
 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
-- 
cgit v1.2.3


From 6fe27d68b45666381691b6a841a2adbda8c8d153 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 20 Jan 2021 02:03:55 +0900
Subject: can: dev: export can_get_state_str() function

The can_get_state_str() function is also relevant to the drivers. Export the
symbol and make it visible in the can/dev.h header.

Link: https://lore.kernel.org/r/20210119170355.12040-1-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/dev.c | 3 ++-
 include/linux/can/dev.h   | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index 01e4a194f187..d9281ae853f8 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -74,7 +74,7 @@ static int can_rx_state_to_frame(struct net_device *dev, enum can_state state)
 	}
 }
 
-static const char *can_get_state_str(const enum can_state state)
+const char *can_get_state_str(const enum can_state state)
 {
 	switch (state) {
 	case CAN_STATE_ERROR_ACTIVE:
@@ -95,6 +95,7 @@ static const char *can_get_state_str(const enum can_state state)
 
 	return "<unknown>";
 }
+EXPORT_SYMBOL_GPL(can_get_state_str);
 
 void can_change_state(struct net_device *dev, struct can_frame *cf,
 		      enum can_state tx_state, enum can_state rx_state)
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 7faf6a37d5b2..ac4d83a1ab81 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -123,6 +123,7 @@ void unregister_candev(struct net_device *dev);
 int can_restart_now(struct net_device *dev);
 void can_bus_off(struct net_device *dev);
 
+const char *can_get_state_str(const enum can_state state);
 void can_change_state(struct net_device *dev, struct can_frame *cf,
 		      enum can_state tx_state, enum can_state rx_state);
 
-- 
cgit v1.2.3


From 9cc0aaeb96e7f894d4735f069174948c1516fea7 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 6 Jan 2021 21:35:06 +0800
Subject: iova: Make has_iova_flush_queue() private

Function has_iova_flush_queue() has no users outside iova.c, so make it
private.

Signed-off-by: John Garry <john.garry@huawei.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/1609940111-28563-2-git-send-email-john.garry@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iova.c | 2 +-
 include/linux/iova.h | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d20b8b333d30..5f2caa95c1f0 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -55,7 +55,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
-bool has_iova_flush_queue(struct iova_domain *iovad)
+static bool has_iova_flush_queue(struct iova_domain *iovad)
 {
 	return !!iovad->fq;
 }
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 76e16ae20729..2b76e0be1c5b 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -153,7 +153,6 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
 void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
 void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	unsigned long start_pfn);
-bool has_iova_flush_queue(struct iova_domain *iovad);
 int init_iova_flush_queue(struct iova_domain *iovad,
 			  iova_flush_cb flush_cb, iova_entry_dtor entry_dtor);
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
@@ -223,11 +222,6 @@ static inline void init_iova_domain(struct iova_domain *iovad,
 {
 }
 
-static inline bool has_iova_flush_queue(struct iova_domain *iovad)
-{
-	return false;
-}
-
 static inline int init_iova_flush_queue(struct iova_domain *iovad,
 					iova_flush_cb flush_cb,
 					iova_entry_dtor entry_dtor)
-- 
cgit v1.2.3


From 87baa23e0236bc1c41ce744f524bf12dbe7fde66 Mon Sep 17 00:00:00 2001
From: Hemant Kumar <hemantk@codeaurora.org>
Date: Mon, 11 Jan 2021 19:07:40 +0100
Subject: bus: mhi: core: Add helper API to return number of free TREs

Introduce mhi_get_free_desc_count() API to return number
of TREs available to queue buffer. MHI clients can use this
API to know before hand if ring is full without calling queue
API.

Signed-off-by: Hemant Kumar <hemantk@codeaurora.org>
Reviewed-by: Jeffrey Hugo <jhugo@codeaurora.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/1610388462-16322-1-git-send-email-loic.poulain@linaro.org
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/core/main.c | 12 ++++++++++++
 include/linux/mhi.h         |  9 +++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c
index d34d7e90e38d..1202433ecf98 100644
--- a/drivers/bus/mhi/core/main.c
+++ b/drivers/bus/mhi/core/main.c
@@ -260,6 +260,18 @@ int mhi_destroy_device(struct device *dev, void *data)
 	return 0;
 }
 
+int mhi_get_free_desc_count(struct mhi_device *mhi_dev,
+				enum dma_data_direction dir)
+{
+	struct mhi_controller *mhi_cntrl = mhi_dev->mhi_cntrl;
+	struct mhi_chan *mhi_chan = (dir == DMA_TO_DEVICE) ?
+		mhi_dev->ul_chan : mhi_dev->dl_chan;
+	struct mhi_ring *tre_ring = &mhi_chan->tre_ring;
+
+	return get_nr_avail_ring_elements(mhi_cntrl, tre_ring);
+}
+EXPORT_SYMBOL_GPL(mhi_get_free_desc_count);
+
 void mhi_notify(struct mhi_device *mhi_dev, enum mhi_callback cb_reason)
 {
 	struct mhi_driver *mhi_drv;
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 562862ff819c..ece53a252217 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -598,6 +598,15 @@ void mhi_set_mhi_state(struct mhi_controller *mhi_cntrl,
  */
 void mhi_notify(struct mhi_device *mhi_dev, enum mhi_callback cb_reason);
 
+/**
+ * mhi_get_free_desc_count - Get transfer ring length
+ * Get # of TD available to queue buffers
+ * @mhi_dev: Device associated with the channels
+ * @dir: Direction of the channel
+ */
+int mhi_get_free_desc_count(struct mhi_device *mhi_dev,
+				enum dma_data_direction dir);
+
 /**
  * mhi_prepare_for_power_up - Do pre-initialization before power up.
  *                            This is optional, call this before power up if
-- 
cgit v1.2.3


From 622106190175dbac2b0b0ee7d4275c474e5fe051 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 6 Jan 2021 21:35:07 +0800
Subject: iova: Delete copy_reserved_iova()

Since commit c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the
iommu ops"), function copy_reserved_iova() is not referenced, so delete
it.

Signed-off-by: John Garry <john.garry@huawei.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/1609940111-28563-3-git-send-email-john.garry@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iova.c | 30 ------------------------------
 include/linux/iova.h |  6 ------
 2 files changed, 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 5f2caa95c1f0..9b114cd886d5 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -710,36 +710,6 @@ finish:
 }
 EXPORT_SYMBOL_GPL(reserve_iova);
 
-/**
- * copy_reserved_iova - copies the reserved between domains
- * @from: - source domain from where to copy
- * @to: - destination domin where to copy
- * This function copies reserved iova's from one domain to
- * other.
- */
-void
-copy_reserved_iova(struct iova_domain *from, struct iova_domain *to)
-{
-	unsigned long flags;
-	struct rb_node *node;
-
-	spin_lock_irqsave(&from->iova_rbtree_lock, flags);
-	for (node = rb_first(&from->rbroot); node; node = rb_next(node)) {
-		struct iova *iova = rb_entry(node, struct iova, node);
-		struct iova *new_iova;
-
-		if (iova->pfn_lo == IOVA_ANCHOR)
-			continue;
-
-		new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi);
-		if (!new_iova)
-			pr_err("Reserve iova range %lx@%lx failed\n",
-			       iova->pfn_lo, iova->pfn_lo);
-	}
-	spin_unlock_irqrestore(&from->iova_rbtree_lock, flags);
-}
-EXPORT_SYMBOL_GPL(copy_reserved_iova);
-
 /*
  * Magazine caches for IOVA ranges.  For an introduction to magazines,
  * see the USENIX 2001 paper "Magazines and Vmem: Extending the Slab
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 2b76e0be1c5b..c834c01c0a5b 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -150,7 +150,6 @@ unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
 			      unsigned long limit_pfn, bool flush_rcache);
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
 	unsigned long pfn_hi);
-void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
 void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	unsigned long start_pfn);
 int init_iova_flush_queue(struct iova_domain *iovad,
@@ -211,11 +210,6 @@ static inline struct iova *reserve_iova(struct iova_domain *iovad,
 	return NULL;
 }
 
-static inline void copy_reserved_iova(struct iova_domain *from,
-				      struct iova_domain *to)
-{
-}
-
 static inline void init_iova_domain(struct iova_domain *iovad,
 				    unsigned long granule,
 				    unsigned long start_pfn)
-- 
cgit v1.2.3


From ab0a7119ba67be9e377b195d2b9baa9fb8b3b53e Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 6 Jan 2021 21:35:10 +0800
Subject: iommu: Delete iommu_domain_window_disable()

Function iommu_domain_window_disable() is not referenced in the tree, so
delete it.

Signed-off-by: John Garry <john.garry@huawei.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/1609940111-28563-6-git-send-email-john.garry@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 9 ---------
 include/linux/iommu.h | 6 ------
 2 files changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 4f4edb087ce6..20201ef97b8f 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2598,15 +2598,6 @@ int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr,
 }
 EXPORT_SYMBOL_GPL(iommu_domain_window_enable);
 
-void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr)
-{
-	if (unlikely(domain->ops->domain_window_disable == NULL))
-		return;
-
-	return domain->ops->domain_window_disable(domain, wnd_nr);
-}
-EXPORT_SYMBOL_GPL(iommu_domain_window_disable);
-
 /**
  * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework
  * @domain: the iommu domain where the fault has happened
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b3f0e2018c62..72059fcfa108 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -514,7 +514,6 @@ extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
 extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr,
 				      phys_addr_t offset, u64 size,
 				      int prot);
-extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr);
 
 extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
 			      unsigned long iova, int flags);
@@ -746,11 +745,6 @@ static inline int iommu_domain_window_enable(struct iommu_domain *domain,
 	return -ENODEV;
 }
 
-static inline void iommu_domain_window_disable(struct iommu_domain *domain,
-					       u32 wnd_nr)
-{
-}
-
 static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
 {
 	return 0;
-- 
cgit v1.2.3


From 262948f8ba573dc9c61650df8f23eaea7d43bc61 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 6 Jan 2021 21:35:11 +0800
Subject: iommu: Delete iommu_dev_has_feature()

Function iommu_dev_has_feature() has never been referenced in the tree,
and there does not appear to be anything coming soon to use it, so delete
it.

Signed-off-by: John Garry <john.garry@huawei.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/1609940111-28563-7-git-send-email-john.garry@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 11 -----------
 include/linux/iommu.h |  7 -------
 2 files changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 20201ef97b8f..91f7871f5a37 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2853,17 +2853,6 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
 /*
  * Per device IOMMU features.
  */
-bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat)
-{
-	const struct iommu_ops *ops = dev->bus->iommu_ops;
-
-	if (ops && ops->dev_has_feat)
-		return ops->dev_has_feat(dev, feat);
-
-	return false;
-}
-EXPORT_SYMBOL_GPL(iommu_dev_has_feature);
-
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
 {
 	const struct iommu_ops *ops = dev->bus->iommu_ops;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 72059fcfa108..91d94c014f47 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -626,7 +626,6 @@ static inline void dev_iommu_priv_set(struct device *dev, void *priv)
 int iommu_probe_device(struct device *dev);
 void iommu_release_device(struct device *dev);
 
-bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features f);
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f);
 int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f);
 bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features f);
@@ -975,12 +974,6 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
 	return NULL;
 }
 
-static inline bool
-iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat)
-{
-	return false;
-}
-
 static inline bool
 iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat)
 {
-- 
cgit v1.2.3


From 2ebbd25873cef06f739489fd8ff9f707a3dfa2fa Mon Sep 17 00:00:00 2001
From: Yong Wu <yong.wu@mediatek.com>
Date: Thu, 7 Jan 2021 20:29:04 +0800
Subject: iommu: Add iova and size as parameters in iotlb_sync_map

iotlb_sync_map allow IOMMU drivers tlb sync after completing the whole
mapping. This patch adds iova and size as the parameters in it. then the
IOMMU driver could flush tlb with the whole range once after iova mapping
to improve performance.

Signed-off-by: Yong Wu <yong.wu@mediatek.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210107122909.16317-3-yong.wu@mediatek.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/iommu.c      | 4 ++--
 drivers/iommu/tegra-gart.c | 7 +++++--
 include/linux/iommu.h      | 3 ++-
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index c304a6a30d42..3d099a31ddca 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2443,7 +2443,7 @@ static int _iommu_map(struct iommu_domain *domain, unsigned long iova,
 
 	ret = __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL);
 	if (ret == 0 && ops->iotlb_sync_map)
-		ops->iotlb_sync_map(domain);
+		ops->iotlb_sync_map(domain, iova, size);
 
 	return ret;
 }
@@ -2575,7 +2575,7 @@ static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
 	}
 
 	if (ops->iotlb_sync_map)
-		ops->iotlb_sync_map(domain);
+		ops->iotlb_sync_map(domain, iova, mapped);
 	return mapped;
 
 out_err:
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index fac720273889..05e8e19b8269 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -261,7 +261,8 @@ static int gart_iommu_of_xlate(struct device *dev,
 	return 0;
 }
 
-static void gart_iommu_sync_map(struct iommu_domain *domain)
+static void gart_iommu_sync_map(struct iommu_domain *domain, unsigned long iova,
+				size_t size)
 {
 	FLUSH_GART_REGS(gart_handle);
 }
@@ -269,7 +270,9 @@ static void gart_iommu_sync_map(struct iommu_domain *domain)
 static void gart_iommu_sync(struct iommu_domain *domain,
 			    struct iommu_iotlb_gather *gather)
 {
-	gart_iommu_sync_map(domain);
+	size_t length = gather->end - gather->start;
+
+	gart_iommu_sync_map(domain, gather->start, length);
 }
 
 static const struct iommu_ops gart_iommu_ops = {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b3f0e2018c62..9ce0aa9e236b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -246,7 +246,8 @@ struct iommu_ops {
 	size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
 		     size_t size, struct iommu_iotlb_gather *iotlb_gather);
 	void (*flush_iotlb_all)(struct iommu_domain *domain);
-	void (*iotlb_sync_map)(struct iommu_domain *domain);
+	void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova,
+			       size_t size);
 	void (*iotlb_sync)(struct iommu_domain *domain,
 			   struct iommu_iotlb_gather *iotlb_gather);
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova);
-- 
cgit v1.2.3


From 862c3715de8f3e5350489240c951d697f04bd8c9 Mon Sep 17 00:00:00 2001
From: Yong Wu <yong.wu@mediatek.com>
Date: Thu, 7 Jan 2021 20:29:06 +0800
Subject: iommu: Switch gather->end to the inclusive end

Currently gather->end is "unsigned long" which may be overflow in
arch32 in the corner case: 0xfff00000 + 0x100000(iova + size).
Although it doesn't affect the size(end - start), it affects the checking
"gather->end < end"

This patch changes this "end" to the real end address
(end = start + size - 1). Correspondingly, update the length to
"end - start + 1".

Fixes: a7d20dc19d9e ("iommu: Introduce struct iommu_iotlb_gather for batching TLB flushes")
Signed-off-by: Yong Wu <yong.wu@mediatek.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210107122909.16317-5-yong.wu@mediatek.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 2 +-
 drivers/iommu/mtk_iommu.c                   | 2 +-
 drivers/iommu/tegra-gart.c                  | 2 +-
 include/linux/iommu.h                       | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 8ca7415d785d..c70d6e79f534 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2280,7 +2280,7 @@ static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
 {
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 
-	arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start,
+	arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start + 1,
 			       gather->pgsize, true, smmu_domain);
 }
 
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index f579fc21971e..66a00a2cb445 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -443,7 +443,7 @@ static void mtk_iommu_iotlb_sync(struct iommu_domain *domain,
 				 struct iommu_iotlb_gather *gather)
 {
 	struct mtk_iommu_data *data = mtk_iommu_get_m4u_data();
-	size_t length = gather->end - gather->start;
+	size_t length = gather->end - gather->start + 1;
 
 	if (gather->start == ULONG_MAX)
 		return;
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index 05e8e19b8269..6f130e51f072 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -270,7 +270,7 @@ static void gart_iommu_sync_map(struct iommu_domain *domain, unsigned long iova,
 static void gart_iommu_sync(struct iommu_domain *domain,
 			    struct iommu_iotlb_gather *gather)
 {
-	size_t length = gather->end - gather->start;
+	size_t length = gather->end - gather->start + 1;
 
 	gart_iommu_sync_map(domain, gather->start, length);
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9ce0aa9e236b..ae8eddd4621b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -170,7 +170,7 @@ enum iommu_dev_features {
  * struct iommu_iotlb_gather - Range information for a pending IOTLB flush
  *
  * @start: IOVA representing the start of the range to be flushed
- * @end: IOVA representing the end of the range to be flushed (exclusive)
+ * @end: IOVA representing the end of the range to be flushed (inclusive)
  * @pgsize: The interval at which to perform the flush
  *
  * This structure is intended to be updated by multiple calls to the
@@ -539,7 +539,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain,
 					       struct iommu_iotlb_gather *gather,
 					       unsigned long iova, size_t size)
 {
-	unsigned long start = iova, end = start + size;
+	unsigned long start = iova, end = start + size - 1;
 
 	/*
 	 * If the new page is disjoint from the current range or is mapped at
-- 
cgit v1.2.3


From 77e0992aee4e980e8c553e512a5dfa3e704cf030 Mon Sep 17 00:00:00 2001
From: Yong Wu <yong.wu@mediatek.com>
Date: Thu, 7 Jan 2021 20:29:07 +0800
Subject: iommu/io-pgtable: Allow io_pgtable_tlb ops optional

This patch allows io_pgtable_tlb ops could be null since the IOMMU drivers
may use the tlb ops from iommu framework.

Signed-off-by: Yong Wu <yong.wu@mediatek.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210107122909.16317-6-yong.wu@mediatek.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/io-pgtable.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index ea727eb1a1a9..2a5686ca2ba3 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -214,14 +214,16 @@ struct io_pgtable_domain_attr {
 
 static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
 {
-	iop->cfg.tlb->tlb_flush_all(iop->cookie);
+	if (iop->cfg.tlb && iop->cfg.tlb->tlb_flush_all)
+		iop->cfg.tlb->tlb_flush_all(iop->cookie);
 }
 
 static inline void
 io_pgtable_tlb_flush_walk(struct io_pgtable *iop, unsigned long iova,
 			  size_t size, size_t granule)
 {
-	iop->cfg.tlb->tlb_flush_walk(iova, size, granule, iop->cookie);
+	if (iop->cfg.tlb && iop->cfg.tlb->tlb_flush_walk)
+		iop->cfg.tlb->tlb_flush_walk(iova, size, granule, iop->cookie);
 }
 
 static inline void
@@ -229,7 +231,7 @@ io_pgtable_tlb_add_page(struct io_pgtable *iop,
 			struct iommu_iotlb_gather * gather, unsigned long iova,
 			size_t granule)
 {
-	if (iop->cfg.tlb->tlb_add_page)
+	if (iop->cfg.tlb && iop->cfg.tlb->tlb_add_page)
 		iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie);
 }
 
-- 
cgit v1.2.3


From 8ba59e9dee31246fc34b4d4bec032093e9c06510 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 22 Jan 2021 13:43:58 +0200
Subject: misc: pti: Remove driver for deprecated platform

Intel Moorestown and Medfield are quite old Intel Atom based
32-bit platforms, which were in limited use in some Android phones,
tablets and consumer electronics more than eight years ago.

There are no bugs or problems ever reported outside from Intel
for breaking any of that platforms for years. It seems no real
users exists who run more or less fresh kernel on it. The commit
05f4434bc130 ("ASoC: Intel: remove mfld_machine") also in align
with this theory.

Due to above and to reduce a burden of supporting outdated drivers
we remove the support of outdated platforms completely.

Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210122114358.39299-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/pti_intel_mid.rst | 108 ----
 drivers/misc/Kconfig                       |  13 -
 drivers/misc/Makefile                      |   1 -
 drivers/misc/pti.c                         | 978 -----------------------------
 drivers/tty/Makefile                       |   2 -
 drivers/tty/n_tracerouter.c                | 233 -------
 drivers/tty/n_tracesink.c                  | 228 -------
 drivers/tty/n_tracesink.h                  |  26 -
 include/linux/intel-pti.h                  |  35 --
 9 files changed, 1624 deletions(-)
 delete mode 100644 Documentation/driver-api/pti_intel_mid.rst
 delete mode 100644 drivers/misc/pti.c
 delete mode 100644 drivers/tty/n_tracerouter.c
 delete mode 100644 drivers/tty/n_tracesink.c
 delete mode 100644 drivers/tty/n_tracesink.h
 delete mode 100644 include/linux/intel-pti.h

(limited to 'include/linux')

diff --git a/Documentation/driver-api/pti_intel_mid.rst b/Documentation/driver-api/pti_intel_mid.rst
deleted file mode 100644
index bacc2a4ee89f..000000000000
--- a/Documentation/driver-api/pti_intel_mid.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=============
-Intel MID PTI
-=============
-
-The Intel MID PTI project is HW implemented in Intel Atom
-system-on-a-chip designs based on the Parallel Trace
-Interface for MIPI P1149.7 cJTAG standard.  The kernel solution
-for this platform involves the following files::
-
-	./include/linux/pti.h
-	./drivers/.../n_tracesink.h
-	./drivers/.../n_tracerouter.c
-	./drivers/.../n_tracesink.c
-	./drivers/.../pti.c
-
-pti.c is the driver that enables various debugging features
-popular on platforms from certain mobile manufacturers.
-n_tracerouter.c and n_tracesink.c allow extra system information to
-be collected and routed to the pti driver, such as trace
-debugging data from a modem.  Although n_tracerouter
-and n_tracesink are a part of the complete PTI solution,
-these two line disciplines can work separately from
-pti.c and route any data stream from one /dev/tty node
-to another /dev/tty node via kernel-space.  This provides
-a stable, reliable connection that will not break unless
-the user-space application shuts down (plus avoids
-kernel->user->kernel context switch overheads of routing
-data).
-
-An example debugging usage for this driver system:
-
-  * Hook /dev/ttyPTI0 to syslogd.  Opening this port will also start
-    a console device to further capture debugging messages to PTI.
-  * Hook /dev/ttyPTI1 to modem debugging data to write to PTI HW.
-    This is where n_tracerouter and n_tracesink are used.
-  * Hook /dev/pti to a user-level debugging application for writing
-    to PTI HW.
-  * `Use mipi_` Kernel Driver API in other device drivers for
-    debugging to PTI by first requesting a PTI write address via
-    mipi_request_masterchannel(1).
-
-Below is example pseudo-code on how a 'privileged' application
-can hook up n_tracerouter and n_tracesink to any tty on
-a system.  'Privileged' means the application has enough
-privileges to successfully manipulate the ldisc drivers
-but is not just blindly executing as 'root'. Keep in mind
-the use of ioctl(,TIOCSETD,) is not specific to the n_tracerouter
-and n_tracesink line discpline drivers but is a generic
-operation for a program to use a line discpline driver
-on a tty port other than the default n_tty:
-
-.. code-block:: c
-
-  /////////// To hook up n_tracerouter and n_tracesink /////////
-
-  // Note that n_tracerouter depends on n_tracesink.
-  #include <errno.h>
-  #define ONE_TTY "/dev/ttyOne"
-  #define TWO_TTY "/dev/ttyTwo"
-
-  // needed global to hand onto ldisc connection
-  static int g_fd_source = -1;
-  static int g_fd_sink  = -1;
-
-  // these two vars used to grab LDISC values from loaded ldisc drivers
-  // in OS.  Look at /proc/tty/ldiscs to get the right numbers from
-  // the ldiscs loaded in the system.
-  int source_ldisc_num, sink_ldisc_num = -1;
-  int retval;
-
-  g_fd_source = open(ONE_TTY, O_RDWR); // must be R/W
-  g_fd_sink   = open(TWO_TTY, O_RDWR); // must be R/W
-
-  if (g_fd_source <= 0) || (g_fd_sink <= 0) {
-     // doubt you'll want to use these exact error lines of code
-     printf("Error on open(). errno: %d\n",errno);
-     return errno;
-  }
-
-  retval = ioctl(g_fd_sink, TIOCSETD, &sink_ldisc_num);
-  if (retval < 0) {
-     printf("Error on ioctl().  errno: %d\n", errno);
-     return errno;
-  }
-
-  retval = ioctl(g_fd_source, TIOCSETD, &source_ldisc_num);
-  if (retval < 0) {
-     printf("Error on ioctl().  errno: %d\n", errno);
-     return errno;
-  }
-
-  /////////// To disconnect n_tracerouter and n_tracesink ////////
-
-  // First make sure data through the ldiscs has stopped.
-
-  // Second, disconnect ldiscs.  This provides a
-  // little cleaner shutdown on tty stack.
-  sink_ldisc_num = 0;
-  source_ldisc_num = 0;
-  ioctl(g_fd_uart, TIOCSETD, &sink_ldisc_num);
-  ioctl(g_fd_gadget, TIOCSETD, &source_ldisc_num);
-
-  // Three, program closes connection, and cleanup:
-  close(g_fd_uart);
-  close(g_fd_gadget);
-  g_fd_uart = g_fd_gadget = NULL;
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 8085213913cc..f532c59bb59b 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -104,19 +104,6 @@ config PHANTOM
 	  If you choose to build module, its name will be phantom. If unsure,
 	  say N here.
 
-config INTEL_MID_PTI
-	tristate "Parallel Trace Interface for MIPI P1149.7 cJTAG standard"
-	depends on PCI && TTY && (X86_INTEL_MID || COMPILE_TEST)
-	help
-	  The PTI (Parallel Trace Interface) driver directs
-	  trace data routed from various parts in the system out
-	  through an Intel Penwell PTI port and out of the mobile
-	  device for analysis with a debugging tool (Lauterbach or Fido).
-
-	  You should select this driver if the target kernel is meant for
-	  an Intel Atom (non-netbook) mobile device containing a MIPI
-	  P1149.7 standard implementation.
-
 config TIFM_CORE
 	tristate "TI Flash Media interface support"
 	depends on PCI
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 456aa8a4c896..99b6f15a3c70 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -8,7 +8,6 @@ obj-$(CONFIG_IBMVMC)		+= ibmvmc.o
 obj-$(CONFIG_AD525X_DPOT)	+= ad525x_dpot.o
 obj-$(CONFIG_AD525X_DPOT_I2C)	+= ad525x_dpot-i2c.o
 obj-$(CONFIG_AD525X_DPOT_SPI)	+= ad525x_dpot-spi.o
-obj-$(CONFIG_INTEL_MID_PTI)	+= pti.o
 obj-$(CONFIG_ATMEL_SSC)		+= atmel-ssc.o
 obj-$(CONFIG_DUMMY_IRQ)		+= dummy-irq.o
 obj-$(CONFIG_ICS932S401)	+= ics932s401.o
diff --git a/drivers/misc/pti.c b/drivers/misc/pti.c
deleted file mode 100644
index 7236ae527b19..000000000000
--- a/drivers/misc/pti.c
+++ /dev/null
@@ -1,978 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  pti.c - PTI driver for cJTAG data extration
- *
- *  Copyright (C) Intel 2010
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * The PTI (Parallel Trace Interface) driver directs trace data routed from
- * various parts in the system out through the Intel Penwell PTI port and
- * out of the mobile device for analysis with a debugging tool
- * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7,
- * compact JTAG, standard.
- */
-
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/console.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/tty.h>
-#include <linux/tty_driver.h>
-#include <linux/pci.h>
-#include <linux/mutex.h>
-#include <linux/miscdevice.h>
-#include <linux/intel-pti.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-#define DRIVERNAME		"pti"
-#define PCINAME			"pciPTI"
-#define TTYNAME			"ttyPTI"
-#define CHARNAME		"pti"
-#define PTITTY_MINOR_START	0
-#define PTITTY_MINOR_NUM	2
-#define MAX_APP_IDS		16   /* 128 channel ids / u8 bit size */
-#define MAX_OS_IDS		16   /* 128 channel ids / u8 bit size */
-#define MAX_MODEM_IDS		16   /* 128 channel ids / u8 bit size */
-#define MODEM_BASE_ID		71   /* modem master ID address    */
-#define CONTROL_ID		72   /* control master ID address  */
-#define CONSOLE_ID		73   /* console master ID address  */
-#define OS_BASE_ID		74   /* base OS master ID address  */
-#define APP_BASE_ID		80   /* base App master ID address */
-#define CONTROL_FRAME_LEN	32   /* PTI control frame maximum size */
-#define USER_COPY_SIZE		8192 /* 8Kb buffer for user space copy */
-#define APERTURE_14		0x3800000 /* offset to first OS write addr */
-#define APERTURE_LEN		0x400000  /* address length */
-
-struct pti_tty {
-	struct pti_masterchannel *mc;
-};
-
-struct pti_dev {
-	struct tty_port port[PTITTY_MINOR_NUM];
-	unsigned long pti_addr;
-	unsigned long aperture_base;
-	void __iomem *pti_ioaddr;
-	u8 ia_app[MAX_APP_IDS];
-	u8 ia_os[MAX_OS_IDS];
-	u8 ia_modem[MAX_MODEM_IDS];
-};
-
-/*
- * This protects access to ia_app, ia_os, and ia_modem,
- * which keeps track of channels allocated in
- * an aperture write id.
- */
-static DEFINE_MUTEX(alloclock);
-
-static const struct pci_device_id pci_ids[] = {
-		{PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x82B)},
-		{0}
-};
-
-static struct tty_driver *pti_tty_driver;
-static struct pti_dev *drv_data;
-
-static unsigned int pti_console_channel;
-static unsigned int pti_control_channel;
-
-/**
- *  pti_write_to_aperture()- The private write function to PTI HW.
- *
- *  @mc: The 'aperture'. It's part of a write address that holds
- *       a master and channel ID.
- *  @buf: Data being written to the HW that will ultimately be seen
- *        in a debugging tool (Fido, Lauterbach).
- *  @len: Size of buffer.
- *
- *  Since each aperture is specified by a unique
- *  master/channel ID, no two processes will be writing
- *  to the same aperture at the same time so no lock is required. The
- *  PTI-Output agent will send these out in the order that they arrived, and
- *  thus, it will intermix these messages. The debug tool can then later
- *  regroup the appropriate message segments together reconstituting each
- *  message.
- */
-static void pti_write_to_aperture(struct pti_masterchannel *mc,
-				  u8 *buf,
-				  int len)
-{
-	int dwordcnt;
-	int final;
-	int i;
-	u32 ptiword;
-	u32 __iomem *aperture;
-	u8 *p = buf;
-
-	/*
-	 * calculate the aperture offset from the base using the master and
-	 * channel id's.
-	 */
-	aperture = drv_data->pti_ioaddr + (mc->master << 15)
-		+ (mc->channel << 8);
-
-	dwordcnt = len >> 2;
-	final = len - (dwordcnt << 2);	    /* final = trailing bytes    */
-	if (final == 0 && dwordcnt != 0) {  /* always need a final dword */
-		final += 4;
-		dwordcnt--;
-	}
-
-	for (i = 0; i < dwordcnt; i++) {
-		ptiword = be32_to_cpu(*(u32 *)p);
-		p += 4;
-		iowrite32(ptiword, aperture);
-	}
-
-	aperture += PTI_LASTDWORD_DTS;	/* adding DTS signals that is EOM */
-
-	ptiword = 0;
-	for (i = 0; i < final; i++)
-		ptiword |= *p++ << (24-(8*i));
-
-	iowrite32(ptiword, aperture);
-	return;
-}
-
-/**
- *  pti_control_frame_built_and_sent()- control frame build and send function.
- *
- *  @mc:          The master / channel structure on which the function
- *                built a control frame.
- *  @thread_name: The thread name associated with the master / channel or
- *                'NULL' if using the 'current' global variable.
- *
- *  To be able to post process the PTI contents on host side, a control frame
- *  is added before sending any PTI content. So the host side knows on
- *  each PTI frame the name of the thread using a dedicated master / channel.
- *  The thread name is retrieved from 'current' global variable if 'thread_name'
- *  is 'NULL', else it is retrieved from 'thread_name' parameter.
- *  This function builds this frame and sends it to a master ID CONTROL_ID.
- *  The overhead is only 32 bytes since the driver only writes to HW
- *  in 32 byte chunks.
- */
-static void pti_control_frame_built_and_sent(struct pti_masterchannel *mc,
-					     const char *thread_name)
-{
-	/*
-	 * Since we access the comm member in current's task_struct, we only
-	 * need to be as large as what 'comm' in that structure is.
-	 */
-	char comm[TASK_COMM_LEN];
-	struct pti_masterchannel mccontrol = {.master = CONTROL_ID,
-					      .channel = 0};
-	const char *thread_name_p;
-	const char *control_format = "%3d %3d %s";
-	u8 control_frame[CONTROL_FRAME_LEN];
-
-	if (!thread_name) {
-		if (!in_interrupt())
-			get_task_comm(comm, current);
-		else
-			strncpy(comm, "Interrupt", TASK_COMM_LEN);
-
-		/* Absolutely ensure our buffer is zero terminated. */
-		comm[TASK_COMM_LEN-1] = 0;
-		thread_name_p = comm;
-	} else {
-		thread_name_p = thread_name;
-	}
-
-	mccontrol.channel = pti_control_channel;
-	pti_control_channel = (pti_control_channel + 1) & 0x7f;
-
-	snprintf(control_frame, CONTROL_FRAME_LEN, control_format, mc->master,
-		mc->channel, thread_name_p);
-	pti_write_to_aperture(&mccontrol, control_frame, strlen(control_frame));
-}
-
-/**
- *  pti_write_full_frame_to_aperture()- high level function to
- *					write to PTI.
- *
- *  @mc:  The 'aperture'. It's part of a write address that holds
- *        a master and channel ID.
- *  @buf: Data being written to the HW that will ultimately be seen
- *        in a debugging tool (Fido, Lauterbach).
- *  @len: Size of buffer.
- *
- *  All threads sending data (either console, user space application, ...)
- *  are calling the high level function to write to PTI meaning that it is
- *  possible to add a control frame before sending the content.
- */
-static void pti_write_full_frame_to_aperture(struct pti_masterchannel *mc,
-						const unsigned char *buf,
-						int len)
-{
-	pti_control_frame_built_and_sent(mc, NULL);
-	pti_write_to_aperture(mc, (u8 *)buf, len);
-}
-
-/**
- * get_id()- Allocate a master and channel ID.
- *
- * @id_array:    an array of bits representing what channel
- *               id's are allocated for writing.
- * @max_ids:     The max amount of available write IDs to use.
- * @base_id:     The starting SW channel ID, based on the Intel
- *               PTI arch.
- * @thread_name: The thread name associated with the master / channel or
- *               'NULL' if using the 'current' global variable.
- *
- * Returns:
- *	pti_masterchannel struct with master, channel ID address
- *	0 for error
- *
- * Each bit in the arrays ia_app and ia_os correspond to a master and
- * channel id. The bit is one if the id is taken and 0 if free. For
- * every master there are 128 channel id's.
- */
-static struct pti_masterchannel *get_id(u8 *id_array,
-					int max_ids,
-					int base_id,
-					const char *thread_name)
-{
-	struct pti_masterchannel *mc;
-	int i, j, mask;
-
-	mc = kmalloc(sizeof(struct pti_masterchannel), GFP_KERNEL);
-	if (mc == NULL)
-		return NULL;
-
-	/* look for a byte with a free bit */
-	for (i = 0; i < max_ids; i++)
-		if (id_array[i] != 0xff)
-			break;
-	if (i == max_ids) {
-		kfree(mc);
-		return NULL;
-	}
-	/* find the bit in the 128 possible channel opportunities */
-	mask = 0x80;
-	for (j = 0; j < 8; j++) {
-		if ((id_array[i] & mask) == 0)
-			break;
-		mask >>= 1;
-	}
-
-	/* grab it */
-	id_array[i] |= mask;
-	mc->master  = base_id;
-	mc->channel = ((i & 0xf)<<3) + j;
-	/* write new master Id / channel Id allocation to channel control */
-	pti_control_frame_built_and_sent(mc, thread_name);
-	return mc;
-}
-
-/*
- * The following three functions:
- * pti_request_mastercahannel(), mipi_release_masterchannel()
- * and pti_writedata() are an API for other kernel drivers to
- * access PTI.
- */
-
-/**
- * pti_request_masterchannel()- Kernel API function used to allocate
- *				a master, channel ID address
- *				to write to PTI HW.
- *
- * @type:        0- request Application  master, channel aperture ID
- *                  write address.
- *               1- request OS master, channel aperture ID write
- *                  address.
- *               2- request Modem master, channel aperture ID
- *                  write address.
- *               Other values, error.
- * @thread_name: The thread name associated with the master / channel or
- *               'NULL' if using the 'current' global variable.
- *
- * Returns:
- *	pti_masterchannel struct
- *	0 for error
- */
-struct pti_masterchannel *pti_request_masterchannel(u8 type,
-						    const char *thread_name)
-{
-	struct pti_masterchannel *mc;
-
-	mutex_lock(&alloclock);
-
-	switch (type) {
-
-	case 0:
-		mc = get_id(drv_data->ia_app, MAX_APP_IDS,
-			    APP_BASE_ID, thread_name);
-		break;
-
-	case 1:
-		mc = get_id(drv_data->ia_os, MAX_OS_IDS,
-			    OS_BASE_ID, thread_name);
-		break;
-
-	case 2:
-		mc = get_id(drv_data->ia_modem, MAX_MODEM_IDS,
-			    MODEM_BASE_ID, thread_name);
-		break;
-	default:
-		mc = NULL;
-	}
-
-	mutex_unlock(&alloclock);
-	return mc;
-}
-EXPORT_SYMBOL_GPL(pti_request_masterchannel);
-
-/**
- * pti_release_masterchannel()- Kernel API function used to release
- *				a master, channel ID address
- *				used to write to PTI HW.
- *
- * @mc: master, channel apeture ID address to be released.  This
- *      will de-allocate the structure via kfree().
- */
-void pti_release_masterchannel(struct pti_masterchannel *mc)
-{
-	u8 master, channel, i;
-
-	mutex_lock(&alloclock);
-
-	if (mc) {
-		master = mc->master;
-		channel = mc->channel;
-
-		if (master == APP_BASE_ID) {
-			i = channel >> 3;
-			drv_data->ia_app[i] &=  ~(0x80>>(channel & 0x7));
-		} else if (master == OS_BASE_ID) {
-			i = channel >> 3;
-			drv_data->ia_os[i] &= ~(0x80>>(channel & 0x7));
-		} else {
-			i = channel >> 3;
-			drv_data->ia_modem[i] &= ~(0x80>>(channel & 0x7));
-		}
-
-		kfree(mc);
-	}
-
-	mutex_unlock(&alloclock);
-}
-EXPORT_SYMBOL_GPL(pti_release_masterchannel);
-
-/**
- * pti_writedata()- Kernel API function used to write trace
- *                  debugging data to PTI HW.
- *
- * @mc:    Master, channel aperture ID address to write to.
- *         Null value will return with no write occurring.
- * @buf:   Trace debuging data to write to the PTI HW.
- *         Null value will return with no write occurring.
- * @count: Size of buf. Value of 0 or a negative number will
- *         return with no write occuring.
- */
-void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count)
-{
-	/*
-	 * since this function is exported, this is treated like an
-	 * API function, thus, all parameters should
-	 * be checked for validity.
-	 */
-	if ((mc != NULL) && (buf != NULL) && (count > 0))
-		pti_write_to_aperture(mc, buf, count);
-	return;
-}
-EXPORT_SYMBOL_GPL(pti_writedata);
-
-/*
- * for the tty_driver_*() basic function descriptions, see tty_driver.h.
- * Specific header comments made for PTI-related specifics.
- */
-
-/**
- * pti_tty_driver_open()- Open an Application master, channel aperture
- * ID to the PTI device via tty device.
- *
- * @tty: tty interface.
- * @filp: filp interface pased to tty_port_open() call.
- *
- * Returns:
- *	int, 0 for success
- *	otherwise, fail value
- *
- * The main purpose of using the tty device interface is for
- * each tty port to have a unique PTI write aperture.  In an
- * example use case, ttyPTI0 gets syslogd and an APP aperture
- * ID and ttyPTI1 is where the n_tracesink ldisc hooks to route
- * modem messages into PTI.  Modem trace data does not have to
- * go to ttyPTI1, but ttyPTI0 and ttyPTI1 do need to be distinct
- * master IDs.  These messages go through the PTI HW and out of
- * the handheld platform and to the Fido/Lauterbach device.
- */
-static int pti_tty_driver_open(struct tty_struct *tty, struct file *filp)
-{
-	/*
-	 * we actually want to allocate a new channel per open, per
-	 * system arch.  HW gives more than plenty channels for a single
-	 * system task to have its own channel to write trace data. This
-	 * also removes a locking requirement for the actual write
-	 * procedure.
-	 */
-	return tty_port_open(tty->port, tty, filp);
-}
-
-/**
- * pti_tty_driver_close()- close tty device and release Application
- * master, channel aperture ID to the PTI device via tty device.
- *
- * @tty: tty interface.
- * @filp: filp interface pased to tty_port_close() call.
- *
- * The main purpose of using the tty device interface is to route
- * syslog daemon messages to the PTI HW and out of the handheld platform
- * and to the Fido/Lauterbach device.
- */
-static void pti_tty_driver_close(struct tty_struct *tty, struct file *filp)
-{
-	tty_port_close(tty->port, tty, filp);
-}
-
-/**
- * pti_tty_install()- Used to set up specific master-channels
- *		      to tty ports for organizational purposes when
- *		      tracing viewed from debuging tools.
- *
- * @driver: tty driver information.
- * @tty: tty struct containing pti information.
- *
- * Returns:
- *	0 for success
- *	otherwise, error
- */
-static int pti_tty_install(struct tty_driver *driver, struct tty_struct *tty)
-{
-	int idx = tty->index;
-	struct pti_tty *pti_tty_data;
-	int ret = tty_standard_install(driver, tty);
-
-	if (ret == 0) {
-		pti_tty_data = kmalloc(sizeof(struct pti_tty), GFP_KERNEL);
-		if (pti_tty_data == NULL)
-			return -ENOMEM;
-
-		if (idx == PTITTY_MINOR_START)
-			pti_tty_data->mc = pti_request_masterchannel(0, NULL);
-		else
-			pti_tty_data->mc = pti_request_masterchannel(2, NULL);
-
-		if (pti_tty_data->mc == NULL) {
-			kfree(pti_tty_data);
-			return -ENXIO;
-		}
-		tty->driver_data = pti_tty_data;
-	}
-
-	return ret;
-}
-
-/**
- * pti_tty_cleanup()- Used to de-allocate master-channel resources
- *		      tied to tty's of this driver.
- *
- * @tty: tty struct containing pti information.
- */
-static void pti_tty_cleanup(struct tty_struct *tty)
-{
-	struct pti_tty *pti_tty_data = tty->driver_data;
-	if (pti_tty_data == NULL)
-		return;
-	pti_release_masterchannel(pti_tty_data->mc);
-	kfree(pti_tty_data);
-	tty->driver_data = NULL;
-}
-
-/**
- * pti_tty_driver_write()-  Write trace debugging data through the char
- * interface to the PTI HW.  Part of the misc device implementation.
- *
- * @tty: tty struct containing pti information.
- * @buf: trace data to be written.
- * @len:  # of byte to write.
- *
- * Returns:
- *	int, # of bytes written
- *	otherwise, error
- */
-static int pti_tty_driver_write(struct tty_struct *tty,
-	const unsigned char *buf, int len)
-{
-	struct pti_tty *pti_tty_data = tty->driver_data;
-	if ((pti_tty_data != NULL) && (pti_tty_data->mc != NULL)) {
-		pti_write_to_aperture(pti_tty_data->mc, (u8 *)buf, len);
-		return len;
-	}
-	/*
-	 * we can't write to the pti hardware if the private driver_data
-	 * and the mc address is not there.
-	 */
-	else
-		return -EFAULT;
-}
-
-/**
- * pti_tty_write_room()- Always returns 2048.
- *
- * @tty: contains tty info of the pti driver.
- */
-static int pti_tty_write_room(struct tty_struct *tty)
-{
-	return 2048;
-}
-
-/**
- * pti_char_open()- Open an Application master, channel aperture
- * ID to the PTI device. Part of the misc device implementation.
- *
- * @inode: not used.
- * @filp:  Output- will have a masterchannel struct set containing
- *                 the allocated application PTI aperture write address.
- *
- * Returns:
- *	int, 0 for success
- *	otherwise, a fail value
- */
-static int pti_char_open(struct inode *inode, struct file *filp)
-{
-	struct pti_masterchannel *mc;
-
-	/*
-	 * We really do want to fail immediately if
-	 * pti_request_masterchannel() fails,
-	 * before assigning the value to filp->private_data.
-	 * Slightly easier to debug if this driver needs debugging.
-	 */
-	mc = pti_request_masterchannel(0, NULL);
-	if (mc == NULL)
-		return -ENOMEM;
-	filp->private_data = mc;
-	return 0;
-}
-
-/**
- * pti_char_release()-  Close a char channel to the PTI device. Part
- * of the misc device implementation.
- *
- * @inode: Not used in this implementaiton.
- * @filp:  Contains private_data that contains the master, channel
- *         ID to be released by the PTI device.
- *
- * Returns:
- *	always 0
- */
-static int pti_char_release(struct inode *inode, struct file *filp)
-{
-	pti_release_masterchannel(filp->private_data);
-	filp->private_data = NULL;
-	return 0;
-}
-
-/**
- * pti_char_write()-  Write trace debugging data through the char
- * interface to the PTI HW.  Part of the misc device implementation.
- *
- * @filp:  Contains private data which is used to obtain
- *         master, channel write ID.
- * @data:  trace data to be written.
- * @len:   # of byte to write.
- * @ppose: Not used in this function implementation.
- *
- * Returns:
- *	int, # of bytes written
- *	otherwise, error value
- *
- * Notes: From side discussions with Alan Cox and experimenting
- * with PTI debug HW like Nokia's Fido box and Lauterbach
- * devices, 8192 byte write buffer used by USER_COPY_SIZE was
- * deemed an appropriate size for this type of usage with
- * debugging HW.
- */
-static ssize_t pti_char_write(struct file *filp, const char __user *data,
-			      size_t len, loff_t *ppose)
-{
-	struct pti_masterchannel *mc;
-	void *kbuf;
-	const char __user *tmp;
-	size_t size = USER_COPY_SIZE;
-	size_t n = 0;
-
-	tmp = data;
-	mc = filp->private_data;
-
-	kbuf = kmalloc(size, GFP_KERNEL);
-	if (kbuf == NULL)  {
-		pr_err("%s(%d): buf allocation failed\n",
-			__func__, __LINE__);
-		return -ENOMEM;
-	}
-
-	do {
-		if (len - n > USER_COPY_SIZE)
-			size = USER_COPY_SIZE;
-		else
-			size = len - n;
-
-		if (copy_from_user(kbuf, tmp, size)) {
-			kfree(kbuf);
-			return n ? n : -EFAULT;
-		}
-
-		pti_write_to_aperture(mc, kbuf, size);
-		n  += size;
-		tmp += size;
-
-	} while (len > n);
-
-	kfree(kbuf);
-	return len;
-}
-
-static const struct tty_operations pti_tty_driver_ops = {
-	.open		= pti_tty_driver_open,
-	.close		= pti_tty_driver_close,
-	.write		= pti_tty_driver_write,
-	.write_room	= pti_tty_write_room,
-	.install	= pti_tty_install,
-	.cleanup	= pti_tty_cleanup
-};
-
-static const struct file_operations pti_char_driver_ops = {
-	.owner		= THIS_MODULE,
-	.write		= pti_char_write,
-	.open		= pti_char_open,
-	.release	= pti_char_release,
-};
-
-static struct miscdevice pti_char_driver = {
-	.minor		= MISC_DYNAMIC_MINOR,
-	.name		= CHARNAME,
-	.fops		= &pti_char_driver_ops
-};
-
-/**
- * pti_console_write()-  Write to the console that has been acquired.
- *
- * @c:   Not used in this implementaiton.
- * @buf: Data to be written.
- * @len: Length of buf.
- */
-static void pti_console_write(struct console *c, const char *buf, unsigned len)
-{
-	static struct pti_masterchannel mc = {.master  = CONSOLE_ID,
-					      .channel = 0};
-
-	mc.channel = pti_console_channel;
-	pti_console_channel = (pti_console_channel + 1) & 0x7f;
-
-	pti_write_full_frame_to_aperture(&mc, buf, len);
-}
-
-/**
- * pti_console_device()-  Return the driver tty structure and set the
- *			  associated index implementation.
- *
- * @c:     Console device of the driver.
- * @index: index associated with c.
- *
- * Returns:
- *	always value of pti_tty_driver structure when this function
- *	is called.
- */
-static struct tty_driver *pti_console_device(struct console *c, int *index)
-{
-	*index = c->index;
-	return pti_tty_driver;
-}
-
-/**
- * pti_console_setup()-  Initialize console variables used by the driver.
- *
- * @c:     Not used.
- * @opts:  Not used.
- *
- * Returns:
- *	always 0.
- */
-static int pti_console_setup(struct console *c, char *opts)
-{
-	pti_console_channel = 0;
-	pti_control_channel = 0;
-	return 0;
-}
-
-/*
- * pti_console struct, used to capture OS printk()'s and shift
- * out to the PTI device for debugging.  This cannot be
- * enabled upon boot because of the possibility of eating
- * any serial console printk's (race condition discovered).
- * The console should be enabled upon when the tty port is
- * used for the first time.  Since the primary purpose for
- * the tty port is to hook up syslog to it, the tty port
- * will be open for a really long time.
- */
-static struct console pti_console = {
-	.name		= TTYNAME,
-	.write		= pti_console_write,
-	.device		= pti_console_device,
-	.setup		= pti_console_setup,
-	.flags		= CON_PRINTBUFFER,
-	.index		= 0,
-};
-
-/**
- * pti_port_activate()- Used to start/initialize any items upon
- * first opening of tty_port().
- *
- * @port: The tty port number of the PTI device.
- * @tty:  The tty struct associated with this device.
- *
- * Returns:
- *	always returns 0
- *
- * Notes: The primary purpose of the PTI tty port 0 is to hook
- * the syslog daemon to it; thus this port will be open for a
- * very long time.
- */
-static int pti_port_activate(struct tty_port *port, struct tty_struct *tty)
-{
-	if (port->tty->index == PTITTY_MINOR_START)
-		console_start(&pti_console);
-	return 0;
-}
-
-/**
- * pti_port_shutdown()- Used to stop/shutdown any items upon the
- * last tty port close.
- *
- * @port: The tty port number of the PTI device.
- *
- * Notes: The primary purpose of the PTI tty port 0 is to hook
- * the syslog daemon to it; thus this port will be open for a
- * very long time.
- */
-static void pti_port_shutdown(struct tty_port *port)
-{
-	if (port->tty->index == PTITTY_MINOR_START)
-		console_stop(&pti_console);
-}
-
-static const struct tty_port_operations tty_port_ops = {
-	.activate = pti_port_activate,
-	.shutdown = pti_port_shutdown,
-};
-
-/*
- * Note the _probe() call sets everything up and ties the char and tty
- * to successfully detecting the PTI device on the pci bus.
- */
-
-/**
- * pti_pci_probe()- Used to detect pti on the pci bus and set
- *		    things up in the driver.
- *
- * @pdev: pci_dev struct values for pti.
- * @ent:  pci_device_id struct for pti driver.
- *
- * Returns:
- *	0 for success
- *	otherwise, error
- */
-static int pti_pci_probe(struct pci_dev *pdev,
-		const struct pci_device_id *ent)
-{
-	unsigned int a;
-	int retval;
-	int pci_bar = 1;
-
-	dev_dbg(&pdev->dev, "%s %s(%d): PTI PCI ID %04x:%04x\n", __FILE__,
-			__func__, __LINE__, pdev->vendor, pdev->device);
-
-	retval = misc_register(&pti_char_driver);
-	if (retval) {
-		pr_err("%s(%d): CHAR registration failed of pti driver\n",
-			__func__, __LINE__);
-		pr_err("%s(%d): Error value returned: %d\n",
-			__func__, __LINE__, retval);
-		goto err;
-	}
-
-	retval = pci_enable_device(pdev);
-	if (retval != 0) {
-		dev_err(&pdev->dev,
-			"%s: pci_enable_device() returned error %d\n",
-			__func__, retval);
-		goto err_unreg_misc;
-	}
-
-	drv_data = kzalloc(sizeof(*drv_data), GFP_KERNEL);
-	if (drv_data == NULL) {
-		retval = -ENOMEM;
-		dev_err(&pdev->dev,
-			"%s(%d): kmalloc() returned NULL memory.\n",
-			__func__, __LINE__);
-		goto err_disable_pci;
-	}
-	drv_data->pti_addr = pci_resource_start(pdev, pci_bar);
-
-	retval = pci_request_region(pdev, pci_bar, dev_name(&pdev->dev));
-	if (retval != 0) {
-		dev_err(&pdev->dev,
-			"%s(%d): pci_request_region() returned error %d\n",
-			__func__, __LINE__, retval);
-		goto err_free_dd;
-	}
-	drv_data->aperture_base = drv_data->pti_addr+APERTURE_14;
-	drv_data->pti_ioaddr =
-		ioremap((u32)drv_data->aperture_base,
-		APERTURE_LEN);
-	if (!drv_data->pti_ioaddr) {
-		retval = -ENOMEM;
-		goto err_rel_reg;
-	}
-
-	pci_set_drvdata(pdev, drv_data);
-
-	for (a = 0; a < PTITTY_MINOR_NUM; a++) {
-		struct tty_port *port = &drv_data->port[a];
-		tty_port_init(port);
-		port->ops = &tty_port_ops;
-
-		tty_port_register_device(port, pti_tty_driver, a, &pdev->dev);
-	}
-
-	register_console(&pti_console);
-
-	return 0;
-err_rel_reg:
-	pci_release_region(pdev, pci_bar);
-err_free_dd:
-	kfree(drv_data);
-err_disable_pci:
-	pci_disable_device(pdev);
-err_unreg_misc:
-	misc_deregister(&pti_char_driver);
-err:
-	return retval;
-}
-
-/**
- * pti_pci_remove()- Driver exit method to remove PTI from
- *		   PCI bus.
- * @pdev: variable containing pci info of PTI.
- */
-static void pti_pci_remove(struct pci_dev *pdev)
-{
-	struct pti_dev *drv_data = pci_get_drvdata(pdev);
-	unsigned int a;
-
-	unregister_console(&pti_console);
-
-	for (a = 0; a < PTITTY_MINOR_NUM; a++) {
-		tty_unregister_device(pti_tty_driver, a);
-		tty_port_destroy(&drv_data->port[a]);
-	}
-
-	iounmap(drv_data->pti_ioaddr);
-	kfree(drv_data);
-	pci_release_region(pdev, 1);
-	pci_disable_device(pdev);
-
-	misc_deregister(&pti_char_driver);
-}
-
-static struct pci_driver pti_pci_driver = {
-	.name		= PCINAME,
-	.id_table	= pci_ids,
-	.probe		= pti_pci_probe,
-	.remove		= pti_pci_remove,
-};
-
-/**
- * pti_init()- Overall entry/init call to the pti driver.
- *             It starts the registration process with the kernel.
- *
- * Returns:
- *	int __init, 0 for success
- *	otherwise value is an error
- *
- */
-static int __init pti_init(void)
-{
-	int retval;
-
-	/* First register module as tty device */
-
-	pti_tty_driver = alloc_tty_driver(PTITTY_MINOR_NUM);
-	if (pti_tty_driver == NULL) {
-		pr_err("%s(%d): Memory allocation failed for ptiTTY driver\n",
-			__func__, __LINE__);
-		return -ENOMEM;
-	}
-
-	pti_tty_driver->driver_name		= DRIVERNAME;
-	pti_tty_driver->name			= TTYNAME;
-	pti_tty_driver->major			= 0;
-	pti_tty_driver->minor_start		= PTITTY_MINOR_START;
-	pti_tty_driver->type			= TTY_DRIVER_TYPE_SYSTEM;
-	pti_tty_driver->subtype			= SYSTEM_TYPE_SYSCONS;
-	pti_tty_driver->flags			= TTY_DRIVER_REAL_RAW |
-						  TTY_DRIVER_DYNAMIC_DEV;
-	pti_tty_driver->init_termios		= tty_std_termios;
-
-	tty_set_operations(pti_tty_driver, &pti_tty_driver_ops);
-
-	retval = tty_register_driver(pti_tty_driver);
-	if (retval) {
-		pr_err("%s(%d): TTY registration failed of pti driver\n",
-			__func__, __LINE__);
-		pr_err("%s(%d): Error value returned: %d\n",
-			__func__, __LINE__, retval);
-
-		goto put_tty;
-	}
-
-	retval = pci_register_driver(&pti_pci_driver);
-	if (retval) {
-		pr_err("%s(%d): PCI registration failed of pti driver\n",
-			__func__, __LINE__);
-		pr_err("%s(%d): Error value returned: %d\n",
-			__func__, __LINE__, retval);
-		goto unreg_tty;
-	}
-
-	return 0;
-unreg_tty:
-	tty_unregister_driver(pti_tty_driver);
-put_tty:
-	put_tty_driver(pti_tty_driver);
-	pti_tty_driver = NULL;
-	return retval;
-}
-
-/**
- * pti_exit()- Unregisters this module as a tty and pci driver.
- */
-static void __exit pti_exit(void)
-{
-	tty_unregister_driver(pti_tty_driver);
-	pci_unregister_driver(&pti_pci_driver);
-	put_tty_driver(pti_tty_driver);
-}
-
-module_init(pti_init);
-module_exit(pti_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Ken Mills, Jay Freyensee");
-MODULE_DESCRIPTION("PTI Driver");
-
diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile
index b3ccae932660..730de6bf048b 100644
--- a/drivers/tty/Makefile
+++ b/drivers/tty/Makefile
@@ -9,8 +9,6 @@ obj-$(CONFIG_AUDIT)		+= tty_audit.o
 obj-$(CONFIG_MAGIC_SYSRQ)	+= sysrq.o
 obj-$(CONFIG_N_HDLC)		+= n_hdlc.o
 obj-$(CONFIG_N_GSM)		+= n_gsm.o
-obj-$(CONFIG_TRACE_ROUTER)	+= n_tracerouter.o
-obj-$(CONFIG_TRACE_SINK)	+= n_tracesink.o
 obj-$(CONFIG_R3964)		+= n_r3964.o
 
 obj-y				+= vt/
diff --git a/drivers/tty/n_tracerouter.c b/drivers/tty/n_tracerouter.c
deleted file mode 100644
index 4479af4d2fa5..000000000000
--- a/drivers/tty/n_tracerouter.c
+++ /dev/null
@@ -1,233 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *  n_tracerouter.c - Trace data router through tty space
- *
- *  Copyright (C) Intel 2011
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This trace router uses the Linux line discipline framework to route
- * trace data coming from a HW Modem to a PTI (Parallel Trace Module) port.
- * The solution is not specific to a HW modem and this line disciple can
- * be used to route any stream of data in kernel space.
- * This is part of a solution for the P1149.7, compact JTAG, standard.
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/ioctl.h>
-#include <linux/tty.h>
-#include <linux/tty_ldisc.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <linux/bug.h>
-#include "n_tracesink.h"
-
-/*
- * Other ldisc drivers use 65536 which basically means,
- * 'I can always accept 64k' and flow control is off.
- * This number is deemed appropriate for this driver.
- */
-#define RECEIVE_ROOM	65536
-#define DRIVERNAME	"n_tracerouter"
-
-/*
- * struct to hold private configuration data for this ldisc.
- * opencalled is used to hold if this ldisc has been opened.
- * kref_tty holds the tty reference the ldisc sits on top of.
- */
-struct tracerouter_data {
-	u8 opencalled;
-	struct tty_struct *kref_tty;
-};
-static struct tracerouter_data *tr_data;
-
-/* lock for when tty reference is being used */
-static DEFINE_MUTEX(routelock);
-
-/**
- * n_tracerouter_open() - Called when a tty is opened by a SW entity.
- * @tty: terminal device to the ldisc.
- *
- * Return:
- *      0 for success.
- *
- * Caveats: This should only be opened one time per SW entity.
- */
-static int n_tracerouter_open(struct tty_struct *tty)
-{
-	int retval = -EEXIST;
-
-	mutex_lock(&routelock);
-	if (tr_data->opencalled == 0) {
-
-		tr_data->kref_tty = tty_kref_get(tty);
-		if (tr_data->kref_tty == NULL) {
-			retval = -EFAULT;
-		} else {
-			tr_data->opencalled = 1;
-			tty->disc_data      = tr_data;
-			tty->receive_room   = RECEIVE_ROOM;
-			tty_driver_flush_buffer(tty);
-			retval = 0;
-		}
-	}
-	mutex_unlock(&routelock);
-	return retval;
-}
-
-/**
- * n_tracerouter_close() - close connection
- * @tty: terminal device to the ldisc.
- *
- * Called when a software entity wants to close a connection.
- */
-static void n_tracerouter_close(struct tty_struct *tty)
-{
-	struct tracerouter_data *tptr = tty->disc_data;
-
-	mutex_lock(&routelock);
-	WARN_ON(tptr->kref_tty != tr_data->kref_tty);
-	tty_driver_flush_buffer(tty);
-	tty_kref_put(tr_data->kref_tty);
-	tr_data->kref_tty = NULL;
-	tr_data->opencalled = 0;
-	tty->disc_data = NULL;
-	mutex_unlock(&routelock);
-}
-
-/**
- * n_tracerouter_read() - read request from user space
- * @tty:  terminal device passed into the ldisc.
- * @file: pointer to open file object.
- * @buf:  pointer to the data buffer that gets eventually returned.
- * @nr:   number of bytes of the data buffer that is returned.
- *
- * function that allows read() functionality in userspace. By default if this
- * is not implemented it returns -EIO. This module is functioning like a
- * router via n_tracerouter_receivebuf(), and there is no real requirement
- * to implement this function. However, an error return value other than
- * -EIO should be used just to show that there was an intent not to have
- * this function implemented.  Return value based on read() man pages.
- *
- * Return:
- *	 -EINVAL
- */
-static ssize_t n_tracerouter_read(struct tty_struct *tty, struct file *file,
-				  unsigned char __user *buf, size_t nr) {
-	return -EINVAL;
-}
-
-/**
- * n_tracerouter_write() - Function that allows write() in userspace.
- * @tty:  terminal device passed into the ldisc.
- * @file: pointer to open file object.
- * @buf:  pointer to the data buffer that gets eventually returned.
- * @nr:   number of bytes of the data buffer that is returned.
- *
- * By default if this is not implemented, it returns -EIO.
- * This should not be implemented, ever, because
- * 1. this driver is functioning like a router via
- *    n_tracerouter_receivebuf()
- * 2. No writes to HW will ever go through this line discpline driver.
- * However, an error return value other than -EIO should be used
- * just to show that there was an intent not to have this function
- * implemented.  Return value based on write() man pages.
- *
- * Return:
- *	-EINVAL
- */
-static ssize_t n_tracerouter_write(struct tty_struct *tty, struct file *file,
-				   const unsigned char *buf, size_t nr) {
-	return -EINVAL;
-}
-
-/**
- * n_tracerouter_receivebuf() - Routing function for driver.
- * @tty: terminal device passed into the ldisc.  It's assumed
- *       tty will never be NULL.
- * @cp:  buffer, block of characters to be eventually read by
- *       someone, somewhere (user read() call or some kernel function).
- * @fp:  flag buffer.
- * @count: number of characters (aka, bytes) in cp.
- *
- * This function takes the input buffer, cp, and passes it to
- * an external API function for processing.
- */
-static void n_tracerouter_receivebuf(struct tty_struct *tty,
-					const unsigned char *cp,
-					char *fp, int count)
-{
-	mutex_lock(&routelock);
-	n_tracesink_datadrain((u8 *) cp, count);
-	mutex_unlock(&routelock);
-}
-
-/*
- * Flush buffer is not impelemented as the ldisc has no internal buffering
- * so the tty_driver_flush_buffer() is sufficient for this driver's needs.
- */
-
-static struct tty_ldisc_ops tty_ptirouter_ldisc = {
-	.owner		= THIS_MODULE,
-	.magic		= TTY_LDISC_MAGIC,
-	.name		= DRIVERNAME,
-	.open		= n_tracerouter_open,
-	.close		= n_tracerouter_close,
-	.read		= n_tracerouter_read,
-	.write		= n_tracerouter_write,
-	.receive_buf	= n_tracerouter_receivebuf
-};
-
-/**
- * n_tracerouter_init -	module initialisation
- *
- * Registers this module as a line discipline driver.
- *
- * Return:
- *	0 for success, any other value error.
- */
-static int __init n_tracerouter_init(void)
-{
-	int retval;
-
-	tr_data = kzalloc(sizeof(struct tracerouter_data), GFP_KERNEL);
-	if (tr_data == NULL)
-		return -ENOMEM;
-
-
-	/* Note N_TRACEROUTER is defined in linux/tty.h */
-	retval = tty_register_ldisc(N_TRACEROUTER, &tty_ptirouter_ldisc);
-	if (retval < 0) {
-		pr_err("%s: Registration failed: %d\n", __func__, retval);
-		kfree(tr_data);
-	}
-	return retval;
-}
-
-/**
- * n_tracerouter_exit -	module unload
- *
- * Removes this module as a line discipline driver.
- */
-static void __exit n_tracerouter_exit(void)
-{
-	int retval = tty_unregister_ldisc(N_TRACEROUTER);
-
-	if (retval < 0)
-		pr_err("%s: Unregistration failed: %d\n", __func__,  retval);
-	else
-		kfree(tr_data);
-}
-
-module_init(n_tracerouter_init);
-module_exit(n_tracerouter_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jay Freyensee");
-MODULE_ALIAS_LDISC(N_TRACEROUTER);
-MODULE_DESCRIPTION("Trace router ldisc driver");
diff --git a/drivers/tty/n_tracesink.c b/drivers/tty/n_tracesink.c
deleted file mode 100644
index d96ba82cc356..000000000000
--- a/drivers/tty/n_tracesink.c
+++ /dev/null
@@ -1,228 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *  n_tracesink.c - Trace data router and sink path through tty space.
- *
- *  Copyright (C) Intel 2011
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * The trace sink uses the Linux line discipline framework to receive
- * trace data coming from the PTI source line discipline driver
- * to a user-desired tty port, like USB.
- * This is to provide a way to extract modem trace data on
- * devices that do not have a PTI HW module, or just need modem
- * trace data to come out of a different HW output port.
- * This is part of a solution for the P1149.7, compact JTAG, standard.
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/ioctl.h>
-#include <linux/tty.h>
-#include <linux/tty_ldisc.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/bug.h>
-#include "n_tracesink.h"
-
-/*
- * Other ldisc drivers use 65536 which basically means,
- * 'I can always accept 64k' and flow control is off.
- * This number is deemed appropriate for this driver.
- */
-#define RECEIVE_ROOM	65536
-#define DRIVERNAME	"n_tracesink"
-
-/*
- * there is a quirk with this ldisc is he can write data
- * to a tty from anyone calling his kernel API, which
- * meets customer requirements in the drivers/misc/pti.c
- * project.  So he needs to know when he can and cannot write when
- * the API is called. In theory, the API can be called
- * after an init() but before a successful open() which
- * would crash the system if tty is not checked.
- */
-static struct tty_struct *this_tty;
-static DEFINE_MUTEX(writelock);
-
-/**
- * n_tracesink_open() - Called when a tty is opened by a SW entity.
- * @tty: terminal device to the ldisc.
- *
- * Return:
- *      0 for success,
- *      -EFAULT = couldn't get a tty kref n_tracesink will sit
- *       on top of
- *      -EEXIST = open() called successfully once and it cannot
- *      be called again.
- *
- * Caveats: open() should only be successful the first time a
- * SW entity calls it.
- */
-static int n_tracesink_open(struct tty_struct *tty)
-{
-	int retval = -EEXIST;
-
-	mutex_lock(&writelock);
-	if (this_tty == NULL) {
-		this_tty = tty_kref_get(tty);
-		if (this_tty == NULL) {
-			retval = -EFAULT;
-		} else {
-			tty->disc_data = this_tty;
-			tty_driver_flush_buffer(tty);
-			retval = 0;
-		}
-	}
-	mutex_unlock(&writelock);
-
-	return retval;
-}
-
-/**
- * n_tracesink_close() - close connection
- * @tty: terminal device to the ldisc.
- *
- * Called when a software entity wants to close a connection.
- */
-static void n_tracesink_close(struct tty_struct *tty)
-{
-	mutex_lock(&writelock);
-	tty_driver_flush_buffer(tty);
-	tty_kref_put(this_tty);
-	this_tty = NULL;
-	tty->disc_data = NULL;
-	mutex_unlock(&writelock);
-}
-
-/**
- * n_tracesink_read() - read request from user space
- * @tty:  terminal device passed into the ldisc.
- * @file: pointer to open file object.
- * @buf:  pointer to the data buffer that gets eventually returned.
- * @nr:   number of bytes of the data buffer that is returned.
- *
- * function that allows read() functionality in userspace. By default if this
- * is not implemented it returns -EIO. This module is functioning like a
- * router via n_tracesink_receivebuf(), and there is no real requirement
- * to implement this function. However, an error return value other than
- * -EIO should be used just to show that there was an intent not to have
- * this function implemented.  Return value based on read() man pages.
- *
- * Return:
- *	 -EINVAL
- */
-static ssize_t n_tracesink_read(struct tty_struct *tty, struct file *file,
-				unsigned char __user *buf, size_t nr) {
-	return -EINVAL;
-}
-
-/**
- * n_tracesink_write() - Function that allows write() in userspace.
- * @tty:  terminal device passed into the ldisc.
- * @file: pointer to open file object.
- * @buf:  pointer to the data buffer that gets eventually returned.
- * @nr:   number of bytes of the data buffer that is returned.
- *
- * By default if this is not implemented, it returns -EIO.
- * This should not be implemented, ever, because
- * 1. this driver is functioning like a router via
- *    n_tracesink_receivebuf()
- * 2. No writes to HW will ever go through this line discpline driver.
- * However, an error return value other than -EIO should be used
- * just to show that there was an intent not to have this function
- * implemented.  Return value based on write() man pages.
- *
- * Return:
- *	-EINVAL
- */
-static ssize_t n_tracesink_write(struct tty_struct *tty, struct file *file,
-				 const unsigned char *buf, size_t nr) {
-	return -EINVAL;
-}
-
-/**
- * n_tracesink_datadrain() - Kernel API function used to route
- *			     trace debugging data to user-defined
- *			     port like USB.
- *
- * @buf:   Trace debuging data buffer to write to tty target
- *         port. Null value will return with no write occurring.
- * @count: Size of buf. Value of 0 or a negative number will
- *         return with no write occuring.
- *
- * Caveat: If this line discipline does not set the tty it sits
- * on top of via an open() call, this API function will not
- * call the tty's write() call because it will have no pointer
- * to call the write().
- */
-void n_tracesink_datadrain(u8 *buf, int count)
-{
-	mutex_lock(&writelock);
-
-	if ((buf != NULL) && (count > 0) && (this_tty != NULL))
-		this_tty->ops->write(this_tty, buf, count);
-
-	mutex_unlock(&writelock);
-}
-EXPORT_SYMBOL_GPL(n_tracesink_datadrain);
-
-/*
- * Flush buffer is not impelemented as the ldisc has no internal buffering
- * so the tty_driver_flush_buffer() is sufficient for this driver's needs.
- */
-
-/*
- * tty_ldisc function operations for this driver.
- */
-static struct tty_ldisc_ops tty_n_tracesink = {
-	.owner		= THIS_MODULE,
-	.magic		= TTY_LDISC_MAGIC,
-	.name		= DRIVERNAME,
-	.open		= n_tracesink_open,
-	.close		= n_tracesink_close,
-	.read		= n_tracesink_read,
-	.write		= n_tracesink_write
-};
-
-/**
- * n_tracesink_init-	module initialisation
- *
- * Registers this module as a line discipline driver.
- *
- * Return:
- *	0 for success, any other value error.
- */
-static int __init n_tracesink_init(void)
-{
-	/* Note N_TRACESINK is defined in linux/tty.h */
-	int retval = tty_register_ldisc(N_TRACESINK, &tty_n_tracesink);
-
-	if (retval < 0)
-		pr_err("%s: Registration failed: %d\n", __func__, retval);
-
-	return retval;
-}
-
-/**
- * n_tracesink_exit -	module unload
- *
- * Removes this module as a line discipline driver.
- */
-static void __exit n_tracesink_exit(void)
-{
-	int retval = tty_unregister_ldisc(N_TRACESINK);
-
-	if (retval < 0)
-		pr_err("%s: Unregistration failed: %d\n", __func__,  retval);
-}
-
-module_init(n_tracesink_init);
-module_exit(n_tracesink_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jay Freyensee");
-MODULE_ALIAS_LDISC(N_TRACESINK);
-MODULE_DESCRIPTION("Trace sink ldisc driver");
diff --git a/drivers/tty/n_tracesink.h b/drivers/tty/n_tracesink.h
deleted file mode 100644
index 7031d515a700..000000000000
--- a/drivers/tty/n_tracesink.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *  n_tracesink.h - Kernel driver API to route trace data in kernel space.
- *
- *  Copyright (C) Intel 2011
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * The PTI (Parallel Trace Interface) driver directs trace data routed from
- * various parts in the system out through the Intel Penwell PTI port and
- * out of the mobile device for analysis with a debugging tool
- * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7,
- * compact JTAG, standard.
- *
- * This header file is used by n_tracerouter to be able to send the
- * data of it's tty port to the tty port this module sits.  This
- * mechanism can also be used independent of the PTI module.
- *
- */
-
-#ifndef N_TRACESINK_H_
-#define N_TRACESINK_H_
-
-void n_tracesink_datadrain(u8 *buf, int count);
-
-#endif
diff --git a/include/linux/intel-pti.h b/include/linux/intel-pti.h
deleted file mode 100644
index fcd841a90f2f..000000000000
--- a/include/linux/intel-pti.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- *  Copyright (C) Intel 2011
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * The PTI (Parallel Trace Interface) driver directs trace data routed from
- * various parts in the system out through the Intel Penwell PTI port and
- * out of the mobile device for analysis with a debugging tool
- * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7,
- * compact JTAG, standard.
- *
- * This header file will allow other parts of the OS to use the
- * interface to write out it's contents for debugging a mobile system.
- */
-
-#ifndef LINUX_INTEL_PTI_H_
-#define LINUX_INTEL_PTI_H_
-
-/* offset for last dword of any PTI message. Part of MIPI P1149.7 */
-#define PTI_LASTDWORD_DTS	0x30
-
-/* basic structure used as a write address to the PTI HW */
-struct pti_masterchannel {
-	u8 master;
-	u8 channel;
-};
-
-/* the following functions are defined in misc/pti.c */
-void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count);
-struct pti_masterchannel *pti_request_masterchannel(u8 type,
-						    const char *thread_name);
-void pti_release_masterchannel(struct pti_masterchannel *mc);
-
-#endif /* LINUX_INTEL_PTI_H_ */
-- 
cgit v1.2.3


From 8544717cdacc2f33f0f53a3b34c5125b37e13ce9 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Thu, 14 Jan 2021 19:07:48 +0200
Subject: bus: fsl-mc: move fsl_mc_command struct in a uapi header

Define "struct fsl_mc_command" as a structure that can cross the
user/kernel boundary.

Acked-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Link: https://lore.kernel.org/r/20210114170752.2927915-2-ciorneiioana@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 MAINTAINERS                 |  1 +
 include/linux/fsl/mc.h      |  8 +-------
 include/uapi/linux/fsl_mc.h | 25 +++++++++++++++++++++++++
 3 files changed, 27 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/fsl_mc.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 23273a59da6f..128455811980 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14652,6 +14652,7 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
 F:	Documentation/networking/device_drivers/ethernet/freescale/dpaa2/overview.rst
 F:	drivers/bus/fsl-mc/
+F:	include/uapi/linux/fsl_mc.h
 
 QT1010 MEDIA DRIVER
 M:	Antti Palosaari <crope@iki.fi>
diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index db244874e834..63b56aba925a 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -13,6 +13,7 @@
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/interrupt.h>
+#include <uapi/linux/fsl_mc.h>
 
 #define FSL_MC_VENDOR_FREESCALE	0x1957
 
@@ -209,8 +210,6 @@ struct fsl_mc_device {
 #define to_fsl_mc_device(_dev) \
 	container_of(_dev, struct fsl_mc_device, dev)
 
-#define MC_CMD_NUM_OF_PARAMS	7
-
 struct mc_cmd_header {
 	u8 src_id;
 	u8 flags_hw;
@@ -220,11 +219,6 @@ struct mc_cmd_header {
 	__le16 cmd_id;
 };
 
-struct fsl_mc_command {
-	__le64 header;
-	__le64 params[MC_CMD_NUM_OF_PARAMS];
-};
-
 enum mc_cmd_status {
 	MC_CMD_STATUS_OK = 0x0, /* Completed successfully */
 	MC_CMD_STATUS_READY = 0x1, /* Ready to be processed */
diff --git a/include/uapi/linux/fsl_mc.h b/include/uapi/linux/fsl_mc.h
new file mode 100644
index 000000000000..cf56d46f052e
--- /dev/null
+++ b/include/uapi/linux/fsl_mc.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Management Complex (MC) userspace public interface
+ *
+ * Copyright 2021 NXP
+ *
+ */
+#ifndef _UAPI_FSL_MC_H_
+#define _UAPI_FSL_MC_H_
+
+#include <linux/types.h>
+
+#define MC_CMD_NUM_OF_PARAMS	7
+
+/**
+ * struct fsl_mc_command - Management Complex (MC) command structure
+ * @header: MC command header
+ * @params: MC command parameters
+ */
+struct fsl_mc_command {
+	__le64 header;
+	__le64 params[MC_CMD_NUM_OF_PARAMS];
+};
+
+#endif /* _UAPI_FSL_MC_H_ */
-- 
cgit v1.2.3


From 1423de718e6a0ce00372ab119fa40060ff50883e Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 2 Jun 2020 15:56:55 -0500
Subject: PCI/ACPI: Make acpi_pci_osc_control_set() static

acpi_pci_osc_control_set() is only called from pci_root.c, so stop
exporting it and make it static.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/pci_root.c | 3 +--
 include/linux/acpi.h    | 3 ---
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 0bf072cef6cf..7d5a4bd4207c 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -353,7 +353,7 @@ EXPORT_SYMBOL_GPL(acpi_get_pci_dev);
  * _OSC bits the BIOS has granted control of, but its contents are meaningless
  * on failure.
  **/
-acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req)
+static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req)
 {
 	struct acpi_pci_root *root;
 	acpi_status status = AE_OK;
@@ -406,7 +406,6 @@ out:
 	mutex_unlock(&osc_lock);
 	return status;
 }
-EXPORT_SYMBOL(acpi_pci_osc_control_set);
 
 static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm,
 				 bool is_pcie)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 053bf05fb1f7..4703daafcce9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -581,9 +581,6 @@ extern bool osc_pc_lpi_support_confirmed;
 #define ACPI_GSB_ACCESS_ATTRIB_RAW_BYTES	0x0000000E
 #define ACPI_GSB_ACCESS_ATTRIB_RAW_PROCESS	0x0000000F
 
-extern acpi_status acpi_pci_osc_control_set(acpi_handle handle,
-					     u32 *mask, u32 req);
-
 /* Enable _OST when all relevant hotplug operations are enabled */
 #if defined(CONFIG_ACPI_HOTPLUG_CPU) &&			\
 	defined(CONFIG_ACPI_HOTPLUG_MEMORY) &&		\
-- 
cgit v1.2.3


From 620a6dc40754dc218f5b6389b5d335e9a107fd29 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Fri, 22 Jan 2021 12:39:43 +0000
Subject: sched/topology: Make sched_init_numa() use a set for the
 deduplicating sort
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The deduplicating sort in sched_init_numa() assumes that the first line in
the distance table contains all unique values in the entire table. I've
been trying to pen what this exactly means for the topology, but it's not
straightforward. For instance, topology.c uses this example:

  node   0   1   2   3
    0:  10  20  20  30
    1:  20  10  20  20
    2:  20  20  10  20
    3:  30  20  20  10

  0 ----- 1
  |     / |
  |   /   |
  | /     |
  2 ----- 3

Which works out just fine. However, if we swap nodes 0 and 1:

  1 ----- 0
  |     / |
  |   /   |
  | /     |
  2 ----- 3

we get this distance table:

  node   0  1  2  3
    0:  10 20 20 20
    1:  20 10 20 30
    2:  20 20 10 20
    3:  20 30 20 10

Which breaks the deduplicating sort (non-representative first line). In
this case this would just be a renumbering exercise, but it so happens that
we can have a deduplicating sort that goes through the whole table in O(n²)
at the extra cost of a temporary memory allocation (i.e. any form of set).

The ACPI spec (SLIT) mentions distances are encoded on 8 bits. Following
this, implement the set as a 256-bits bitmap. Should this not be
satisfactory (i.e. we want to support 32-bit values), then we'll have to go
for some other sparse set implementation.

This has the added benefit of letting us allocate just the right amount of
memory for sched_domains_numa_distance[], rather than an arbitrary
(nr_node_ids + 1).

Note: DT binding equivalent (distance-map) decodes distances as 32-bit
values.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210122123943.1217-2-valentin.schneider@arm.com
---
 include/linux/topology.h |  1 +
 kernel/sched/topology.c  | 99 +++++++++++++++++++++++-------------------------
 2 files changed, 49 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index ad03df1cc266..7634cd737061 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -48,6 +48,7 @@ int arch_update_cpu_topology(void);
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE		10
 #define REMOTE_DISTANCE		20
+#define DISTANCE_BITS           8
 #ifndef node_distance
 #define node_distance(from,to)	((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
 #endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 5d3675c7a76b..bf5c9bd10bfe 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1596,66 +1596,58 @@ static void init_numa_topology_type(void)
 	}
 }
 
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
 void sched_init_numa(void)
 {
-	int next_distance, curr_distance = node_distance(0, 0);
 	struct sched_domain_topology_level *tl;
-	int level = 0;
-	int i, j, k;
-
-	sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
-	if (!sched_domains_numa_distance)
-		return;
-
-	/* Includes NUMA identity node at level 0. */
-	sched_domains_numa_distance[level++] = curr_distance;
-	sched_domains_numa_levels = level;
+	unsigned long *distance_map;
+	int nr_levels = 0;
+	int i, j;
 
 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
-	 *
-	 * Assumes node_distance(0,j) includes all distances in
-	 * node_distance(i,j) in order to avoid cubic time.
 	 */
-	next_distance = curr_distance;
+	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+	if (!distance_map)
+		return;
+
+	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			for (k = 0; k < nr_node_ids; k++) {
-				int distance = node_distance(i, k);
-
-				if (distance > curr_distance &&
-				    (distance < next_distance ||
-				     next_distance == curr_distance))
-					next_distance = distance;
-
-				/*
-				 * While not a strong assumption it would be nice to know
-				 * about cases where if node A is connected to B, B is not
-				 * equally connected to A.
-				 */
-				if (sched_debug() && node_distance(k, i) != distance)
-					sched_numa_warn("Node-distance not symmetric");
+			int distance = node_distance(i, j);
 
-				if (sched_debug() && i && !find_numa_distance(distance))
-					sched_numa_warn("Node-0 not representative");
+			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+				sched_numa_warn("Invalid distance value range");
+				return;
 			}
-			if (next_distance != curr_distance) {
-				sched_domains_numa_distance[level++] = next_distance;
-				sched_domains_numa_levels = level;
-				curr_distance = next_distance;
-			} else break;
+
+			bitmap_set(distance_map, distance, 1);
 		}
+	}
+	/*
+	 * We can now figure out how many unique distance values there are and
+	 * allocate memory accordingly.
+	 */
+	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
 
-		/*
-		 * In case of sched_debug() we verify the above assumption.
-		 */
-		if (!sched_debug())
-			break;
+	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+	if (!sched_domains_numa_distance) {
+		bitmap_free(distance_map);
+		return;
+	}
+
+	for (i = 0, j = 0; i < nr_levels; i++, j++) {
+		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+		sched_domains_numa_distance[i] = j;
 	}
 
+	bitmap_free(distance_map);
+
 	/*
-	 * 'level' contains the number of unique distances
+	 * 'nr_levels' contains the number of unique distances
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
@@ -1664,15 +1656,15 @@ void sched_init_numa(void)
 	/*
 	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
 	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
-	 * the array will contain less then 'level' members. This could be
+	 * the array will contain less then 'nr_levels' members. This could be
 	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
 	 * in other functions.
 	 *
-	 * We reset it to 'level' at the end of this function.
+	 * We reset it to 'nr_levels' at the end of this function.
 	 */
 	sched_domains_numa_levels = 0;
 
-	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
 	if (!sched_domains_numa_masks)
 		return;
 
@@ -1680,7 +1672,7 @@ void sched_init_numa(void)
 	 * Now for each level, construct a mask per node which contains all
 	 * CPUs of nodes that are that many hops away from us.
 	 */
-	for (i = 0; i < level; i++) {
+	for (i = 0; i < nr_levels; i++) {
 		sched_domains_numa_masks[i] =
 			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
 		if (!sched_domains_numa_masks[i])
@@ -1688,12 +1680,17 @@ void sched_init_numa(void)
 
 		for (j = 0; j < nr_node_ids; j++) {
 			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+			int k;
+
 			if (!mask)
 				return;
 
 			sched_domains_numa_masks[i][j] = mask;
 
 			for_each_node(k) {
+				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+					sched_numa_warn("Node-distance not symmetric");
+
 				if (node_distance(j, k) > sched_domains_numa_distance[i])
 					continue;
 
@@ -1705,7 +1702,7 @@ void sched_init_numa(void)
 	/* Compute default topology size */
 	for (i = 0; sched_domain_topology[i].mask; i++);
 
-	tl = kzalloc((i + level + 1) *
+	tl = kzalloc((i + nr_levels) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
@@ -1728,7 +1725,7 @@ void sched_init_numa(void)
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
-	for (j = 1; j < level; i++, j++) {
+	for (j = 1; j < nr_levels; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
 			.mask = sd_numa_mask,
 			.sd_flags = cpu_numa_flags,
@@ -1740,8 +1737,8 @@ void sched_init_numa(void)
 
 	sched_domain_topology = tl;
 
-	sched_domains_numa_levels = level;
-	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+	sched_domains_numa_levels = nr_levels;
+	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
 
 	init_numa_topology_type();
 }
-- 
cgit v1.2.3


From 1875dc5b8ff4690547c446ef222083e28e2d9463 Mon Sep 17 00:00:00 2001
From: Peter Oskolkov <posk@google.com>
Date: Tue, 26 Jan 2021 11:34:49 -0800
Subject: sched: Correctly sort struct predeclarations

The comment says:
  /* task_struct member predeclarations (sorted alphabetically): */

So move io_uring_task where it belongs (alphabetically).

Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210126193449.487547-1-posk@google.com
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 31169e70476a..e1152227bb79 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -47,6 +47,7 @@ struct cfs_rq;
 struct fs_struct;
 struct futex_pi_state;
 struct io_context;
+struct io_uring_task;
 struct mempolicy;
 struct nameidata;
 struct nsproxy;
@@ -65,7 +66,6 @@ struct sighand_struct;
 struct signal_struct;
 struct task_delay_info;
 struct task_group;
-struct io_uring_task;
 
 /*
  * Task state bitmask. NOTE! These bits are also
-- 
cgit v1.2.3


From 3175199ab0ac8c874ec25c6bf169f74888917435 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 26 Jan 2021 15:52:34 +0100
Subject: block: split bio_kmalloc from bio_alloc_bioset

bio_kmalloc shares almost no logic with the bio_set based fast path
in bio_alloc_bioset.  Split it into an entirely separate implementation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Acked-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 167 ++++++++++++++++++++++++++--------------------------
 include/linux/bio.h |   6 +-
 2 files changed, 86 insertions(+), 87 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index dfd7740a3230..d4375619348c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -396,123 +396,101 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
  * @nr_iovecs:	number of iovecs to pre-allocate
  * @bs:		the bio_set to allocate from.
  *
- * Description:
- *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
- *   backed by the @bs's mempool.
+ * Allocate a bio from the mempools in @bs.
  *
- *   When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
- *   always be able to allocate a bio. This is due to the mempool guarantees.
- *   To make this work, callers must never allocate more than 1 bio at a time
- *   from this pool. Callers that need to allocate more than 1 bio must always
- *   submit the previously allocated bio for IO before attempting to allocate
- *   a new one. Failure to do so can cause deadlocks under memory pressure.
+ * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
+ * allocate a bio.  This is due to the mempool guarantees.  To make this work,
+ * callers must never allocate more than 1 bio at a time from the general pool.
+ * Callers that need to allocate more than 1 bio must always submit the
+ * previously allocated bio for IO before attempting to allocate a new one.
+ * Failure to do so can cause deadlocks under memory pressure.
  *
- *   Note that when running under submit_bio_noacct() (i.e. any block
- *   driver), bios are not submitted until after you return - see the code in
- *   submit_bio_noacct() that converts recursion into iteration, to prevent
- *   stack overflows.
+ * Note that when running under submit_bio_noacct() (i.e. any block driver),
+ * bios are not submitted until after you return - see the code in
+ * submit_bio_noacct() that converts recursion into iteration, to prevent
+ * stack overflows.
  *
- *   This would normally mean allocating multiple bios under
- *   submit_bio_noacct() would be susceptible to deadlocks, but we have
- *   deadlock avoidance code that resubmits any blocked bios from a rescuer
- *   thread.
+ * This would normally mean allocating multiple bios under submit_bio_noacct()
+ * would be susceptible to deadlocks, but we have
+ * deadlock avoidance code that resubmits any blocked bios from a rescuer
+ * thread.
  *
- *   However, we do not guarantee forward progress for allocations from other
- *   mempools. Doing multiple allocations from the same mempool under
- *   submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
- *   for per bio allocations.
+ * However, we do not guarantee forward progress for allocations from other
+ * mempools. Doing multiple allocations from the same mempool under
+ * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
+ * for per bio allocations.
  *
- *   RETURNS:
- *   Pointer to new bio on success, NULL on failure.
+ * Returns: Pointer to new bio on success, NULL on failure.
  */
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 			     struct bio_set *bs)
 {
 	gfp_t saved_gfp = gfp_mask;
-	unsigned front_pad;
-	unsigned inline_vecs;
-	struct bio_vec *bvl = NULL;
 	struct bio *bio;
 	void *p;
 
-	if (!bs) {
-		if (nr_iovecs > UIO_MAXIOV)
-			return NULL;
-
-		p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
-		front_pad = 0;
-		inline_vecs = nr_iovecs;
-	} else {
-		/* should not use nobvec bioset for nr_iovecs > 0 */
-		if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
-				 nr_iovecs > 0))
-			return NULL;
-		/*
-		 * submit_bio_noacct() converts recursion to iteration; this
-		 * means if we're running beneath it, any bios we allocate and
-		 * submit will not be submitted (and thus freed) until after we
-		 * return.
-		 *
-		 * This exposes us to a potential deadlock if we allocate
-		 * multiple bios from the same bio_set() while running
-		 * underneath submit_bio_noacct(). If we were to allocate
-		 * multiple bios (say a stacking block driver that was splitting
-		 * bios), we would deadlock if we exhausted the mempool's
-		 * reserve.
-		 *
-		 * We solve this, and guarantee forward progress, with a rescuer
-		 * workqueue per bio_set. If we go to allocate and there are
-		 * bios on current->bio_list, we first try the allocation
-		 * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
-		 * bios we would be blocking to the rescuer workqueue before
-		 * we retry with the original gfp_flags.
-		 */
-
-		if (current->bio_list &&
-		    (!bio_list_empty(&current->bio_list[0]) ||
-		     !bio_list_empty(&current->bio_list[1])) &&
-		    bs->rescue_workqueue)
-			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+	/* should not use nobvec bioset for nr_iovecs > 0 */
+	if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0))
+		return NULL;
 
+	/*
+	 * submit_bio_noacct() converts recursion to iteration; this means if
+	 * we're running beneath it, any bios we allocate and submit will not be
+	 * submitted (and thus freed) until after we return.
+	 *
+	 * This exposes us to a potential deadlock if we allocate multiple bios
+	 * from the same bio_set() while running underneath submit_bio_noacct().
+	 * If we were to allocate multiple bios (say a stacking block driver
+	 * that was splitting bios), we would deadlock if we exhausted the
+	 * mempool's reserve.
+	 *
+	 * We solve this, and guarantee forward progress, with a rescuer
+	 * workqueue per bio_set. If we go to allocate and there are bios on
+	 * current->bio_list, we first try the allocation without
+	 * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
+	 * blocking to the rescuer workqueue before we retry with the original
+	 * gfp_flags.
+	 */
+	if (current->bio_list &&
+	    (!bio_list_empty(&current->bio_list[0]) ||
+	     !bio_list_empty(&current->bio_list[1])) &&
+	    bs->rescue_workqueue)
+		gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+
+	p = mempool_alloc(&bs->bio_pool, gfp_mask);
+	if (!p && gfp_mask != saved_gfp) {
+		punt_bios_to_rescuer(bs);
+		gfp_mask = saved_gfp;
 		p = mempool_alloc(&bs->bio_pool, gfp_mask);
-		if (!p && gfp_mask != saved_gfp) {
-			punt_bios_to_rescuer(bs);
-			gfp_mask = saved_gfp;
-			p = mempool_alloc(&bs->bio_pool, gfp_mask);
-		}
-
-		front_pad = bs->front_pad;
-		inline_vecs = BIO_INLINE_VECS;
 	}
-
 	if (unlikely(!p))
 		return NULL;
 
-	bio = p + front_pad;
-	bio_init(bio, NULL, 0);
-
-	if (nr_iovecs > inline_vecs) {
+	bio = p + bs->front_pad;
+	if (nr_iovecs > BIO_INLINE_VECS) {
 		unsigned long idx = 0;
+		struct bio_vec *bvl = NULL;
 
 		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
 		if (!bvl && gfp_mask != saved_gfp) {
 			punt_bios_to_rescuer(bs);
 			gfp_mask = saved_gfp;
-			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
+			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx,
+					 &bs->bvec_pool);
 		}
 
 		if (unlikely(!bvl))
 			goto err_free;
 
 		bio->bi_flags |= idx << BVEC_POOL_OFFSET;
-		bio->bi_max_vecs = bvec_nr_vecs(idx);
+		bio_init(bio, bvl, bvec_nr_vecs(idx));
 	} else if (nr_iovecs) {
-		bvl = bio->bi_inline_vecs;
-		bio->bi_max_vecs = inline_vecs;
+		bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS);
+	} else {
+		bio_init(bio, NULL, 0);
 	}
 
 	bio->bi_pool = bs;
-	bio->bi_io_vec = bvl;
 	return bio;
 
 err_free:
@@ -521,6 +499,31 @@ err_free:
 }
 EXPORT_SYMBOL(bio_alloc_bioset);
 
+/**
+ * bio_kmalloc - kmalloc a bio for I/O
+ * @gfp_mask:   the GFP_* mask given to the slab allocator
+ * @nr_iovecs:	number of iovecs to pre-allocate
+ *
+ * Use kmalloc to allocate and initialize a bio.
+ *
+ * Returns: Pointer to new bio on success, NULL on failure.
+ */
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+	struct bio *bio;
+
+	if (nr_iovecs > UIO_MAXIOV)
+		return NULL;
+
+	bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
+	if (unlikely(!bio))
+		return NULL;
+	bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs);
+	bio->bi_pool = NULL;
+	return bio;
+}
+EXPORT_SYMBOL(bio_kmalloc);
+
 void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
 {
 	unsigned long flags;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 676870b2c88d..c74857cf1252 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -408,6 +408,7 @@ extern int biovec_init_pool(mempool_t *pool, int pool_entries);
 extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src);
 
 extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs);
 extern void bio_put(struct bio *);
 
 extern void __bio_clone_fast(struct bio *, struct bio *);
@@ -420,11 +421,6 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 	return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
 }
 
-static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
-	return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
-}
-
 extern blk_qc_t submit_bio(struct bio *);
 
 extern void bio_endio(struct bio *);
-- 
cgit v1.2.3


From c6bf3f0e25f4c0f0ecce6cf8d1c589bd9d74d3cf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 26 Jan 2021 15:52:35 +0100
Subject: block: use an on-stack bio in blkdev_issue_flush

There is no point in allocating memory for a synchronous flush.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Acked-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-flush.c                 | 17 ++++++-----------
 drivers/md/dm-zoned-metadata.c    |  6 +++---
 drivers/md/raid5-ppl.c            |  2 +-
 drivers/nvme/target/io-cmd-bdev.c |  2 +-
 fs/block_dev.c                    |  2 +-
 fs/exfat/file.c                   |  2 +-
 fs/ext4/fast_commit.c             |  4 ++--
 fs/ext4/fsync.c                   |  2 +-
 fs/ext4/ialloc.c                  |  2 +-
 fs/ext4/super.c                   |  2 +-
 fs/fat/file.c                     |  2 +-
 fs/hfsplus/inode.c                |  2 +-
 fs/hfsplus/super.c                |  2 +-
 fs/jbd2/checkpoint.c              |  2 +-
 fs/jbd2/commit.c                  |  4 ++--
 fs/jbd2/recovery.c                |  2 +-
 fs/libfs.c                        |  2 +-
 fs/nilfs2/the_nilfs.h             |  2 +-
 fs/ocfs2/file.c                   |  2 +-
 fs/reiserfs/file.c                |  2 +-
 fs/xfs/xfs_super.c                |  2 +-
 fs/zonefs/super.c                 |  2 +-
 include/linux/blkdev.h            |  4 ++--
 23 files changed, 33 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 76c1624cb06c..7942ca6ed321 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -432,23 +432,18 @@ void blk_insert_flush(struct request *rq)
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:	blockdev to issue flush for
- * @gfp_mask:	memory allocation flags (for bio_alloc)
  *
  * Description:
  *    Issue a flush for the block device in question.
  */
-int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
+int blkdev_issue_flush(struct block_device *bdev)
 {
-	struct bio *bio;
-	int ret = 0;
+	struct bio bio;
 
-	bio = bio_alloc(gfp_mask, 0);
-	bio_set_dev(bio, bdev);
-	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
-	ret = submit_bio_wait(bio);
-	bio_put(bio);
-	return ret;
+	bio_init(&bio, NULL, 0);
+	bio_set_dev(&bio, bdev);
+	bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+	return submit_bio_wait(&bio);
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
 
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index b298fefb022e..039d17b28938 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -819,7 +819,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
 	ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
 			     mblk->page);
 	if (ret == 0)
-		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
+		ret = blkdev_issue_flush(dev->bdev);
 
 	return ret;
 }
@@ -862,7 +862,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
 
 	/* Flush drive cache (this will also sync data) */
 	if (ret == 0)
-		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
+		ret = blkdev_issue_flush(dev->bdev);
 
 	return ret;
 }
@@ -933,7 +933,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
 
 	/* If there are no dirty metadata blocks, just flush the device cache */
 	if (list_empty(&write_list)) {
-		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
+		ret = blkdev_issue_flush(dev->bdev);
 		goto err;
 	}
 
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index d0f540296fe9..e8c118e05dfd 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1037,7 +1037,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
 	}
 
 	/* flush the disk cache after recovery if necessary */
-	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL);
+	ret = blkdev_issue_flush(rdev->bdev);
 out:
 	__free_page(page);
 	return ret;
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 125dde3f410e..bf6e0ac9ad28 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -333,7 +333,7 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req)
 
 u16 nvmet_bdev_flush(struct nvmet_req *req)
 {
-	if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL))
+	if (blkdev_issue_flush(req->ns->bdev))
 		return NVME_SC_INTERNAL | NVME_SC_DNR;
 	return 0;
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c1fe29dac485..9d4b1a884d76 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -680,7 +680,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	 * i_mutex and doing so causes performance issues with concurrent
 	 * O_SYNC writers to a block device.
 	 */
-	error = blkdev_issue_flush(bdev, GFP_KERNEL);
+	error = blkdev_issue_flush(bdev);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index a92478eabfa4..183ffdf4d43c 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -361,7 +361,7 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	if (err)
 		return err;
 
-	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+	return blkdev_issue_flush(inode->i_sb->s_bdev);
 }
 
 const struct file_operations exfat_file_operations = {
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 0a14a7c87bf8..6e8208acfc62 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1076,7 +1076,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	 * flush before we start writing fast commit blocks.
 	 */
 	if (journal->j_fs_dev != journal->j_dev)
-		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+		blkdev_issue_flush(journal->j_fs_dev);
 
 	blk_start_plug(&plug);
 	if (sbi->s_fc_bytes == 0) {
@@ -1535,7 +1535,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
 out:
 	iput(inode);
 	if (!ret)
-		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+		blkdev_issue_flush(sb->s_bdev);
 
 	return 0;
 }
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 113bfb023a4a..027a7d7037a0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -174,7 +174,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
 
 	if (needs_barrier) {
-		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+		err = blkdev_issue_flush(inode->i_sb->s_bdev);
 		if (!ret)
 			ret = err;
 	}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b215c564bc31..20f2fcb799f5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1583,7 +1583,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 	if (ret < 0)
 		goto err_out;
 	if (barrier)
-		blkdev_issue_flush(sb->s_bdev, GFP_NOFS);
+		blkdev_issue_flush(sb->s_bdev);
 
 skip_zeroout:
 	ext4_lock_group(sb, group);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9a6f9875aa34..fb5985102c1d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5709,7 +5709,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 		needs_barrier = true;
 	if (needs_barrier) {
 		int err;
-		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+		err = blkdev_issue_flush(sb->s_bdev);
 		if (!ret)
 			ret = err;
 	}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f9ee27cf4d7c..5fee74f1ad61 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -195,7 +195,7 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	if (err)
 		return err;
 
-	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+	return blkdev_issue_flush(inode->i_sb->s_bdev);
 }
 
 
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index e3da9e96b835..ca464328b79c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -340,7 +340,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 	}
 
 	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+		blkdev_issue_flush(inode->i_sb->s_bdev);
 
 	inode_unlock(inode);
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 807119ae5adf..b9e3db3f855f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -239,7 +239,7 @@ out:
 	mutex_unlock(&sbi->vh_mutex);
 
 	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
-		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+		blkdev_issue_flush(sb->s_bdev);
 
 	return error;
 }
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 472932b9e6bc..63b526d44886 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -416,7 +416,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 * jbd2_cleanup_journal_tail() doesn't get called all that often.
 	 */
 	if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+		blkdev_issue_flush(journal->j_fs_dev);
 
 	return __jbd2_update_log_tail(journal, first_tid, blocknr);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b121d7d434c6..3cc4ab2ba7f4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -825,7 +825,7 @@ start_journal_io:
 	if (commit_transaction->t_need_data_flush &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+		blkdev_issue_flush(journal->j_fs_dev);
 
 	/* Done it all: now write the commit record asynchronously. */
 	if (jbd2_has_feature_async_commit(journal)) {
@@ -932,7 +932,7 @@ start_journal_io:
 	stats.run.rs_blocks_logged++;
 	if (jbd2_has_feature_async_commit(journal) &&
 	    journal->j_flags & JBD2_BARRIER) {
-		blkdev_issue_flush(journal->j_dev, GFP_NOFS);
+		blkdev_issue_flush(journal->j_dev);
 	}
 
 	if (err)
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index dc0694fcfcd1..69f18fe20923 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -326,7 +326,7 @@ int jbd2_journal_recover(journal_t *journal)
 		err = err2;
 	/* Make sure all replayed data is on permanent storage */
 	if (journal->j_flags & JBD2_BARRIER) {
-		err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL);
+		err2 = blkdev_issue_flush(journal->j_fs_dev);
 		if (!err)
 			err = err2;
 	}
diff --git a/fs/libfs.c b/fs/libfs.c
index d1c3bade9f30..8398a0efb401 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1117,7 +1117,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end,
 	err = __generic_file_fsync(file, start, end, datasync);
 	if (err)
 		return err;
-	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+	return blkdev_issue_flush(inode->i_sb->s_bdev);
 }
 EXPORT_SYMBOL(generic_file_fsync);
 
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index b55cdeb4d169..987c8ab02aee 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -375,7 +375,7 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs)
 	 */
 	smp_wmb();
 
-	err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL);
+	err = blkdev_issue_flush(nilfs->ns_bdev);
 	if (err != -EIO)
 		err = 0;
 	return err;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 85979e2214b3..df6d709d2ae3 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -194,7 +194,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
 		needs_barrier = true;
 	err = jbd2_complete_transaction(journal, commit_tid);
 	if (needs_barrier) {
-		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 		if (!err)
 			err = ret;
 	}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 0b641ae694f1..1db0254bc38b 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -159,7 +159,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+		blkdev_issue_flush(inode->i_sb->s_bdev);
 	inode_unlock(inode);
 	if (barrier_done < 0)
 		return barrier_done;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 813be879a5e5..c3e32789829f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -342,7 +342,7 @@ void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 {
-	blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS);
+	blkdev_issue_flush(buftarg->bt_bdev);
 }
 
 STATIC void
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index faea2ed34b4a..ab68e27bb322 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -541,7 +541,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
 	if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
 		ret = file_write_and_wait_range(file, start, end);
 	if (!ret)
-		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 
 	if (ret)
 		zonefs_io_error(inode, true);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2491e17b61c4..0dea268bd61b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1288,7 +1288,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 		 !list_empty(&plug->cb_list));
 }
 
-int blkdev_issue_flush(struct block_device *, gfp_t);
+int blkdev_issue_flush(struct block_device *bdev);
 long nr_blockdev_pages(void);
 #else /* CONFIG_BLOCK */
 struct blk_plug {
@@ -1316,7 +1316,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 	return false;
 }
 
-static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
+static inline int blkdev_issue_flush(struct block_device *bdev)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 48d15436fde6feebcded7bd0fdc8ea4a9181b8fa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 26 Jan 2021 15:52:47 +0100
Subject: mm: remove get_swap_bio

Just reuse the block_device and sector from the swap_info structure,
just as used by the SWP_SYNCHRONOUS path.  Also remove the checks for
NULL returns from bio_alloc as that can't happen for sleeping
allocations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Acked-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/swap.h |  1 -
 mm/page_io.c         | 45 +++++++++++++--------------------------------
 mm/swapfile.c        | 10 ----------
 3 files changed, 13 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 596bc2f4d9b0..3f1f7ae0fbe9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -468,7 +468,6 @@ extern int free_swap_and_cache(swp_entry_t);
 int swap_type_of(dev_t device, sector_t offset);
 int find_first_swap(dev_t *device);
 extern unsigned int count_swap_pages(int, int);
-extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
 extern int __swap_count(swp_entry_t entry);
diff --git a/mm/page_io.c b/mm/page_io.c
index a75f35464a4e..92f7941c6d01 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -26,25 +26,6 @@
 #include <linux/uio.h>
 #include <linux/sched/task.h>
 
-static struct bio *get_swap_bio(gfp_t gfp_flags,
-				struct page *page, bio_end_io_t end_io)
-{
-	struct bio *bio;
-
-	bio = bio_alloc(gfp_flags, 1);
-	if (bio) {
-		struct block_device *bdev;
-
-		bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
-		bio_set_dev(bio, bdev);
-		bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
-		bio->bi_end_io = end_io;
-
-		bio_add_page(bio, page, thp_size(page), 0);
-	}
-	return bio;
-}
-
 void end_swap_bio_write(struct bio *bio)
 {
 	struct page *page = bio_first_page_all(bio);
@@ -361,13 +342,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		return 0;
 	}
 
-	bio = get_swap_bio(GFP_NOIO, page, end_write_func);
-	if (bio == NULL) {
-		set_page_dirty(page);
-		unlock_page(page);
-		return -ENOMEM;
-	}
+	bio = bio_alloc(GFP_NOIO, 1);
+	bio_set_dev(bio, sis->bdev);
+	bio->bi_iter.bi_sector = swap_page_sector(page);
 	bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
+	bio->bi_end_io = end_write_func;
+	bio_add_page(bio, page, thp_size(page), 0);
+
 	bio_associate_blkg_from_page(bio, page);
 	count_swpout_vm_event(page);
 	set_page_writeback(page);
@@ -427,18 +408,18 @@ int swap_readpage(struct page *page, bool synchronous)
 	}
 
 	ret = 0;
-	bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
-	if (bio == NULL) {
-		unlock_page(page);
-		ret = -ENOMEM;
-		goto out;
-	}
+	bio = bio_alloc(GFP_KERNEL, 1);
+	bio_set_dev(bio, sis->bdev);
+	bio->bi_opf = REQ_OP_READ;
+	bio->bi_iter.bi_sector = swap_page_sector(page);
+	bio->bi_end_io = end_swap_bio_read;
+	bio_add_page(bio, page, thp_size(page), 0);
+
 	disk = bio->bi_bdev->bd_disk;
 	/*
 	 * Keep this task valid during swap readpage because the oom killer may
 	 * attempt to access it in the page fault retry time check.
 	 */
-	bio_set_op_attrs(bio, REQ_OP_READ, 0);
 	if (synchronous) {
 		bio->bi_opf |= REQ_HIPRI;
 		get_task_struct(current);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9fffc5af29d1..bfa9e8b0c2ef 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2301,16 +2301,6 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 	return se->start_block + (offset - se->start_page);
 }
 
-/*
- * Returns the page offset into bdev for the specified page's swap entry.
- */
-sector_t map_swap_page(struct page *page, struct block_device **bdev)
-{
-	swp_entry_t entry;
-	entry.val = page_private(page);
-	return map_swap_entry(entry, bdev);
-}
-
 /*
  * Free all of a swapdev's extent information
  */
-- 
cgit v1.2.3


From 84f9017c37c479c4f70456a645d24d2296ad2208 Mon Sep 17 00:00:00 2001
From: Jiaxun Yang <jiaxun.yang@flygoat.com>
Date: Mon, 25 Jan 2021 12:59:57 +0100
Subject: ACPI: platform-profile: Introduce object pointers to callbacks

Add an object pointer to handler callbacks to avoid the need for
drivers to have a global variable to get to their driver-data
struct.

Link: https://lore.kernel.org/linux-acpi/6a29f338-d9e4-150c-81dd-2ffb54f5bc35@redhat.com/
Link: https://lore.kernel.org/r/20210114073429.176462-3-jiaxun.yang@flygoat.com
Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
Suggested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/platform_profile.c  | 4 ++--
 include/linux/platform_profile.h | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index f65c61db7921..80e9df427eb8 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -64,7 +64,7 @@ static ssize_t platform_profile_show(struct device *dev,
 		return -ENODEV;
 	}
 
-	err = cur_profile->profile_get(&profile);
+	err = cur_profile->profile_get(cur_profile, &profile);
 	mutex_unlock(&profile_lock);
 	if (err)
 		return err;
@@ -104,7 +104,7 @@ static ssize_t platform_profile_store(struct device *dev,
 		return -EOPNOTSUPP;
 	}
 
-	err = cur_profile->profile_set(i);
+	err = cur_profile->profile_set(cur_profile, i);
 	mutex_unlock(&profile_lock);
 	if (err)
 		return err;
diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
index c797fdb3d91a..a26542d53058 100644
--- a/include/linux/platform_profile.h
+++ b/include/linux/platform_profile.h
@@ -28,8 +28,10 @@ enum platform_profile_option {
 
 struct platform_profile_handler {
 	unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
-	int (*profile_get)(enum platform_profile_option *profile);
-	int (*profile_set)(enum platform_profile_option profile);
+	int (*profile_get)(struct platform_profile_handler *pprof,
+				enum platform_profile_option *profile);
+	int (*profile_set)(struct platform_profile_handler *pprof,
+				enum platform_profile_option profile);
 };
 
 int platform_profile_register(struct platform_profile_handler *pprof);
-- 
cgit v1.2.3


From 0bfa0820c274b019583b3454c6c889c99c24558d Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <npitre@baylibre.com>
Date: Mon, 25 Jan 2021 14:29:18 -0500
Subject: PM: clk: make PM clock layer compatible with clocks that must sleep

The clock API splits its interface into sleepable ant atomic contexts:

 - clk_prepare/clk_unprepare for stuff that might sleep

 - clk_enable_clk_disable for anything that may be done in atomic context

The code handling runtime PM for clocks only calls clk_disable() on
suspend requests, and clk_enable on resume requests. This means that
runtime PM with clock providers that only have the prepare/unprepare
methods implemented is basically useless.

Many clock implementations can't accommodate atomic contexts. This is
often the case when communication with the clock happens through another
subsystem like I2C or SCMI.

Let's make the clock PM code useful with such clocks by safely invoking
clk_prepare/clk_unprepare upon resume/suspend requests. Of course, when
such clocks are registered with the PM layer then pm_runtime_irq_safe()
can't be used, and neither pm_runtime_suspend() nor pm_runtime_resume()
may be invoked in atomic context.

For clocks that do implement the enable and disable methods then
everything just works as before.

A note on sparse:
According to https://lwn.net/Articles/109066/ there are things
that sparse can't cope with. In particular, pm_clk_op_lock() and
pm_clk_op_unlock() may or may not lock/unlock psd->lock depending on
some runtime condition. To work around that we tell it the lock is
always untaken for the purpose of static analisys.

Thanks to Naresh Kamboju for reporting issues with the initial patch.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/clock_ops.c | 223 +++++++++++++++++++++++++++++++++--------
 drivers/clk/clk.c              |  21 ++++
 include/linux/clk.h            |  24 ++++-
 include/linux/pm.h             |   2 +
 4 files changed, 228 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c
index ced6863a16a5..84d5acb6301b 100644
--- a/drivers/base/power/clock_ops.c
+++ b/drivers/base/power/clock_ops.c
@@ -23,6 +23,7 @@
 enum pce_status {
 	PCE_STATUS_NONE = 0,
 	PCE_STATUS_ACQUIRED,
+	PCE_STATUS_PREPARED,
 	PCE_STATUS_ENABLED,
 	PCE_STATUS_ERROR,
 };
@@ -32,8 +33,112 @@ struct pm_clock_entry {
 	char *con_id;
 	struct clk *clk;
 	enum pce_status status;
+	bool enabled_when_prepared;
 };
 
+/**
+ * pm_clk_list_lock - ensure exclusive access for modifying the PM clock
+ *		      entry list.
+ * @psd: pm_subsys_data instance corresponding to the PM clock entry list
+ *	 and clk_op_might_sleep count to be modified.
+ *
+ * Get exclusive access before modifying the PM clock entry list and the
+ * clock_op_might_sleep count to guard against concurrent modifications.
+ * This also protects against a concurrent clock_op_might_sleep and PM clock
+ * entry list usage in pm_clk_suspend()/pm_clk_resume() that may or may not
+ * happen in atomic context, hence both the mutex and the spinlock must be
+ * taken here.
+ */
+static void pm_clk_list_lock(struct pm_subsys_data *psd)
+	__acquires(&psd->lock)
+{
+	mutex_lock(&psd->clock_mutex);
+	spin_lock_irq(&psd->lock);
+}
+
+/**
+ * pm_clk_list_unlock - counterpart to pm_clk_list_lock().
+ * @psd: the same pm_subsys_data instance previously passed to
+ *	 pm_clk_list_lock().
+ */
+static void pm_clk_list_unlock(struct pm_subsys_data *psd)
+	__releases(&psd->lock)
+{
+	spin_unlock_irq(&psd->lock);
+	mutex_unlock(&psd->clock_mutex);
+}
+
+/**
+ * pm_clk_op_lock - ensure exclusive access for performing clock operations.
+ * @psd: pm_subsys_data instance corresponding to the PM clock entry list
+ *	 and clk_op_might_sleep count being used.
+ * @flags: stored irq flags.
+ * @fn: string for the caller function's name.
+ *
+ * This is used by pm_clk_suspend() and pm_clk_resume() to guard
+ * against concurrent modifications to the clock entry list and the
+ * clock_op_might_sleep count. If clock_op_might_sleep is != 0 then
+ * only the mutex can be locked and those functions can only be used in
+ * non atomic context. If clock_op_might_sleep == 0 then these functions
+ * may be used in any context and only the spinlock can be locked.
+ * Returns -EINVAL if called in atomic context when clock ops might sleep.
+ */
+static int pm_clk_op_lock(struct pm_subsys_data *psd, unsigned long *flags,
+			  const char *fn)
+	/* sparse annotations don't work here as exit state isn't static */
+{
+	bool atomic_context = in_atomic() || irqs_disabled();
+
+try_again:
+	spin_lock_irqsave(&psd->lock, *flags);
+	if (!psd->clock_op_might_sleep) {
+		/* the __release is there to work around sparse limitations */
+		__release(&psd->lock);
+		return 0;
+	}
+
+	/* bail out if in atomic context */
+	if (atomic_context) {
+		pr_err("%s: atomic context with clock_ops_might_sleep = %d",
+		       fn, psd->clock_op_might_sleep);
+		spin_unlock_irqrestore(&psd->lock, *flags);
+		might_sleep();
+		return -EPERM;
+	}
+
+	/* we must switch to the mutex */
+	spin_unlock_irqrestore(&psd->lock, *flags);
+	mutex_lock(&psd->clock_mutex);
+
+	/*
+	 * There was a possibility for psd->clock_op_might_sleep
+	 * to become 0 above. Keep the mutex only if not the case.
+	 */
+	if (likely(psd->clock_op_might_sleep))
+		return 0;
+
+	mutex_unlock(&psd->clock_mutex);
+	goto try_again;
+}
+
+/**
+ * pm_clk_op_unlock - counterpart to pm_clk_op_lock().
+ * @psd: the same pm_subsys_data instance previously passed to
+ *	 pm_clk_op_lock().
+ * @flags: irq flags provided by pm_clk_op_lock().
+ */
+static void pm_clk_op_unlock(struct pm_subsys_data *psd, unsigned long *flags)
+	/* sparse annotations don't work here as entry state isn't static */
+{
+	if (psd->clock_op_might_sleep) {
+		mutex_unlock(&psd->clock_mutex);
+	} else {
+		/* the __acquire is there to work around sparse limitations */
+		__acquire(&psd->lock);
+		spin_unlock_irqrestore(&psd->lock, *flags);
+	}
+}
+
 /**
  * pm_clk_enable - Enable a clock, reporting any errors
  * @dev: The device for the given clock
@@ -43,14 +148,21 @@ static inline void __pm_clk_enable(struct device *dev, struct pm_clock_entry *ce
 {
 	int ret;
 
-	if (ce->status < PCE_STATUS_ERROR) {
+	switch (ce->status) {
+	case PCE_STATUS_ACQUIRED:
+		ret = clk_prepare_enable(ce->clk);
+		break;
+	case PCE_STATUS_PREPARED:
 		ret = clk_enable(ce->clk);
-		if (!ret)
-			ce->status = PCE_STATUS_ENABLED;
-		else
-			dev_err(dev, "%s: failed to enable clk %p, error %d\n",
-				__func__, ce->clk, ret);
+		break;
+	default:
+		return;
 	}
+	if (!ret)
+		ce->status = PCE_STATUS_ENABLED;
+	else
+		dev_err(dev, "%s: failed to enable clk %p, error %d\n",
+			__func__, ce->clk, ret);
 }
 
 /**
@@ -64,17 +176,20 @@ static void pm_clk_acquire(struct device *dev, struct pm_clock_entry *ce)
 		ce->clk = clk_get(dev, ce->con_id);
 	if (IS_ERR(ce->clk)) {
 		ce->status = PCE_STATUS_ERROR;
+		return;
+	} else if (clk_is_enabled_when_prepared(ce->clk)) {
+		/* we defer preparing the clock in that case */
+		ce->status = PCE_STATUS_ACQUIRED;
+		ce->enabled_when_prepared = true;
+	} else if (clk_prepare(ce->clk)) {
+		ce->status = PCE_STATUS_ERROR;
+		dev_err(dev, "clk_prepare() failed\n");
+		return;
 	} else {
-		if (clk_prepare(ce->clk)) {
-			ce->status = PCE_STATUS_ERROR;
-			dev_err(dev, "clk_prepare() failed\n");
-		} else {
-			ce->status = PCE_STATUS_ACQUIRED;
-			dev_dbg(dev,
-				"Clock %pC con_id %s managed by runtime PM.\n",
-				ce->clk, ce->con_id);
-		}
+		ce->status = PCE_STATUS_PREPARED;
 	}
+	dev_dbg(dev, "Clock %pC con_id %s managed by runtime PM.\n",
+		ce->clk, ce->con_id);
 }
 
 static int __pm_clk_add(struct device *dev, const char *con_id,
@@ -106,9 +221,11 @@ static int __pm_clk_add(struct device *dev, const char *con_id,
 
 	pm_clk_acquire(dev, ce);
 
-	spin_lock_irq(&psd->lock);
+	pm_clk_list_lock(psd);
 	list_add_tail(&ce->node, &psd->clock_list);
-	spin_unlock_irq(&psd->lock);
+	if (ce->enabled_when_prepared)
+		psd->clock_op_might_sleep++;
+	pm_clk_list_unlock(psd);
 	return 0;
 }
 
@@ -239,14 +356,20 @@ static void __pm_clk_remove(struct pm_clock_entry *ce)
 	if (!ce)
 		return;
 
-	if (ce->status < PCE_STATUS_ERROR) {
-		if (ce->status == PCE_STATUS_ENABLED)
-			clk_disable(ce->clk);
-
-		if (ce->status >= PCE_STATUS_ACQUIRED) {
-			clk_unprepare(ce->clk);
+	switch (ce->status) {
+	case PCE_STATUS_ENABLED:
+		clk_disable(ce->clk);
+		fallthrough;
+	case PCE_STATUS_PREPARED:
+		clk_unprepare(ce->clk);
+		fallthrough;
+	case PCE_STATUS_ACQUIRED:
+	case PCE_STATUS_ERROR:
+		if (!IS_ERR(ce->clk))
 			clk_put(ce->clk);
-		}
+		break;
+	default:
+		break;
 	}
 
 	kfree(ce->con_id);
@@ -269,7 +392,7 @@ void pm_clk_remove(struct device *dev, const char *con_id)
 	if (!psd)
 		return;
 
-	spin_lock_irq(&psd->lock);
+	pm_clk_list_lock(psd);
 
 	list_for_each_entry(ce, &psd->clock_list, node) {
 		if (!con_id && !ce->con_id)
@@ -280,12 +403,14 @@ void pm_clk_remove(struct device *dev, const char *con_id)
 			goto remove;
 	}
 
-	spin_unlock_irq(&psd->lock);
+	pm_clk_list_unlock(psd);
 	return;
 
  remove:
 	list_del(&ce->node);
-	spin_unlock_irq(&psd->lock);
+	if (ce->enabled_when_prepared)
+		psd->clock_op_might_sleep--;
+	pm_clk_list_unlock(psd);
 
 	__pm_clk_remove(ce);
 }
@@ -307,19 +432,21 @@ void pm_clk_remove_clk(struct device *dev, struct clk *clk)
 	if (!psd || !clk)
 		return;
 
-	spin_lock_irq(&psd->lock);
+	pm_clk_list_lock(psd);
 
 	list_for_each_entry(ce, &psd->clock_list, node) {
 		if (clk == ce->clk)
 			goto remove;
 	}
 
-	spin_unlock_irq(&psd->lock);
+	pm_clk_list_unlock(psd);
 	return;
 
  remove:
 	list_del(&ce->node);
-	spin_unlock_irq(&psd->lock);
+	if (ce->enabled_when_prepared)
+		psd->clock_op_might_sleep--;
+	pm_clk_list_unlock(psd);
 
 	__pm_clk_remove(ce);
 }
@@ -330,13 +457,16 @@ EXPORT_SYMBOL_GPL(pm_clk_remove_clk);
  * @dev: Device to initialize the list of PM clocks for.
  *
  * Initialize the lock and clock_list members of the device's pm_subsys_data
- * object.
+ * object, set the count of clocks that might sleep to 0.
  */
 void pm_clk_init(struct device *dev)
 {
 	struct pm_subsys_data *psd = dev_to_psd(dev);
-	if (psd)
+	if (psd) {
 		INIT_LIST_HEAD(&psd->clock_list);
+		mutex_init(&psd->clock_mutex);
+		psd->clock_op_might_sleep = 0;
+	}
 }
 EXPORT_SYMBOL_GPL(pm_clk_init);
 
@@ -372,12 +502,13 @@ void pm_clk_destroy(struct device *dev)
 
 	INIT_LIST_HEAD(&list);
 
-	spin_lock_irq(&psd->lock);
+	pm_clk_list_lock(psd);
 
 	list_for_each_entry_safe_reverse(ce, c, &psd->clock_list, node)
 		list_move(&ce->node, &list);
+	psd->clock_op_might_sleep = 0;
 
-	spin_unlock_irq(&psd->lock);
+	pm_clk_list_unlock(psd);
 
 	dev_pm_put_subsys_data(dev);
 
@@ -397,23 +528,30 @@ int pm_clk_suspend(struct device *dev)
 	struct pm_subsys_data *psd = dev_to_psd(dev);
 	struct pm_clock_entry *ce;
 	unsigned long flags;
+	int ret;
 
 	dev_dbg(dev, "%s()\n", __func__);
 
 	if (!psd)
 		return 0;
 
-	spin_lock_irqsave(&psd->lock, flags);
+	ret = pm_clk_op_lock(psd, &flags, __func__);
+	if (ret)
+		return ret;
 
 	list_for_each_entry_reverse(ce, &psd->clock_list, node) {
-		if (ce->status < PCE_STATUS_ERROR) {
-			if (ce->status == PCE_STATUS_ENABLED)
+		if (ce->status == PCE_STATUS_ENABLED) {
+			if (ce->enabled_when_prepared) {
+				clk_disable_unprepare(ce->clk);
+				ce->status = PCE_STATUS_ACQUIRED;
+			} else {
 				clk_disable(ce->clk);
-			ce->status = PCE_STATUS_ACQUIRED;
+				ce->status = PCE_STATUS_PREPARED;
+			}
 		}
 	}
 
-	spin_unlock_irqrestore(&psd->lock, flags);
+	pm_clk_op_unlock(psd, &flags);
 
 	return 0;
 }
@@ -428,18 +566,21 @@ int pm_clk_resume(struct device *dev)
 	struct pm_subsys_data *psd = dev_to_psd(dev);
 	struct pm_clock_entry *ce;
 	unsigned long flags;
+	int ret;
 
 	dev_dbg(dev, "%s()\n", __func__);
 
 	if (!psd)
 		return 0;
 
-	spin_lock_irqsave(&psd->lock, flags);
+	ret = pm_clk_op_lock(psd, &flags, __func__);
+	if (ret)
+		return ret;
 
 	list_for_each_entry(ce, &psd->clock_list, node)
 		__pm_clk_enable(dev, ce);
 
-	spin_unlock_irqrestore(&psd->lock, flags);
+	pm_clk_op_unlock(psd, &flags);
 
 	return 0;
 }
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 8c1d04db990d..3d751ae5bc70 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -1164,6 +1164,27 @@ int clk_enable(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_enable);
 
+/**
+ * clk_is_enabled_when_prepared - indicate if preparing a clock also enables it.
+ * @clk: clock source
+ *
+ * Returns true if clk_prepare() implicitly enables the clock, effectively
+ * making clk_enable()/clk_disable() no-ops, false otherwise.
+ *
+ * This is of interest mainly to power management code where actually
+ * disabling the clock also requires unpreparing it to have any material
+ * effect.
+ *
+ * Regardless of the value returned here, the caller must always invoke
+ * clk_enable() or clk_prepare_enable()  and counterparts for usage counts
+ * to be right.
+ */
+bool clk_is_enabled_when_prepared(struct clk *clk)
+{
+	return clk && !(clk->core->ops->enable && clk->core->ops->disable);
+}
+EXPORT_SYMBOL_GPL(clk_is_enabled_when_prepared);
+
 static int clk_core_prepare_enable(struct clk_core *core)
 {
 	int ret;
diff --git a/include/linux/clk.h b/include/linux/clk.h
index 31ff1bf1b79f..a4a86aa8b11a 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -238,6 +238,7 @@ static inline bool clk_is_match(const struct clk *p, const struct clk *q)
 
 #endif
 
+#ifdef CONFIG_HAVE_CLK_PREPARE
 /**
  * clk_prepare - prepare a clock source
  * @clk: clock source
@@ -246,10 +247,26 @@ static inline bool clk_is_match(const struct clk *p, const struct clk *q)
  *
  * Must not be called from within atomic context.
  */
-#ifdef CONFIG_HAVE_CLK_PREPARE
 int clk_prepare(struct clk *clk);
 int __must_check clk_bulk_prepare(int num_clks,
 				  const struct clk_bulk_data *clks);
+
+/**
+ * clk_is_enabled_when_prepared - indicate if preparing a clock also enables it.
+ * @clk: clock source
+ *
+ * Returns true if clk_prepare() implicitly enables the clock, effectively
+ * making clk_enable()/clk_disable() no-ops, false otherwise.
+ *
+ * This is of interest mainly to the power management code where actually
+ * disabling the clock also requires unpreparing it to have any material
+ * effect.
+ *
+ * Regardless of the value returned here, the caller must always invoke
+ * clk_enable() or clk_prepare_enable()  and counterparts for usage counts
+ * to be right.
+ */
+bool clk_is_enabled_when_prepared(struct clk *clk);
 #else
 static inline int clk_prepare(struct clk *clk)
 {
@@ -263,6 +280,11 @@ clk_bulk_prepare(int num_clks, const struct clk_bulk_data *clks)
 	might_sleep();
 	return 0;
 }
+
+static inline bool clk_is_enabled_when_prepared(struct clk *clk)
+{
+	return false;
+}
 #endif
 
 /**
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 47aca6bac1d6..482313a8ccfc 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -537,6 +537,8 @@ struct pm_subsys_data {
 	spinlock_t lock;
 	unsigned int refcount;
 #ifdef CONFIG_PM_CLK
+	unsigned int clock_op_might_sleep;
+	struct mutex clock_mutex;
 	struct list_head clock_list;
 #endif
 #ifdef CONFIG_PM_GENERIC_DOMAINS
-- 
cgit v1.2.3


From 3fde13f817e23f05ce407d136325df4cbc913e67 Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Fri, 22 Jan 2021 17:46:43 +0800
Subject: f2fs: compress: support compress level

Expand 'compress_algorithm' mount option to accept parameter as format of
<algorithm>:<level>, by this way, it gives a way to allow user to do more
specified config on lz4 and zstd compression level, then f2fs compression
can provide higher compress ratio.

In order to set compress level for lz4 algorithm, it needs to set
CONFIG_LZ4HC_COMPRESS and CONFIG_F2FS_FS_LZ4HC config to enable lz4hc
compress algorithm.

CR and performance number on lz4/lz4hc algorithm:

dd if=enwik9 of=compressed_file conv=fsync

Original blocks:	244382

			lz4			lz4hc-9
compressed blocks	170647			163270
compress ratio		69.8%			66.8%
speed			16.4207 s, 60.9 MB/s	26.7299 s, 37.4 MB/s

compress ratio = after / before

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 Documentation/filesystems/f2fs.rst |  5 +++
 fs/f2fs/Kconfig                    | 10 +++++
 fs/f2fs/compress.c                 | 41 ++++++++++++++++--
 fs/f2fs/f2fs.h                     |  9 ++++
 fs/f2fs/super.c                    | 89 +++++++++++++++++++++++++++++++++++++-
 include/linux/f2fs_fs.h            |  3 ++
 6 files changed, 152 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index dae15c96e659..5eff4009e77e 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -249,6 +249,11 @@ checkpoint=%s[:%u[%]]	 Set to "disable" to turn off checkpointing. Set to "enabl
 			 This space is reclaimed once checkpoint=enable.
 compress_algorithm=%s	 Control compress algorithm, currently f2fs supports "lzo",
 			 "lz4", "zstd" and "lzo-rle" algorithm.
+compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only
+			 "lz4" and "zstd" support compress level config.
+			 algorithm	level range
+			 lz4		3 - 16
+			 zstd		1 - 22
 compress_log_size=%u	 Support configuring compress cluster size, the size will
 			 be 4KB * (1 << %u), 16KB is minimum size, also it's
 			 default size.
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index d13c5c6a9787..63c1fc1a0e3b 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -119,6 +119,16 @@ config F2FS_FS_LZ4
 	help
 	  Support LZ4 compress algorithm, if unsure, say Y.
 
+config F2FS_FS_LZ4HC
+	bool "LZ4HC compression support"
+	depends on F2FS_FS_COMPRESSION
+	depends on F2FS_FS_LZ4
+	select LZ4HC_COMPRESS
+	default y
+	help
+	  Support LZ4HC compress algorithm, LZ4HC has compatible on-disk
+	  layout with LZ4, if unsure, say Y.
+
 config F2FS_FS_ZSTD
 	bool "ZSTD compression support"
 	depends on F2FS_FS_COMPRESSION
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 4bcbacfe3325..a345a41e2119 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -252,8 +252,14 @@ static const struct f2fs_compress_ops f2fs_lzo_ops = {
 #ifdef CONFIG_F2FS_FS_LZ4
 static int lz4_init_compress_ctx(struct compress_ctx *cc)
 {
-	cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
-				LZ4_MEM_COMPRESS, GFP_NOFS);
+	unsigned int size = LZ4_MEM_COMPRESS;
+
+#ifdef CONFIG_F2FS_FS_LZ4HC
+	if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET)
+		size = LZ4HC_MEM_COMPRESS;
+#endif
+
+	cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS);
 	if (!cc->private)
 		return -ENOMEM;
 
@@ -272,10 +278,34 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc)
 	cc->private = NULL;
 }
 
+#ifdef CONFIG_F2FS_FS_LZ4HC
+static int lz4hc_compress_pages(struct compress_ctx *cc)
+{
+	unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
+						COMPRESS_LEVEL_OFFSET;
+	int len;
+
+	if (level)
+		len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen,
+					cc->clen, level, cc->private);
+	else
+		len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen,
+						cc->clen, cc->private);
+	if (!len)
+		return -EAGAIN;
+
+	cc->clen = len;
+	return 0;
+}
+#endif
+
 static int lz4_compress_pages(struct compress_ctx *cc)
 {
 	int len;
 
+#ifdef CONFIG_F2FS_FS_LZ4HC
+	return lz4hc_compress_pages(cc);
+#endif
 	len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen,
 						cc->clen, cc->private);
 	if (!len)
@@ -325,8 +355,13 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
 	ZSTD_CStream *stream;
 	void *workspace;
 	unsigned int workspace_size;
+	unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
+						COMPRESS_LEVEL_OFFSET;
+
+	if (!level)
+		level = F2FS_ZSTD_DEFAULT_CLEVEL;
 
-	params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0);
+	params = ZSTD_getParams(level, cc->rlen, 0);
 	workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams);
 
 	workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index bb11759191dc..36012181c17f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -146,6 +146,7 @@ struct f2fs_mount_info {
 	/* For compression */
 	unsigned char compress_algorithm;	/* algorithm type */
 	unsigned char compress_log_size;	/* cluster log size */
+	unsigned char compress_level;		/* compress level */
 	bool compress_chksum;			/* compressed data chksum */
 	unsigned char compress_ext_cnt;		/* extension count */
 	int compress_mode;			/* compression mode */
@@ -735,6 +736,7 @@ struct f2fs_inode_info {
 	atomic_t i_compr_blocks;		/* # of compressed blocks */
 	unsigned char i_compress_algorithm;	/* algorithm type */
 	unsigned char i_log_cluster_size;	/* log of cluster size */
+	unsigned char i_compress_level;		/* compress level (lz4hc,zstd) */
 	unsigned short i_compress_flag;		/* compress flag */
 	unsigned int i_cluster_size;		/* cluster size */
 };
@@ -1310,6 +1312,8 @@ struct compress_data {
 
 #define F2FS_COMPRESSED_PAGE_MAGIC	0xF5F2C000
 
+#define	COMPRESS_LEVEL_OFFSET	8
+
 /* compress context */
 struct compress_ctx {
 	struct inode *inode;		/* inode the context belong to */
@@ -3934,6 +3938,11 @@ static inline void set_compress_context(struct inode *inode)
 				1 << COMPRESS_CHKSUM : 0;
 	F2FS_I(inode)->i_cluster_size =
 			1 << F2FS_I(inode)->i_log_cluster_size;
+	if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 &&
+			F2FS_OPTION(sbi).compress_level)
+		F2FS_I(inode)->i_compress_flag |=
+				F2FS_OPTION(sbi).compress_level <<
+				COMPRESS_LEVEL_OFFSET;
 	F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
 	set_inode_flag(inode, FI_COMPRESSED_FILE);
 	stat_inc_compr_inode(inode);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a275bd312ae5..c8be27a9eed6 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -25,6 +25,8 @@
 #include <linux/quota.h>
 #include <linux/unicode.h>
 #include <linux/part_stat.h>
+#include <linux/zstd.h>
+#include <linux/lz4.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -464,6 +466,74 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
 	return 0;
 }
 
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+#ifdef CONFIG_F2FS_FS_LZ4
+static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
+{
+#ifdef CONFIG_F2FS_FS_LZ4HC
+	unsigned int level;
+#endif
+
+	if (strlen(str) == 3) {
+		F2FS_OPTION(sbi).compress_level = 0;
+		return 0;
+	}
+
+#ifdef CONFIG_F2FS_FS_LZ4HC
+	str += 3;
+
+	if (str[0] != ':') {
+		f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
+		return -EINVAL;
+	}
+	if (kstrtouint(str + 1, 10, &level))
+		return -EINVAL;
+
+	if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) {
+		f2fs_info(sbi, "invalid lz4hc compress level: %d", level);
+		return -EINVAL;
+	}
+
+	F2FS_OPTION(sbi).compress_level = level;
+	return 0;
+#else
+	f2fs_info(sbi, "kernel doesn't support lz4hc compression");
+	return -EINVAL;
+#endif
+}
+#endif
+
+#ifdef CONFIG_F2FS_FS_ZSTD
+static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
+{
+	unsigned int level;
+	int len = 4;
+
+	if (strlen(str) == len) {
+		F2FS_OPTION(sbi).compress_level = 0;
+		return 0;
+	}
+
+	str += len;
+
+	if (str[0] != ':') {
+		f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
+		return -EINVAL;
+	}
+	if (kstrtouint(str + 1, 10, &level))
+		return -EINVAL;
+
+	if (!level || level > ZSTD_maxCLevel()) {
+		f2fs_info(sbi, "invalid zstd compress level: %d", level);
+		return -EINVAL;
+	}
+
+	F2FS_OPTION(sbi).compress_level = level;
+	return 0;
+}
+#endif
+#endif
+
 static int parse_options(struct super_block *sb, char *options, bool is_remount)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -883,20 +953,31 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 				return -ENOMEM;
 			if (!strcmp(name, "lzo")) {
 #ifdef CONFIG_F2FS_FS_LZO
+				F2FS_OPTION(sbi).compress_level = 0;
 				F2FS_OPTION(sbi).compress_algorithm =
 								COMPRESS_LZO;
 #else
 				f2fs_info(sbi, "kernel doesn't support lzo compression");
 #endif
-			} else if (!strcmp(name, "lz4")) {
+			} else if (!strncmp(name, "lz4", 3)) {
 #ifdef CONFIG_F2FS_FS_LZ4
+				ret = f2fs_set_lz4hc_level(sbi, name);
+				if (ret) {
+					kfree(name);
+					return -EINVAL;
+				}
 				F2FS_OPTION(sbi).compress_algorithm =
 								COMPRESS_LZ4;
 #else
 				f2fs_info(sbi, "kernel doesn't support lz4 compression");
 #endif
-			} else if (!strcmp(name, "zstd")) {
+			} else if (!strncmp(name, "zstd", 4)) {
 #ifdef CONFIG_F2FS_FS_ZSTD
+				ret = f2fs_set_zstd_level(sbi, name);
+				if (ret) {
+					kfree(name);
+					return -EINVAL;
+				}
 				F2FS_OPTION(sbi).compress_algorithm =
 								COMPRESS_ZSTD;
 #else
@@ -904,6 +985,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 #endif
 			} else if (!strcmp(name, "lzo-rle")) {
 #ifdef CONFIG_F2FS_FS_LZORLE
+				F2FS_OPTION(sbi).compress_level = 0;
 				F2FS_OPTION(sbi).compress_algorithm =
 								COMPRESS_LZORLE;
 #else
@@ -1555,6 +1637,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
 	}
 	seq_printf(seq, ",compress_algorithm=%s", algtype);
 
+	if (F2FS_OPTION(sbi).compress_level)
+		seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level);
+
 	seq_printf(seq, ",compress_log_size=%u",
 			F2FS_OPTION(sbi).compress_log_size);
 
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 7dc2a06cf19a..c6cc0a566ef5 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -274,6 +274,9 @@ struct f2fs_inode {
 			__u8 i_compress_algorithm;	/* compress algorithm */
 			__u8 i_log_cluster_size;	/* log of cluster size */
 			__le16 i_compress_flag;		/* compress flag */
+						/* 0 bit: chksum flag
+						 * [10,15] bits: compress level
+						 */
 			__le32 i_extra_end[0];	/* for attribute size calculation */
 		} __packed;
 		__le32 i_addr[DEF_ADDRS_PER_INODE];	/* Pointers to data blocks */
-- 
cgit v1.2.3


From 794c43f716845e2d48ce195ed5c4179a4e05ce5f Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 28 Dec 2020 15:25:29 -0800
Subject: libfs: unexport generic_ci_d_compare() and generic_ci_d_hash()

Now that generic_set_encrypted_ci_d_ops() has been added and ext4 and
f2fs are using it, it's no longer necessary to export
generic_ci_d_compare() and generic_ci_d_hash() to filesystems.

Reviewed-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/libfs.c         | 8 +++-----
 include/linux/fs.h | 5 -----
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/libfs.c b/fs/libfs.c
index d1c3bade9f30..79721571e014 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1388,8 +1388,8 @@ static bool needs_casefold(const struct inode *dir)
  *
  * Return: 0 if names match, 1 if mismatch, or -ERRNO
  */
-int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
-			  const char *str, const struct qstr *name)
+static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+				const char *str, const struct qstr *name)
 {
 	const struct dentry *parent = READ_ONCE(dentry->d_parent);
 	const struct inode *dir = READ_ONCE(parent->d_inode);
@@ -1426,7 +1426,6 @@ fallback:
 		return 1;
 	return !!memcmp(str, name->name, len);
 }
-EXPORT_SYMBOL(generic_ci_d_compare);
 
 /**
  * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
@@ -1435,7 +1434,7 @@ EXPORT_SYMBOL(generic_ci_d_compare);
  *
  * Return: 0 if hash was successful or unchanged, and -EINVAL on error
  */
-int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
 {
 	const struct inode *dir = READ_ONCE(dentry->d_inode);
 	struct super_block *sb = dentry->d_sb;
@@ -1450,7 +1449,6 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
 		return -EINVAL;
 	return 0;
 }
-EXPORT_SYMBOL(generic_ci_d_hash);
 
 static const struct dentry_operations generic_ci_dentry_ops = {
 	.d_hash = generic_ci_d_hash,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..6d8b1e7337e4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3192,11 +3192,6 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
 
 extern int generic_check_addressable(unsigned, u64);
 
-#ifdef CONFIG_UNICODE
-extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
-extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
-				const char *str, const struct qstr *name);
-#endif
 extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry);
 
 #ifdef CONFIG_MIGRATION
-- 
cgit v1.2.3


From 8063e184e49011f6f3f34f6c358dc8a83890bb5b Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Wed, 27 Jan 2021 14:15:01 -0800
Subject: skmsg: Make sk_psock_destroy() static

sk_psock_destroy() is a RCU callback, I can't see any reason why
it could be used outside.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210127221501.46866-1-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h | 1 -
 net/core/skmsg.c      | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index fec0c5ac1c4f..8edbbf5f2f93 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -390,7 +390,6 @@ static inline struct sk_psock *sk_psock_get(struct sock *sk)
 }
 
 void sk_psock_stop(struct sock *sk, struct sk_psock *psock);
-void sk_psock_destroy(struct rcu_head *rcu);
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
 
 static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 25cdbb20f3a0..1261512d6807 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -669,14 +669,13 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
 	kfree(psock);
 }
 
-void sk_psock_destroy(struct rcu_head *rcu)
+static void sk_psock_destroy(struct rcu_head *rcu)
 {
 	struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
 
 	INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
 	schedule_work(&psock->gc);
 }
-EXPORT_SYMBOL_GPL(sk_psock_destroy);
 
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
-- 
cgit v1.2.3


From e5befc024cb4515d815662ed8746712cc5366bfc Mon Sep 17 00:00:00 2001
From: Eddie James <eajames@linux.ibm.com>
Date: Tue, 22 Dec 2020 09:26:39 -0600
Subject: hwmon: (pmbus) Add a PMBUS_NO_CAPABILITY platform data flag

Some PMBus chips don't respond with valid data when reading the
CAPABILITY register. Add a flag that device drivers can set so
that the PMBus core driver doesn't use CAPABILITY to determine it's
behavior.

Signed-off-by: Eddie James <eajames@linux.ibm.com>
Link: https://lore.kernel.org/r/20201222152640.27749-2-eajames@linux.ibm.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/pmbus/pmbus_core.c | 8 +++++---
 include/linux/pmbus.h            | 9 +++++++++
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
index 192442b3b7a2..906c9fec9cce 100644
--- a/drivers/hwmon/pmbus/pmbus_core.c
+++ b/drivers/hwmon/pmbus/pmbus_core.c
@@ -2204,9 +2204,11 @@ static int pmbus_init_common(struct i2c_client *client, struct pmbus_data *data,
 	}
 
 	/* Enable PEC if the controller supports it */
-	ret = i2c_smbus_read_byte_data(client, PMBUS_CAPABILITY);
-	if (ret >= 0 && (ret & PB_CAPABILITY_ERROR_CHECK))
-		client->flags |= I2C_CLIENT_PEC;
+	if (!(data->flags & PMBUS_NO_CAPABILITY)) {
+		ret = i2c_smbus_read_byte_data(client, PMBUS_CAPABILITY);
+		if (ret >= 0 && (ret & PB_CAPABILITY_ERROR_CHECK))
+			client->flags |= I2C_CLIENT_PEC;
+	}
 
 	/*
 	 * Check if the chip is write protected. If it is, we can not clear
diff --git a/include/linux/pmbus.h b/include/linux/pmbus.h
index 1ea5bae708a1..12cbbf305969 100644
--- a/include/linux/pmbus.h
+++ b/include/linux/pmbus.h
@@ -34,6 +34,15 @@
  */
 #define PMBUS_WRITE_PROTECTED	BIT(1)
 
+/*
+ * PMBUS_NO_CAPABILITY
+ *
+ * Some PMBus chips don't respond with valid data when reading the CAPABILITY
+ * register. For such chips, this flag should be set so that the PMBus core
+ * driver doesn't use CAPABILITY to determine it's behavior.
+ */
+#define PMBUS_NO_CAPABILITY			BIT(2)
+
 struct pmbus_platform_data {
 	u32 flags;		/* Device specific flags */
 
-- 
cgit v1.2.3


From 772412176fb98493158929b220fe250127f611af Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 27 Jan 2021 11:31:39 -0800
Subject: bpf: Allow rewriting to ports under ip_unprivileged_port_start

At the moment, BPF_CGROUP_INET{4,6}_BIND hooks can rewrite user_port
to the privileged ones (< ip_unprivileged_port_start), but it will
be rejected later on in the __inet_bind or __inet6_bind.

Let's add another return value to indicate that CAP_NET_BIND_SERVICE
check should be ignored. Use the same idea as we currently use
in cgroup/egress where bit #1 indicates CN. Instead, for
cgroup/bind{4,6}, bit #1 indicates that CAP_NET_BIND_SERVICE should
be bypassed.

v5:
- rename flags to be less confusing (Andrey Ignatov)
- rework BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY to work on flags
  and accept BPF_RET_SET_CN (no behavioral changes)

v4:
- Add missing IPv6 support (Martin KaFai Lau)

v3:
- Update description (Martin KaFai Lau)
- Fix capability restore in selftest (Martin KaFai Lau)

v2:
- Switch to explicit return code (Martin KaFai Lau)

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Andrey Ignatov <rdna@fb.com>
Link: https://lore.kernel.org/bpf/20210127193140.3170382-1-sdf@google.com
---
 include/linux/bpf-cgroup.h | 38 ++++++++++++++++++++++++---------
 include/linux/bpf.h        | 52 +++++++++++++++++++++++++++++-----------------
 include/net/inet_common.h  |  2 ++
 kernel/bpf/cgroup.c        |  8 +++++--
 kernel/bpf/verifier.c      |  3 +++
 net/ipv4/af_inet.c         |  9 +++++---
 net/ipv6/af_inet6.c        |  9 +++++---
 7 files changed, 84 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 0748fd87969e..c42e02b4d84b 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -125,7 +125,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
 				      enum bpf_attach_type type,
-				      void *t_ctx);
+				      void *t_ctx,
+				      u32 *flags);
 
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
@@ -231,30 +232,48 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 
 #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)				       \
 ({									       \
+	u32 __unused_flags;						       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(type))					       \
 		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
-							  NULL);	       \
+							  NULL,		       \
+							  &__unused_flags);    \
 	__ret;								       \
 })
 
 #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)		       \
 ({									       \
+	u32 __unused_flags;						       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(type))	{				       \
 		lock_sock(sk);						       \
 		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
-							  t_ctx);	       \
+							  t_ctx,	       \
+							  &__unused_flags);    \
 		release_sock(sk);					       \
 	}								       \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL)
-
-#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL)
+/* BPF_CGROUP_INET4_BIND and BPF_CGROUP_INET6_BIND can return extra flags
+ * via upper bits of return code. The only flag that is supported
+ * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check
+ * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE).
+ */
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags)	       \
+({									       \
+	u32 __flags = 0;						       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled(type))	{				       \
+		lock_sock(sk);						       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+							  NULL, &__flags);     \
+		release_sock(sk);					       \
+		if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE)	       \
+			*bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE;	       \
+	}								       \
+	__ret;								       \
+})
 
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk)				       \
 	((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) ||		       \
@@ -453,8 +472,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1aac2af12fed..321966fc35db 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1073,6 +1073,34 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *include_prog,
 			struct bpf_prog_array **new_array);
 
+/* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */
+#define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE			(1 << 0)
+/* BPF program asks to set CN on the packet. */
+#define BPF_RET_SET_CN						(1 << 0)
+
+#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags)		\
+	({								\
+		struct bpf_prog_array_item *_item;			\
+		struct bpf_prog *_prog;					\
+		struct bpf_prog_array *_array;				\
+		u32 _ret = 1;						\
+		u32 func_ret;						\
+		migrate_disable();					\
+		rcu_read_lock();					\
+		_array = rcu_dereference(array);			\
+		_item = &_array->items[0];				\
+		while ((_prog = READ_ONCE(_item->prog))) {		\
+			bpf_cgroup_storage_set(_item->cgroup_storage);	\
+			func_ret = func(_prog, ctx);			\
+			_ret &= (func_ret & 1);				\
+			*(ret_flags) |= (func_ret >> 1);			\
+			_item++;					\
+		}							\
+		rcu_read_unlock();					\
+		migrate_enable();					\
+		_ret;							\
+	 })
+
 #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null)	\
 	({						\
 		struct bpf_prog_array_item *_item;	\
@@ -1120,25 +1148,11 @@ _out:							\
  */
 #define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func)		\
 	({						\
-		struct bpf_prog_array_item *_item;	\
-		struct bpf_prog *_prog;			\
-		struct bpf_prog_array *_array;		\
-		u32 ret;				\
-		u32 _ret = 1;				\
-		u32 _cn = 0;				\
-		migrate_disable();			\
-		rcu_read_lock();			\
-		_array = rcu_dereference(array);	\
-		_item = &_array->items[0];		\
-		while ((_prog = READ_ONCE(_item->prog))) {		\
-			bpf_cgroup_storage_set(_item->cgroup_storage);	\
-			ret = func(_prog, ctx);		\
-			_ret &= (ret & 1);		\
-			_cn |= (ret & 2);		\
-			_item++;			\
-		}					\
-		rcu_read_unlock();			\
-		migrate_enable();			\
+		u32 _flags = 0;				\
+		bool _cn;				\
+		u32 _ret;				\
+		_ret = BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, &_flags); \
+		_cn = _flags & BPF_RET_SET_CN;		\
 		if (_ret)				\
 			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);	\
 		else					\
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index cb2818862919..cad2a611efde 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -41,6 +41,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
 #define BIND_WITH_LOCK			(1 << 1)
 /* Called from BPF program. */
 #define BIND_FROM_BPF			(1 << 2)
+/* Skip CAP_NET_BIND_SERVICE check. */
+#define BIND_NO_CAP_NET_BIND_SERVICE	(1 << 3)
 int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
 		u32 flags);
 int inet_getname(struct socket *sock, struct sockaddr *uaddr,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index da649f20d6b2..cdf3c7e611d9 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1055,6 +1055,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  * @uaddr: sockaddr struct provided by user
  * @type: The type of program to be exectuted
  * @t_ctx: Pointer to attach type specific context
+ * @flags: Pointer to u32 which contains higher bits of BPF program
+ *         return value (OR'ed together).
  *
  * socket is expected to be of type INET or INET6.
  *
@@ -1064,7 +1066,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
 				      enum bpf_attach_type type,
-				      void *t_ctx)
+				      void *t_ctx,
+				      u32 *flags)
 {
 	struct bpf_sock_addr_kern ctx = {
 		.sk = sk,
@@ -1087,7 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	}
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+	ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx,
+				       BPF_PROG_RUN, flags);
 
 	return ret == 1 ? 0 : -EPERM;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d0eae51b31e4..972fc38eb62d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7986,6 +7986,9 @@ static int check_return_code(struct bpf_verifier_env *env)
 		    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
 			range = tnum_range(1, 1);
+		if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
+		    env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
+			range = tnum_range(0, 3);
 		break;
 	case BPF_PROG_TYPE_CGROUP_SKB:
 		if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6ba2930ff49b..aaa94bea19c3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -438,6 +438,7 @@ EXPORT_SYMBOL(inet_release);
 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
+	u32 flags = BIND_WITH_LOCK;
 	int err;
 
 	/* If the socket has its own bind function then use it. (RAW) */
@@ -450,11 +451,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	/* BPF prog is run before any checks are done so that if the prog
 	 * changes context in a wrong way it will be caught.
 	 */
-	err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr);
+	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+						 BPF_CGROUP_INET4_BIND, &flags);
 	if (err)
 		return err;
 
-	return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
+	return __inet_bind(sk, uaddr, addr_len, flags);
 }
 EXPORT_SYMBOL(inet_bind);
 
@@ -499,7 +501,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
 
 	snum = ntohs(addr->sin_port);
 	err = -EACCES;
-	if (snum && inet_port_requires_bind_service(net, snum) &&
+	if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+	    snum && inet_port_requires_bind_service(net, snum) &&
 	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
 		goto out;
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b9c654836b72..f091fe9b4da5 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -295,7 +295,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
 		return -EINVAL;
 
 	snum = ntohs(addr->sin6_port);
-	if (snum && inet_port_requires_bind_service(net, snum) &&
+	if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+	    snum && inet_port_requires_bind_service(net, snum) &&
 	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
 		return -EACCES;
 
@@ -439,6 +440,7 @@ out_unlock:
 int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
+	u32 flags = BIND_WITH_LOCK;
 	int err = 0;
 
 	/* If the socket has its own bind function then use it. */
@@ -451,11 +453,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	/* BPF prog is run before any checks are done so that if the prog
 	 * changes context in a wrong way it will be caught.
 	 */
-	err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr);
+	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+						 BPF_CGROUP_INET6_BIND, &flags);
 	if (err)
 		return err;
 
-	return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
+	return __inet6_bind(sk, uaddr, addr_len, flags);
 }
 EXPORT_SYMBOL(inet6_bind);
 
-- 
cgit v1.2.3


From 3d347b1b19da20f973d1d3c6bb60c11185606afd Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Tue, 26 Jan 2021 15:24:07 -0800
Subject: net/mlx5: Add support for devlink traps in mlx5 core driver

Add devlink traps infra-structure to mlx5 core driver. Add traps list to
mlx5_priv and corresponding API:
 - mlx5_devlink_trap_report() to wrap trap reports to devlink
 - mlx5_devlink_trap_get_num_active() to decide whether to open/close trap
   resources.

Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 84 +++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/devlink.h | 16 +++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c    |  2 +
 include/linux/mlx5/driver.h                       |  1 +
 4 files changed, 103 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 3261d0dc1104..f04afaf785cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -168,6 +168,56 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a
 	return 0;
 }
 
+static struct mlx5_devlink_trap *mlx5_find_trap_by_id(struct mlx5_core_dev *dev, int trap_id)
+{
+	struct mlx5_devlink_trap *dl_trap;
+
+	list_for_each_entry(dl_trap, &dev->priv.traps, list)
+		if (dl_trap->trap.id == trap_id)
+			return dl_trap;
+
+	return NULL;
+}
+
+static int mlx5_devlink_trap_init(struct devlink *devlink, const struct devlink_trap *trap,
+				  void *trap_ctx)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_devlink_trap *dl_trap;
+
+	dl_trap = kzalloc(sizeof(*dl_trap), GFP_KERNEL);
+	if (!dl_trap)
+		return -ENOMEM;
+
+	dl_trap->trap.id = trap->id;
+	dl_trap->trap.action = DEVLINK_TRAP_ACTION_DROP;
+	dl_trap->item = trap_ctx;
+
+	if (mlx5_find_trap_by_id(dev, trap->id)) {
+		kfree(dl_trap);
+		mlx5_core_err(dev, "Devlink trap: Trap 0x%x already found", trap->id);
+		return -EEXIST;
+	}
+
+	list_add_tail(&dl_trap->list, &dev->priv.traps);
+	return 0;
+}
+
+static void mlx5_devlink_trap_fini(struct devlink *devlink, const struct devlink_trap *trap,
+				   void *trap_ctx)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_devlink_trap *dl_trap;
+
+	dl_trap = mlx5_find_trap_by_id(dev, trap->id);
+	if (!dl_trap) {
+		mlx5_core_err(dev, "Devlink trap: Missing trap id 0x%x", trap->id);
+		return;
+	}
+	list_del(&dl_trap->list);
+	kfree(dl_trap);
+}
+
 static const struct devlink_ops mlx5_devlink_ops = {
 #ifdef CONFIG_MLX5_ESWITCH
 	.eswitch_mode_set = mlx5_devlink_eswitch_mode_set,
@@ -186,8 +236,42 @@ static const struct devlink_ops mlx5_devlink_ops = {
 	.reload_limits = BIT(DEVLINK_RELOAD_LIMIT_NO_RESET),
 	.reload_down = mlx5_devlink_reload_down,
 	.reload_up = mlx5_devlink_reload_up,
+	.trap_init = mlx5_devlink_trap_init,
+	.trap_fini = mlx5_devlink_trap_fini,
 };
 
+void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb,
+			      struct devlink_port *dl_port)
+{
+	struct devlink *devlink = priv_to_devlink(dev);
+	struct mlx5_devlink_trap *dl_trap;
+
+	dl_trap = mlx5_find_trap_by_id(dev, trap_id);
+	if (!dl_trap) {
+		mlx5_core_err(dev, "Devlink trap: Report on invalid trap id 0x%x", trap_id);
+		return;
+	}
+
+	if (dl_trap->trap.action != DEVLINK_TRAP_ACTION_TRAP) {
+		mlx5_core_dbg(dev, "Devlink trap: Trap id %d has action %d", trap_id,
+			      dl_trap->trap.action);
+		return;
+	}
+	devlink_trap_report(devlink, skb, dl_trap->item, dl_port, NULL);
+}
+
+int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev)
+{
+	struct mlx5_devlink_trap *dl_trap;
+	int count = 0;
+
+	list_for_each_entry(dl_trap, &dev->priv.traps, list)
+		if (dl_trap->trap.action == DEVLINK_TRAP_ACTION_TRAP)
+			count++;
+
+	return count;
+}
+
 struct devlink *mlx5_devlink_alloc(void)
 {
 	return devlink_alloc(&mlx5_devlink_ops, sizeof(struct mlx5_core_dev));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index f0de327a59be..a9829006fa78 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -12,6 +12,22 @@ enum mlx5_devlink_param_id {
 	MLX5_DEVLINK_PARAM_ID_ESW_LARGE_GROUP_NUM,
 };
 
+struct mlx5_trap_ctx {
+	int id;
+	int action;
+};
+
+struct mlx5_devlink_trap {
+	struct mlx5_trap_ctx trap;
+	void *item;
+	struct list_head list;
+};
+
+struct mlx5_core_dev;
+void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb,
+			      struct devlink_port *dl_port);
+int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev);
+
 struct devlink *mlx5_devlink_alloc(void);
 void mlx5_devlink_free(struct devlink *devlink);
 int mlx5_devlink_register(struct devlink *devlink, struct device *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index ca6f2fc39ea0..bfedf064db1a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1305,6 +1305,8 @@ static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx)
 
 	priv->dbg_root = debugfs_create_dir(dev_name(dev->device),
 					    mlx5_debugfs_root);
+	INIT_LIST_HEAD(&priv->traps);
+
 	err = mlx5_health_init(dev);
 	if (err)
 		goto err_health_init;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f93bfe7473aa..c4615dc51b6f 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -564,6 +564,7 @@ struct mlx5_priv {
 	int			host_pf_pages;
 
 	struct mlx5_core_health health;
+	struct list_head	traps;
 
 	/* start: qp staff */
 	struct dentry	       *qp_debugfs;
-- 
cgit v1.2.3


From 241dc159391fb9d351362d911a39dff84074cc92 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Tue, 26 Jan 2021 15:24:11 -0800
Subject: net/mlx5: Notify on trap action by blocking event

In order to allow mlx5 core driver to trigger synchronous operations to
its consumers, add a blocking events handler. Add wrappers to
blocking_notifier_[call_chain/chain_register/chain_unregister]. Add trap
callback for action set and notify about this change. Following patches
in the set add a listener for this event.

Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 36 +++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/events.c  | 28 ++++++++++++++++++
 include/linux/mlx5/device.h                       |  4 +++
 include/linux/mlx5/driver.h                       | 15 ++++++++++
 4 files changed, 83 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index f081eff9be25..c47291467cb0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -218,6 +218,41 @@ static void mlx5_devlink_trap_fini(struct devlink *devlink, const struct devlink
 	kfree(dl_trap);
 }
 
+static int mlx5_devlink_trap_action_set(struct devlink *devlink,
+					const struct devlink_trap *trap,
+					enum devlink_trap_action action,
+					struct netlink_ext_ack *extack)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	enum devlink_trap_action action_orig;
+	struct mlx5_devlink_trap *dl_trap;
+	int err = 0;
+
+	dl_trap = mlx5_find_trap_by_id(dev, trap->id);
+	if (!dl_trap) {
+		mlx5_core_err(dev, "Devlink trap: Set action on invalid trap id 0x%x", trap->id);
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (action != DEVLINK_TRAP_ACTION_DROP && action != DEVLINK_TRAP_ACTION_TRAP) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (action == dl_trap->trap.action)
+		goto out;
+
+	action_orig = dl_trap->trap.action;
+	dl_trap->trap.action = action;
+	err = mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_TYPE_TRAP,
+						&dl_trap->trap);
+	if (err)
+		dl_trap->trap.action = action_orig;
+out:
+	return err;
+}
+
 static const struct devlink_ops mlx5_devlink_ops = {
 #ifdef CONFIG_MLX5_ESWITCH
 	.eswitch_mode_set = mlx5_devlink_eswitch_mode_set,
@@ -238,6 +273,7 @@ static const struct devlink_ops mlx5_devlink_ops = {
 	.reload_up = mlx5_devlink_reload_up,
 	.trap_init = mlx5_devlink_trap_init,
 	.trap_fini = mlx5_devlink_trap_fini,
+	.trap_action_set = mlx5_devlink_trap_action_set,
 };
 
 void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 054c0bc36d24..670f25f5ffd1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -61,6 +61,8 @@ struct mlx5_events {
 	struct mlx5_pme_stats pme_stats;
 	/*pcie_core*/
 	struct work_struct pcie_core_work;
+	/* driver notifier chain for sw events */
+	struct blocking_notifier_head sw_nh;
 };
 
 static const char *eqe_type_str(u8 type)
@@ -351,6 +353,7 @@ int mlx5_events_init(struct mlx5_core_dev *dev)
 		return -ENOMEM;
 	}
 	INIT_WORK(&events->pcie_core_work, mlx5_pcie_event);
+	BLOCKING_INIT_NOTIFIER_HEAD(&events->sw_nh);
 
 	return 0;
 }
@@ -406,3 +409,28 @@ int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, voi
 {
 	return atomic_notifier_call_chain(&events->fw_nh, event, data);
 }
+
+/* This API is used only for processing and forwarding driver-specific
+ * events to mlx5 consumers.
+ */
+int mlx5_blocking_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb)
+{
+	struct mlx5_events *events = dev->priv.events;
+
+	return blocking_notifier_chain_register(&events->sw_nh, nb);
+}
+
+int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb)
+{
+	struct mlx5_events *events = dev->priv.events;
+
+	return blocking_notifier_chain_unregister(&events->sw_nh, nb);
+}
+
+int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int event,
+				      void *data)
+{
+	struct mlx5_events *events = dev->priv.events;
+
+	return blocking_notifier_call_chain(&events->sw_nh, event, data);
+}
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index f1de49d64a98..77ba54d38772 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -359,6 +359,10 @@ enum mlx5_event {
 	MLX5_EVENT_TYPE_MAX                = 0x100,
 };
 
+enum mlx5_driver_event {
+	MLX5_DRIVER_EVENT_TYPE_TRAP = 0,
+};
+
 enum {
 	MLX5_TRACER_SUBTYPE_OWNERSHIP_CHANGE = 0x0,
 	MLX5_TRACER_SUBTYPE_TRACES_AVAILABLE = 0x1,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index c4615dc51b6f..45df5b465ba8 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1073,11 +1073,26 @@ enum {
 	MAX_MR_CACHE_ENTRIES
 };
 
+/* Async-atomic event notifier used by mlx5 core to forward FW
+ * evetns recived from event queue to mlx5 consumers.
+ * Optimise event queue dipatching.
+ */
 int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb);
 int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb);
+
+/* Async-atomic event notifier used for forwarding
+ * evetns from the event queue into the to mlx5 events dispatcher,
+ * eswitch, clock and others.
+ */
 int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
 int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
 
+/* Blocking event notifier used to forward SW events, used for slow path */
+int mlx5_blocking_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb);
+int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb);
+int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int event,
+				      void *data);
+
 int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id);
 
 int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev);
-- 
cgit v1.2.3


From 5543e989fe5e2fe6a5829ee42c00152cac2bb8a0 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Tue, 26 Jan 2021 15:24:16 -0800
Subject: net/mlx5e: Add trap entity to ETH driver

Introduce mlx5e_trap which includes a dedicated RQ and NAPI for trapped
packets. Trap-RQ processes packets that were destined to be dropped,
but for debug and visibility sake these packets are trapped and reported
to devlink.
Trap-RQ connects between the HW and the driver and is not a part of a
channel. Open mlx5e_create_rq() and mlx5_core_destroy_rq() as API and
add dedicate RQ handlers which report to devlink of trapped packets.

Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |   7 +
 drivers/net/ethernet/mellanox/mlx5/core/en/trap.c | 409 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en/trap.h |  35 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   5 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |  46 +++
 include/linux/mlx5/device.h                       |   5 +
 7 files changed, 505 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/trap.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index fcfc0b114985..d44f5f6ee449 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -27,7 +27,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
 		en_selftest.o en/port.o en/monitor_stats.o en/health.o \
 		en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \
 		en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \
-		en/qos.o
+		en/qos.o en/trap.o
 
 #
 # Netdev extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index dc4895a1fa9b..f439a977ad61 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -564,6 +564,7 @@ typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
 typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);
 
 int mlx5e_rq_set_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params, bool xsk);
+void mlx5e_rq_set_trap_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params);
 
 enum mlx5e_rq_flag {
 	MLX5E_RQ_FLAG_XDP_XMIT,
@@ -805,6 +806,8 @@ struct mlx5e_htb {
 	u16 defcls;
 };
 
+struct mlx5e_trap;
+
 struct mlx5e_priv {
 	/* priv data path fields - start */
 	/* +1 for port ptp ts */
@@ -844,8 +847,10 @@ struct mlx5e_priv {
 
 	struct mlx5_core_dev      *mdev;
 	struct net_device         *netdev;
+	struct mlx5e_trap         *en_trap;
 	struct mlx5e_stats         stats;
 	struct mlx5e_channel_stats channel_stats[MLX5E_MAX_NUM_CHANNELS];
+	struct mlx5e_channel_stats trap_stats;
 	struct mlx5e_port_ptp_stats port_ptp_stats;
 	u16                        max_nch;
 	u8                         max_opened_tc;
@@ -961,6 +966,8 @@ int mlx5e_open_rq(struct mlx5e_channel *c, struct mlx5e_params *params,
 int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time);
 void mlx5e_deactivate_rq(struct mlx5e_rq *rq);
 void mlx5e_close_rq(struct mlx5e_rq *rq);
+int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param);
+void mlx5e_destroy_rq(struct mlx5e_rq *rq);
 
 struct mlx5e_sq_param;
 int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
new file mode 100644
index 000000000000..5507efacb9dc
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
@@ -0,0 +1,409 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies */
+
+#include <net/page_pool.h>
+#include "en/txrx.h"
+#include "en/params.h"
+#include "en/trap.h"
+
+static int mlx5e_trap_napi_poll(struct napi_struct *napi, int budget)
+{
+	struct mlx5e_trap *trap_ctx = container_of(napi, struct mlx5e_trap, napi);
+	struct mlx5e_ch_stats *ch_stats = trap_ctx->stats;
+	struct mlx5e_rq *rq = &trap_ctx->rq;
+	bool busy = false;
+	int work_done = 0;
+
+	ch_stats->poll++;
+
+	work_done = mlx5e_poll_rx_cq(&rq->cq, budget);
+	busy |= work_done == budget;
+	busy |= rq->post_wqes(rq);
+
+	if (busy)
+		return budget;
+
+	if (unlikely(!napi_complete_done(napi, work_done)))
+		return work_done;
+
+	mlx5e_cq_arm(&rq->cq);
+	return work_done;
+}
+
+static int mlx5e_alloc_trap_rq(struct mlx5e_priv *priv, struct mlx5e_rq_param *rqp,
+			       struct mlx5e_rq_stats *stats, struct mlx5e_params *params,
+			       struct mlx5e_ch_stats *ch_stats,
+			       struct mlx5e_rq *rq)
+{
+	void *rqc_wq = MLX5_ADDR_OF(rqc, rqp->rqc, wq);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct page_pool_params pp_params = {};
+	int node = dev_to_node(mdev->device);
+	u32 pool_size;
+	int wq_sz;
+	int err;
+	int i;
+
+	rqp->wq.db_numa_node = node;
+
+	rq->wq_type  = params->rq_wq_type;
+	rq->pdev     = mdev->device;
+	rq->netdev   = priv->netdev;
+	rq->mdev     = mdev;
+	rq->priv     = priv;
+	rq->stats    = stats;
+	rq->clock    = &mdev->clock;
+	rq->tstamp   = &priv->tstamp;
+	rq->hw_mtu   = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+
+	xdp_rxq_info_unused(&rq->xdp_rxq);
+
+	rq->buff.map_dir = DMA_FROM_DEVICE;
+	rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, NULL);
+	pool_size = 1 << params->log_rq_mtu_frames;
+
+	err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq, &rq->wq_ctrl);
+	if (err)
+		return err;
+
+	rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR];
+
+	wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
+
+	rq->wqe.info = rqp->frags_info;
+	rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride;
+	rq->wqe.frags =	kvzalloc_node(array_size(sizeof(*rq->wqe.frags),
+						 (wq_sz << rq->wqe.info.log_num_frags)),
+				      GFP_KERNEL, node);
+	if (!rq->wqe.frags) {
+		err = -ENOMEM;
+		goto err_wq_cyc_destroy;
+	}
+
+	err = mlx5e_init_di_list(rq, wq_sz, node);
+	if (err)
+		goto err_free_frags;
+
+	rq->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
+
+	mlx5e_rq_set_trap_handlers(rq, params);
+
+	/* Create a page_pool and register it with rxq */
+	pp_params.order     = 0;
+	pp_params.flags     = 0; /* No-internal DMA mapping in page_pool */
+	pp_params.pool_size = pool_size;
+	pp_params.nid       = node;
+	pp_params.dev       = mdev->device;
+	pp_params.dma_dir   = rq->buff.map_dir;
+
+	/* page_pool can be used even when there is no rq->xdp_prog,
+	 * given page_pool does not handle DMA mapping there is no
+	 * required state to clear. And page_pool gracefully handle
+	 * elevated refcnt.
+	 */
+	rq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(rq->page_pool)) {
+		err = PTR_ERR(rq->page_pool);
+		rq->page_pool = NULL;
+		goto err_free_di_list;
+	}
+	for (i = 0; i < wq_sz; i++) {
+		struct mlx5e_rx_wqe_cyc *wqe =
+			mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i);
+		int f;
+
+		for (f = 0; f < rq->wqe.info.num_frags; f++) {
+			u32 frag_size = rq->wqe.info.arr[f].frag_size |
+				MLX5_HW_START_PADDING;
+
+			wqe->data[f].byte_count = cpu_to_be32(frag_size);
+			wqe->data[f].lkey = rq->mkey_be;
+		}
+		/* check if num_frags is not a pow of two */
+		if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) {
+			wqe->data[f].byte_count = 0;
+			wqe->data[f].lkey = cpu_to_be32(MLX5_INVALID_LKEY);
+			wqe->data[f].addr = 0;
+		}
+	}
+	return 0;
+
+err_free_di_list:
+	mlx5e_free_di_list(rq);
+err_free_frags:
+	kvfree(rq->wqe.frags);
+err_wq_cyc_destroy:
+	mlx5_wq_destroy(&rq->wq_ctrl);
+
+	return err;
+}
+
+static void mlx5e_free_trap_rq(struct mlx5e_rq *rq)
+{
+	page_pool_destroy(rq->page_pool);
+	mlx5e_free_di_list(rq);
+	kvfree(rq->wqe.frags);
+	mlx5_wq_destroy(&rq->wq_ctrl);
+}
+
+static int mlx5e_open_trap_rq(struct mlx5e_priv *priv, struct napi_struct *napi,
+			      struct mlx5e_rq_stats *stats, struct mlx5e_params *params,
+			      struct mlx5e_rq_param *rq_param,
+			      struct mlx5e_ch_stats *ch_stats,
+			      struct mlx5e_rq *rq)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_create_cq_param ccp = {};
+	struct dim_cq_moder trap_moder = {};
+	struct mlx5e_cq *cq = &rq->cq;
+	int err;
+
+	ccp.node     = dev_to_node(mdev->device);
+	ccp.ch_stats = ch_stats;
+	ccp.napi     = napi;
+	ccp.ix       = 0;
+	err = mlx5e_open_cq(priv, trap_moder, &rq_param->cqp, &ccp, cq);
+	if (err)
+		return err;
+
+	err = mlx5e_alloc_trap_rq(priv, rq_param, stats, params, ch_stats, rq);
+	if (err)
+		goto err_destroy_cq;
+
+	err = mlx5e_create_rq(rq, rq_param);
+	if (err)
+		goto err_free_rq;
+
+	err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+	if (err)
+		goto err_destroy_rq;
+
+	return 0;
+
+err_destroy_rq:
+	mlx5e_destroy_rq(rq);
+	mlx5e_free_rx_descs(rq);
+err_free_rq:
+	mlx5e_free_trap_rq(rq);
+err_destroy_cq:
+	mlx5e_close_cq(cq);
+
+	return err;
+}
+
+static void mlx5e_close_trap_rq(struct mlx5e_rq *rq)
+{
+	mlx5e_destroy_rq(rq);
+	mlx5e_free_rx_descs(rq);
+	mlx5e_free_trap_rq(rq);
+	mlx5e_close_cq(&rq->cq);
+}
+
+static int mlx5e_create_trap_direct_rq_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir,
+					   u32 rqn)
+{
+	void *tirc;
+	int inlen;
+	u32 *in;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(create_tir_in);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+	MLX5_SET(tirc, tirc, transport_domain, mdev->mlx5e_res.td.tdn);
+	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_NONE);
+	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
+	MLX5_SET(tirc, tirc, inline_rqn, rqn);
+	err = mlx5e_create_tir(mdev, tir, in);
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5e_destroy_trap_direct_rq_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir)
+{
+	mlx5e_destroy_tir(mdev, tir);
+}
+
+static void mlx5e_activate_trap_rq(struct mlx5e_rq *rq)
+{
+	set_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);
+}
+
+static void mlx5e_deactivate_trap_rq(struct mlx5e_rq *rq)
+{
+	clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);
+}
+
+static void mlx5e_build_trap_params(struct mlx5e_priv *priv, struct mlx5e_trap *t)
+{
+	struct mlx5e_params *params = &t->params;
+
+	params->rq_wq_type = MLX5_WQ_TYPE_CYCLIC;
+	mlx5e_init_rq_type_params(priv->mdev, params);
+	params->sw_mtu = priv->netdev->max_mtu;
+	mlx5e_build_rq_param(priv, params, NULL, &t->rq_param);
+}
+
+static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv)
+{
+	int cpu = cpumask_first(mlx5_comp_irq_get_affinity_mask(priv->mdev, 0));
+	struct net_device *netdev = priv->netdev;
+	struct mlx5e_trap *t;
+	int err;
+
+	t = kvzalloc_node(sizeof(*t), GFP_KERNEL, cpu_to_node(cpu));
+	if (!t)
+		return ERR_PTR(-ENOMEM);
+
+	mlx5e_build_trap_params(priv, t);
+
+	t->priv     = priv;
+	t->mdev     = priv->mdev;
+	t->tstamp   = &priv->tstamp;
+	t->pdev     = mlx5_core_dma_dev(priv->mdev);
+	t->netdev   = priv->netdev;
+	t->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
+	t->stats    = &priv->trap_stats.ch;
+
+	netif_napi_add(netdev, &t->napi, mlx5e_trap_napi_poll, 64);
+
+	err = mlx5e_open_trap_rq(priv, &t->napi,
+				 &priv->trap_stats.rq,
+				 &t->params, &t->rq_param,
+				 &priv->trap_stats.ch,
+				 &t->rq);
+	if (unlikely(err))
+		goto err_napi_del;
+
+	err = mlx5e_create_trap_direct_rq_tir(t->mdev, &t->tir, t->rq.rqn);
+	if (err)
+		goto err_close_trap_rq;
+
+	return t;
+
+err_close_trap_rq:
+	mlx5e_close_trap_rq(&t->rq);
+err_napi_del:
+	netif_napi_del(&t->napi);
+	kvfree(t);
+	return ERR_PTR(err);
+}
+
+void mlx5e_close_trap(struct mlx5e_trap *trap)
+{
+	mlx5e_destroy_trap_direct_rq_tir(trap->mdev, &trap->tir);
+	mlx5e_close_trap_rq(&trap->rq);
+	netif_napi_del(&trap->napi);
+	kvfree(trap);
+}
+
+static void mlx5e_activate_trap(struct mlx5e_trap *trap)
+{
+	napi_enable(&trap->napi);
+	mlx5e_activate_trap_rq(&trap->rq);
+	napi_schedule(&trap->napi);
+}
+
+void mlx5e_deactivate_trap(struct mlx5e_priv *priv)
+{
+	struct mlx5e_trap *trap = priv->en_trap;
+
+	mlx5e_deactivate_trap_rq(&trap->rq);
+	napi_disable(&trap->napi);
+}
+
+static struct mlx5e_trap *mlx5e_add_trap_queue(struct mlx5e_priv *priv)
+{
+	struct mlx5e_trap *trap;
+
+	trap = mlx5e_open_trap(priv);
+	if (IS_ERR(trap))
+		goto out;
+
+	mlx5e_activate_trap(trap);
+out:
+	return trap;
+}
+
+static void mlx5e_del_trap_queue(struct mlx5e_priv *priv)
+{
+	mlx5e_deactivate_trap(priv);
+	mlx5e_close_trap(priv->en_trap);
+	priv->en_trap = NULL;
+}
+
+static int mlx5e_trap_get_tirn(struct mlx5e_trap *en_trap)
+{
+	return en_trap->tir.tirn;
+}
+
+static int mlx5e_handle_action_trap(struct mlx5e_priv *priv, int trap_id)
+{
+	bool open_queue = !priv->en_trap;
+	struct mlx5e_trap *trap;
+	int err;
+
+	if (open_queue) {
+		trap = mlx5e_add_trap_queue(priv);
+		if (IS_ERR(trap))
+			return PTR_ERR(trap);
+		priv->en_trap = trap;
+	}
+
+	switch (trap_id) {
+	case DEVLINK_TRAP_GENERIC_ID_INGRESS_VLAN_FILTER:
+		err = mlx5e_add_vlan_trap(priv, trap_id, mlx5e_trap_get_tirn(priv->en_trap));
+		if (err)
+			goto err_out;
+		break;
+	default:
+		netdev_warn(priv->netdev, "%s: Unknown trap id %d\n", __func__, trap_id);
+		err = -EINVAL;
+		goto err_out;
+	}
+	return 0;
+
+err_out:
+	if (open_queue)
+		mlx5e_del_trap_queue(priv);
+	return err;
+}
+
+static int mlx5e_handle_action_drop(struct mlx5e_priv *priv, int trap_id)
+{
+	switch (trap_id) {
+	case DEVLINK_TRAP_GENERIC_ID_INGRESS_VLAN_FILTER:
+		mlx5e_remove_vlan_trap(priv);
+		break;
+	default:
+		netdev_warn(priv->netdev, "%s: Unknown trap id %d\n", __func__, trap_id);
+		return -EINVAL;
+	}
+	if (priv->en_trap && !mlx5_devlink_trap_get_num_active(priv->mdev))
+		mlx5e_del_trap_queue(priv);
+
+	return 0;
+}
+
+int mlx5e_handle_trap_event(struct mlx5e_priv *priv, struct mlx5_trap_ctx *trap_ctx)
+{
+	int err = 0;
+
+	switch (trap_ctx->action) {
+	case DEVLINK_TRAP_ACTION_TRAP:
+		err = mlx5e_handle_action_trap(priv, trap_ctx->id);
+		break;
+	case DEVLINK_TRAP_ACTION_DROP:
+		err = mlx5e_handle_action_drop(priv, trap_ctx->id);
+		break;
+	default:
+		netdev_warn(priv->netdev, "%s: Unsupported action %d\n", __func__,
+			    trap_ctx->action);
+		err = -EINVAL;
+	}
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h
new file mode 100644
index 000000000000..cc1fa9f12c45
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020, Mellanox Technologies */
+
+#ifndef __MLX5E_TRAP_H__
+#define __MLX5E_TRAP_H__
+
+#include "../en.h"
+#include "../devlink.h"
+
+struct mlx5e_trap {
+	/* data path */
+	struct mlx5e_rq            rq;
+	struct mlx5e_tir           tir;
+	struct napi_struct         napi;
+	struct device             *pdev;
+	struct net_device         *netdev;
+	__be32                     mkey_be;
+
+	/* data path - accessed per napi poll */
+	struct mlx5e_ch_stats     *stats;
+
+	/* control */
+	struct mlx5e_priv         *priv;
+	struct mlx5_core_dev      *mdev;
+	struct hwtstamp_config    *tstamp;
+	DECLARE_BITMAP(state, MLX5E_CHANNEL_NUM_STATES);
+
+	struct mlx5e_params        params;
+	struct mlx5e_rq_param      rq_param;
+};
+
+void mlx5e_close_trap(struct mlx5e_trap *trap);
+void mlx5e_deactivate_trap(struct mlx5e_priv *priv);
+int mlx5e_handle_trap_event(struct mlx5e_priv *priv, struct mlx5_trap_ctx *trap_ctx);
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bed2f1a6d730..ec5bb48cb54a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -649,8 +649,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 	mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
-static int mlx5e_create_rq(struct mlx5e_rq *rq,
-			   struct mlx5e_rq_param *param)
+int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 {
 	struct mlx5_core_dev *mdev = rq->mdev;
 
@@ -773,7 +772,7 @@ static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd)
 	return err;
 }
 
-static void mlx5e_destroy_rq(struct mlx5e_rq *rq)
+void mlx5e_destroy_rq(struct mlx5e_rq *rq)
 {
 	mlx5_core_destroy_rq(rq->mdev, rq->rqn);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index dec93d57542f..98b56f495b32 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -52,6 +52,7 @@
 #include "en/xsk/rx.h"
 #include "en/health.h"
 #include "en/params.h"
+#include "devlink.h"
 
 static struct sk_buff *
 mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
@@ -1815,3 +1816,48 @@ int mlx5e_rq_set_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params, bool
 
 	return 0;
 }
+
+static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	struct mlx5e_priv *priv = netdev_priv(rq->netdev);
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	struct mlx5e_wqe_frag_info *wi;
+	struct sk_buff *skb;
+	u32 cqe_bcnt;
+	u16 trap_id;
+	u16 ci;
+
+	trap_id  = get_cqe_flow_tag(cqe);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
+
+	if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
+		rq->stats->wqe_err++;
+		goto free_wqe;
+	}
+
+	skb = mlx5e_skb_from_cqe_nonlinear(rq, cqe, wi, cqe_bcnt);
+	if (!skb)
+		goto free_wqe;
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+	skb_push(skb, ETH_HLEN);
+
+	mlx5_devlink_trap_report(rq->mdev, trap_id, skb, &priv->dl_port);
+	dev_kfree_skb_any(skb);
+
+free_wqe:
+	mlx5e_free_rx_wqe(rq, wi, false);
+	mlx5_wq_cyc_pop(wq);
+}
+
+void mlx5e_rq_set_trap_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params)
+{
+	rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(params, NULL) ?
+			       mlx5e_skb_from_cqe_linear :
+			       mlx5e_skb_from_cqe_nonlinear;
+	rq->post_wqes = mlx5e_post_rx_wqes;
+	rq->dealloc_wqe = mlx5e_dealloc_rx_wqe;
+	rq->handle_rx_cqe = mlx5e_trap_handle_rx_cqe;
+}
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 77ba54d38772..00057eae89ab 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -903,6 +903,11 @@ static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe)
 	return (u64)lo | ((u64)hi << 32);
 }
 
+static inline u16 get_cqe_flow_tag(struct mlx5_cqe64 *cqe)
+{
+	return be32_to_cpu(cqe->sop_drop_qpn) & 0xFFF;
+}
+
 #define MLX5_MPWQE_LOG_NUM_STRIDES_BASE	(9)
 #define MLX5_MPWQE_LOG_STRIDE_SZ_BASE	(6)
 
-- 
cgit v1.2.3


From a8ce9ebbecdfda3322bbcece6b3b25888217f8e3 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Fri, 15 Jan 2021 08:42:02 +0800
Subject: iommu/vt-d: Preset Access/Dirty bits for IOVA over FL

The Access/Dirty bits in the first level page table entry will be set
whenever a page table entry was used for address translation or write
permission was successfully translated. This is always true when using
the first-level page table for kernel IOVA. Instead of wasting hardware
cycles to update the certain bits, it's better to set them up at the
beginning.

Suggested-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210115004202.953965-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 14 ++++++++++++--
 include/linux/intel-iommu.h |  2 ++
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index f665322a0991..be85af612bc1 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1017,8 +1017,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 
 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
-			if (domain_use_first_level(domain))
+			if (domain_use_first_level(domain)) {
 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
+				if (domain->domain.type == IOMMU_DOMAIN_DMA)
+					pteval |= DMA_FL_PTE_ACCESS;
+			}
 			if (cmpxchg64(&pte->val, 0ULL, pteval))
 				/* Someone else set it while we were thinking; use theirs. */
 				free_pgtable_page(tmp_page);
@@ -2310,9 +2313,16 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 		return -EINVAL;
 
 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
-	if (domain_use_first_level(domain))
+	if (domain_use_first_level(domain)) {
 		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
 
+		if (domain->domain.type == IOMMU_DOMAIN_DMA) {
+			attr |= DMA_FL_PTE_ACCESS;
+			if (prot & DMA_PTE_WRITE)
+				attr |= DMA_FL_PTE_DIRTY;
+		}
+	}
+
 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
 
 	while (nr_pages > 0) {
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 09c6a0bf3892..ecb35fdff03e 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -42,6 +42,8 @@
 
 #define DMA_FL_PTE_PRESENT	BIT_ULL(0)
 #define DMA_FL_PTE_US		BIT_ULL(2)
+#define DMA_FL_PTE_ACCESS	BIT_ULL(5)
+#define DMA_FL_PTE_DIRTY	BIT_ULL(6)
 #define DMA_FL_PTE_XD		BIT_ULL(63)
 
 #define ADDR_WIDTH_5LEVEL	(57)
-- 
cgit v1.2.3


From 1ce53e2c2ac069e7b3c400a427002a70deb4a916 Mon Sep 17 00:00:00 2001
From: Alejandro Colomar <alx.manpages@gmail.com>
Date: Sat, 28 Nov 2020 13:39:46 +0100
Subject: futex: Change utime parameter to be 'const ... *'

futex(2) says that 'utime' is a pointer to 'const'.  The implementation
doesn't use 'const'; however, it _never_ modifies the contents of utime.

- futex() either uses 'utime' as a pointer to struct or as a 'u32'.

- In case it's used as a 'u32', it makes a copy of it, and of course it is
  not dereferenced.

- In case it's used as a 'struct __kernel_timespec __user *', the pointer
  is not dereferenced inside the futex() definition, and it is only passed
  to a function: get_timespec64(), which accepts a 'const struct
  __kernel_timespec __user *'.

[ tglx: Make the same change to the compat syscall and fixup the prototypes. ]

Signed-off-by: Alejandro Colomar <alx.manpages@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20201128123945.4592-1-alx.manpages@gmail.com
---
 include/linux/syscalls.h | 8 ++++----
 kernel/futex.c           | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f3929aff39cf..5cb74edd9a4f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -583,11 +583,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags);
 
 /* kernel/futex.c */
 asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
-			struct __kernel_timespec __user *utime, u32 __user *uaddr2,
-			u32 val3);
+			  const struct __kernel_timespec __user *utime,
+			  u32 __user *uaddr2, u32 val3);
 asmlinkage long sys_futex_time32(u32 __user *uaddr, int op, u32 val,
-			struct old_timespec32 __user *utime, u32 __user *uaddr2,
-			u32 val3);
+				 const struct old_timespec32 __user *utime,
+				 u32 __user *uaddr2, u32 val3);
 asmlinkage long sys_get_robust_list(int pid,
 				    struct robust_list_head __user * __user *head_ptr,
 				    size_t __user *len_ptr);
diff --git a/kernel/futex.c b/kernel/futex.c
index c47d1015d759..d0775aab8da9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3790,8 +3790,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 
 
 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
-		struct __kernel_timespec __user *, utime, u32 __user *, uaddr2,
-		u32, val3)
+		const struct __kernel_timespec __user *, utime,
+		u32 __user *, uaddr2, u32, val3)
 {
 	struct timespec64 ts;
 	ktime_t t, *tp = NULL;
@@ -3986,7 +3986,7 @@ err_unlock:
 
 #ifdef CONFIG_COMPAT_32BIT_TIME
 SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
-		struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
+		const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
 		u32, val3)
 {
 	struct timespec64 ts;
-- 
cgit v1.2.3


From 3ab657291638ea267654c3e4798161b2cee6ae01 Mon Sep 17 00:00:00 2001
From: Lianbo Jiang <lijiang@redhat.com>
Date: Tue, 26 Jan 2021 19:53:37 +0800
Subject: iommu: use the __iommu_attach_device() directly for deferred attach

Currently, because domain attach allows to be deferred from iommu
driver to device driver, and when iommu initializes, the devices
on the bus will be scanned and the default groups will be allocated.

Due to the above changes, some devices could be added to the same
group as below:

[    3.859417] pci 0000:01:00.0: Adding to iommu group 16
[    3.864572] pci 0000:01:00.1: Adding to iommu group 16
[    3.869738] pci 0000:02:00.0: Adding to iommu group 17
[    3.874892] pci 0000:02:00.1: Adding to iommu group 17

But when attaching these devices, it doesn't allow that a group has
more than one device, otherwise it will return an error. This conflicts
with the deferred attaching. Unfortunately, it has two devices in the
same group for my side, for example:

[    9.627014] iommu_group_device_count(): device name[0]:0000:01:00.0
[    9.633545] iommu_group_device_count(): device name[1]:0000:01:00.1
...
[   10.255609] iommu_group_device_count(): device name[0]:0000:02:00.0
[   10.262144] iommu_group_device_count(): device name[1]:0000:02:00.1

Finally, which caused the failure of tg3 driver when tg3 driver calls
the dma_alloc_coherent() to allocate coherent memory in the tg3_test_dma().

[    9.660310] tg3 0000:01:00.0: DMA engine test failed, aborting
[    9.754085] tg3: probe of 0000:01:00.0 failed with error -12
[    9.997512] tg3 0000:01:00.1: DMA engine test failed, aborting
[   10.043053] tg3: probe of 0000:01:00.1 failed with error -12
[   10.288905] tg3 0000:02:00.0: DMA engine test failed, aborting
[   10.334070] tg3: probe of 0000:02:00.0 failed with error -12
[   10.578303] tg3 0000:02:00.1: DMA engine test failed, aborting
[   10.622629] tg3: probe of 0000:02:00.1 failed with error -12

In addition, the similar situations also occur in other drivers such
as the bnxt_en driver. That can be reproduced easily in kdump kernel
when SME is active.

Let's move the handling currently in iommu_dma_deferred_attach() into
the iommu core code so that it can call the __iommu_attach_device()
directly instead of the iommu_attach_device(). The external interface
iommu_attach_device() is not suitable for handling this situation.

Signed-off-by: Lianbo Jiang <lijiang@redhat.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20210126115337.20068-3-lijiang@redhat.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/dma-iommu.c | 18 +++---------------
 drivers/iommu/iommu.c     | 10 ++++++++++
 include/linux/iommu.h     |  1 +
 3 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c80056f6c9f9..f659395e7959 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -380,18 +380,6 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
 	return iova_reserve_iommu_regions(dev, domain);
 }
 
-static int iommu_dma_deferred_attach(struct device *dev,
-		struct iommu_domain *domain)
-{
-	const struct iommu_ops *ops = domain->ops;
-
-	if (unlikely(ops->is_attach_deferred &&
-			ops->is_attach_deferred(domain, dev)))
-		return iommu_attach_device(domain, dev);
-
-	return 0;
-}
-
 /**
  * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API
  *                    page flags.
@@ -535,7 +523,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
 	dma_addr_t iova;
 
 	if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
-	    iommu_dma_deferred_attach(dev, domain))
+	    iommu_deferred_attach(dev, domain))
 		return DMA_MAPPING_ERROR;
 
 	size = iova_align(iovad, size + iova_off);
@@ -694,7 +682,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
 	*dma_handle = DMA_MAPPING_ERROR;
 
 	if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
-	    iommu_dma_deferred_attach(dev, domain))
+	    iommu_deferred_attach(dev, domain))
 		return NULL;
 
 	min_size = alloc_sizes & -alloc_sizes;
@@ -978,7 +966,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
 	int i;
 
 	if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
-	    iommu_dma_deferred_attach(dev, domain))
+	    iommu_deferred_attach(dev, domain))
 		return 0;
 
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 91f7871f5a37..fd76e2f579fe 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1980,6 +1980,16 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(iommu_attach_device);
 
+int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
+{
+	const struct iommu_ops *ops = domain->ops;
+
+	if (ops->is_attach_deferred && ops->is_attach_deferred(domain, dev))
+		return __iommu_attach_device(domain, dev);
+
+	return 0;
+}
+
 /*
  * Check flags and other user provided data for valid combinations. We also
  * make sure no reserved fields or unused flags are set. This is to ensure
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 91d94c014f47..524ffc2ff64f 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -376,6 +376,7 @@ int  iommu_device_sysfs_add(struct iommu_device *iommu,
 void iommu_device_sysfs_remove(struct iommu_device *iommu);
 int  iommu_device_link(struct iommu_device   *iommu, struct device *link);
 void iommu_device_unlink(struct iommu_device *iommu, struct device *link);
+int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain);
 
 static inline void __iommu_device_set_ops(struct iommu_device *iommu,
 					  const struct iommu_ops *ops)
-- 
cgit v1.2.3


From c9b258c6be09283663c6851725b322568d867c0b Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Tue, 15 Dec 2020 01:36:54 -0600
Subject: iommu/amd: Prepare for generic IO page table framework

Add initial hook up code to implement generic IO page table framework.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Link: https://lore.kernel.org/r/20201215073705.123786-3-suravee.suthikulpanit@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/Kconfig           |  1 +
 drivers/iommu/amd/Makefile          |  2 +-
 drivers/iommu/amd/amd_iommu_types.h | 35 +++++++++++++++++++
 drivers/iommu/amd/io_pgtable.c      | 69 +++++++++++++++++++++++++++++++++++++
 drivers/iommu/amd/iommu.c           | 10 ------
 drivers/iommu/io-pgtable.c          |  3 ++
 include/linux/io-pgtable.h          |  2 ++
 7 files changed, 111 insertions(+), 11 deletions(-)
 create mode 100644 drivers/iommu/amd/io_pgtable.c

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index 626b97d0dd21..a3cbafb603f5 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -10,6 +10,7 @@ config AMD_IOMMU
 	select IOMMU_API
 	select IOMMU_IOVA
 	select IOMMU_DMA
+	select IOMMU_IO_PGTABLE
 	depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
 	help
 	  With this option you can enable support for AMD IOMMU hardware in
diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index dc5a2fa4fd37..a935f8f4b974 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o
+obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o
 obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
 obj-$(CONFIG_AMD_IOMMU_V2) += iommu_v2.o
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 553587827771..b2365924d898 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/pci.h>
 #include <linux/irqreturn.h>
+#include <linux/io-pgtable.h>
 
 /*
  * Maximum number of IOMMUs supported
@@ -252,6 +253,19 @@
 
 #define GA_GUEST_NR		0x1
 
+#define IOMMU_IN_ADDR_BIT_SIZE  52
+#define IOMMU_OUT_ADDR_BIT_SIZE 52
+
+/*
+ * This bitmap is used to advertise the page sizes our hardware support
+ * to the IOMMU core, which will then use this information to split
+ * physically contiguous memory regions it is mapping into page sizes
+ * that we support.
+ *
+ * 512GB Pages are not supported due to a hardware bug
+ */
+#define AMD_IOMMU_PGSIZES	((~0xFFFUL) & ~(2ULL << 38))
+
 /* Bit value definition for dte irq remapping fields*/
 #define DTE_IRQ_PHYS_ADDR_MASK	(((1ULL << 45)-1) << 6)
 #define DTE_IRQ_REMAP_INTCTL_MASK	(0x3ULL << 60)
@@ -466,6 +480,26 @@ struct amd_irte_ops;
 
 #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED      (1 << 0)
 
+#define io_pgtable_to_data(x) \
+	container_of((x), struct amd_io_pgtable, iop)
+
+#define io_pgtable_ops_to_data(x) \
+	io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
+
+#define io_pgtable_ops_to_domain(x) \
+	container_of(io_pgtable_ops_to_data(x), \
+		     struct protection_domain, iop)
+
+#define io_pgtable_cfg_to_data(x) \
+	container_of((x), struct amd_io_pgtable, pgtbl_cfg)
+
+struct amd_io_pgtable {
+	struct io_pgtable_cfg	pgtbl_cfg;
+	struct io_pgtable	iop;
+	int			mode;
+	u64			*root;
+};
+
 /*
  * This structure contains generic data for  IOMMU protection domains
  * independent of their use.
@@ -474,6 +508,7 @@ struct protection_domain {
 	struct list_head dev_list; /* List of all devices in this domain */
 	struct iommu_domain domain; /* generic domain handle used by
 				       iommu core code */
+	struct amd_io_pgtable iop;
 	spinlock_t lock;	/* mostly used to lock the page table*/
 	u16 id;			/* the domain id written to the device table */
 	atomic64_t pt_root;	/* pgtable root and pgtable mode */
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
new file mode 100644
index 000000000000..cf2cd2d2e43e
--- /dev/null
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU-agnostic AMD IO page table allocator.
+ *
+ * Copyright (C) 2020 Advanced Micro Devices, Inc.
+ * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
+ */
+
+#define pr_fmt(fmt)     "AMD-Vi: " fmt
+#define dev_fmt(fmt)    pr_fmt(fmt)
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/io-pgtable.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/barrier.h>
+
+#include "amd_iommu_types.h"
+#include "amd_iommu.h"
+
+static void v1_tlb_flush_all(void *cookie)
+{
+}
+
+static void v1_tlb_flush_walk(unsigned long iova, size_t size,
+				  size_t granule, void *cookie)
+{
+}
+
+static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
+					 unsigned long iova, size_t granule,
+					 void *cookie)
+{
+}
+
+static const struct iommu_flush_ops v1_flush_ops = {
+	.tlb_flush_all	= v1_tlb_flush_all,
+	.tlb_flush_walk = v1_tlb_flush_walk,
+	.tlb_add_page	= v1_tlb_add_page,
+};
+
+/*
+ * ----------------------------------------------------
+ */
+static void v1_free_pgtable(struct io_pgtable *iop)
+{
+}
+
+static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
+{
+	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
+
+	cfg->pgsize_bitmap  = AMD_IOMMU_PGSIZES,
+	cfg->ias            = IOMMU_IN_ADDR_BIT_SIZE,
+	cfg->oas            = IOMMU_OUT_ADDR_BIT_SIZE,
+	cfg->tlb            = &v1_flush_ops;
+
+	return &pgtable->iop;
+}
+
+struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
+	.alloc	= v1_alloc_pgtable,
+	.free	= v1_free_pgtable,
+};
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 3485f8b60d7a..18d8c97f681e 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -57,16 +57,6 @@
 #define HT_RANGE_START		(0xfd00000000ULL)
 #define HT_RANGE_END		(0xffffffffffULL)
 
-/*
- * This bitmap is used to advertise the page sizes our hardware support
- * to the IOMMU core, which will then use this information to split
- * physically contiguous memory regions it is mapping into page sizes
- * that we support.
- *
- * 512GB Pages are not supported due to a hardware bug
- */
-#define AMD_IOMMU_PGSIZES	((~0xFFFUL) & ~(2ULL << 38))
-
 #define DEFAULT_PGTABLE_LEVEL	PAGE_MODE_3_LEVEL
 
 static DEFINE_SPINLOCK(pd_bitmap_lock);
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 94394c81468f..6e9917ce980f 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -24,6 +24,9 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
 #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
 	[ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
 #endif
+#ifdef CONFIG_AMD_IOMMU
+	[AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
+#endif
 };
 
 struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index ea727eb1a1a9..f58c14218ee1 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -15,6 +15,7 @@ enum io_pgtable_fmt {
 	ARM_64_LPAE_S2,
 	ARM_V7S,
 	ARM_MALI_LPAE,
+	AMD_IOMMU_V1,
 	IO_PGTABLE_NUM_FMTS,
 };
 
@@ -251,5 +252,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns;
 
 #endif /* __IO_PGTABLE_H */
-- 
cgit v1.2.3


From 3d5eab41451f8e28f3e45eef8f6b372bf56612fb Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Wed, 27 Jan 2021 16:29:29 +0000
Subject: iommu/io-pgtable: Remove TLBI_ON_MAP quirk

IO_PGTABLE_QUIRK_TLBI_ON_MAP is now fully superseded by the
core API's iotlb_sync_map callback.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/5abb80bba3a7c371d5ffb7e59c05586deddb9a91.1611764372.git.robin.murphy@arm.com
[will: Remove unused 'iop' local variable from arm_v7s_map()]
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/io-pgtable-arm-v7s.c | 9 +--------
 include/linux/io-pgtable.h         | 5 -----
 2 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index 1d92ac948db7..929cb1518db1 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -519,7 +519,6 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova,
 			phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
 {
 	struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
-	struct io_pgtable *iop = &data->iop;
 	int ret;
 
 	if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias) ||
@@ -535,12 +534,7 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova,
 	 * Synchronise all PTE updates for the new mapping before there's
 	 * a chance for anything to kick off a table walk for the new iova.
 	 */
-	if (iop->cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) {
-		io_pgtable_tlb_flush_walk(iop, iova, size,
-					  ARM_V7S_BLOCK_SIZE(2));
-	} else {
-		wmb();
-	}
+	wmb();
 
 	return ret;
 }
@@ -759,7 +753,6 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 
 	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
 			    IO_PGTABLE_QUIRK_NO_PERMS |
-			    IO_PGTABLE_QUIRK_TLBI_ON_MAP |
 			    IO_PGTABLE_QUIRK_ARM_MTK_EXT |
 			    IO_PGTABLE_QUIRK_NON_STRICT))
 		return NULL;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 2a5686ca2ba3..06753323a15e 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -68,10 +68,6 @@ struct io_pgtable_cfg {
 	 *	hardware which does not implement the permissions of a given
 	 *	format, and/or requires some format-specific default value.
 	 *
-	 * IO_PGTABLE_QUIRK_TLBI_ON_MAP: If the format forbids caching invalid
-	 *	(unmapped) entries but the hardware might do so anyway, perform
-	 *	TLB maintenance when mapping as well as when unmapping.
-	 *
 	 * IO_PGTABLE_QUIRK_ARM_MTK_EXT: (ARM v7s format) MediaTek IOMMUs extend
 	 *	to support up to 34 bits PA where the bit32 and bit33 are
 	 *	encoded in the bit9 and bit4 of the PTE respectively.
@@ -88,7 +84,6 @@ struct io_pgtable_cfg {
 	 */
 	#define IO_PGTABLE_QUIRK_ARM_NS		BIT(0)
 	#define IO_PGTABLE_QUIRK_NO_PERMS	BIT(1)
-	#define IO_PGTABLE_QUIRK_TLBI_ON_MAP	BIT(2)
 	#define IO_PGTABLE_QUIRK_ARM_MTK_EXT	BIT(3)
 	#define IO_PGTABLE_QUIRK_NON_STRICT	BIT(4)
 	#define IO_PGTABLE_QUIRK_ARM_TTBR1	BIT(5)
-- 
cgit v1.2.3


From c52eef0b6ee1a3c57f6fccb30ea0b5ae19358471 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Fri, 22 Jan 2021 19:44:28 -0800
Subject: of/device: Don't NULLify match table in of_match_device() with
 CONFIG_OF=n

This effectively reverts 1db73ae39a97 ("of/device: Nullify match table
in of_match_device() for CONFIG_OF=n") because that commit makes it more
surprising to users of this API that the arguments may never be
referenced by any code. This is because the pre-processor will replace
the argument with NULL and then the match table will be left unreferenced
by any code but the compiler optimizer doesn't know to drop it. This can
lead to compilers warning that match tables are unused, when we really
want to pass the match table to the API but have the compiler see that
it's all inlined and not used and then drop the match table while
silencing the warning. We're being too smart here and not giving the
compiler the chance to do dead code elimination.

Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Frank Rowand <frowand.list@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210123034428.2841052-7-swboyd@chromium.org
---
 include/linux/of_device.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 07ca187fc5e4..937f32f6aecb 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -99,13 +99,11 @@ static inline int of_device_uevent_modalias(struct device *dev,
 
 static inline void of_device_node_put(struct device *dev) { }
 
-static inline const struct of_device_id *__of_match_device(
+static inline const struct of_device_id *of_match_device(
 		const struct of_device_id *matches, const struct device *dev)
 {
 	return NULL;
 }
-#define of_match_device(matches, dev)	\
-	__of_match_device(of_match_ptr(matches), (dev))
 
 static inline struct device_node *of_cpu_device_node_get(int cpu)
 {
-- 
cgit v1.2.3


From 28af22c6c8dff6a16163e5b6a56211d5b535c97b Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 26 Jan 2021 18:39:39 +0100
Subject: net: adjust net_device layout for cacheline usage

The current layout of net_device is not optimal for cacheline usage.

The member adj_list.lower linked list is split between cacheline 2 and 3.
The ifindex is placed together with stats (struct net_device_stats),
although most modern drivers don't update this stats member.

The members netdev_ops, mtu and hard_header_len are placed on three
different cachelines. These members are accessed for XDP redirect into
devmap, which were noticeably with perf tool. When not using the map
redirect variant (like TC-BPF does), then ifindex is also used, which is
placed on a separate fourth cacheline. These members are also accessed
during forwarding with regular network stack. The members priv_flags and
flags are on fast-path for network stack transmit path in __dev_queue_xmit
(currently located together with mtu cacheline).

This patch creates a read mostly cacheline, with the purpose of keeping the
above mentioned members on the same cacheline.

Some netdev_features_t members also becomes part of this cacheline, which is
on purpose, as function netif_skb_features() is on fast-path via
validate_xmit_skb().

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/r/161168277983.410784.12401225493601624417.stgit@firesoul
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 53 ++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9e8572533d8e..e9e7ada07ea1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1858,7 +1858,6 @@ struct net_device {
 	unsigned long		mem_end;
 	unsigned long		mem_start;
 	unsigned long		base_addr;
-	int			irq;
 
 	/*
 	 *	Some hardware also needs these fields (state,dev_list,
@@ -1880,6 +1879,23 @@ struct net_device {
 		struct list_head lower;
 	} adj_list;
 
+	/* Read-mostly cache-line for fast-path access */
+	unsigned int		flags;
+	unsigned int		priv_flags;
+	const struct net_device_ops *netdev_ops;
+	int			ifindex;
+	unsigned short		gflags;
+	unsigned short		hard_header_len;
+
+	/* Note : dev->mtu is often read without holding a lock.
+	 * Writers usually hold RTNL.
+	 * It is recommended to use READ_ONCE() to annotate the reads,
+	 * and to use WRITE_ONCE() to annotate the writes.
+	 */
+	unsigned int		mtu;
+	unsigned short		needed_headroom;
+	unsigned short		needed_tailroom;
+
 	netdev_features_t	features;
 	netdev_features_t	hw_features;
 	netdev_features_t	wanted_features;
@@ -1888,10 +1904,15 @@ struct net_device {
 	netdev_features_t	mpls_features;
 	netdev_features_t	gso_partial_features;
 
-	int			ifindex;
+	unsigned int		min_mtu;
+	unsigned int		max_mtu;
+	unsigned short		type;
+	unsigned char		min_header_len;
+	unsigned char		name_assign_type;
+
 	int			group;
 
-	struct net_device_stats	stats;
+	struct net_device_stats	stats; /* not used by modern drivers */
 
 	atomic_long_t		rx_dropped;
 	atomic_long_t		tx_dropped;
@@ -1905,7 +1926,6 @@ struct net_device {
 	const struct iw_handler_def *wireless_handlers;
 	struct iw_public_data	*wireless_data;
 #endif
-	const struct net_device_ops *netdev_ops;
 	const struct ethtool_ops *ethtool_ops;
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	const struct l3mdev_ops	*l3mdev_ops;
@@ -1924,34 +1944,12 @@ struct net_device {
 
 	const struct header_ops *header_ops;
 
-	unsigned int		flags;
-	unsigned int		priv_flags;
-
-	unsigned short		gflags;
-	unsigned short		padded;
-
 	unsigned char		operstate;
 	unsigned char		link_mode;
 
 	unsigned char		if_port;
 	unsigned char		dma;
 
-	/* Note : dev->mtu is often read without holding a lock.
-	 * Writers usually hold RTNL.
-	 * It is recommended to use READ_ONCE() to annotate the reads,
-	 * and to use WRITE_ONCE() to annotate the writes.
-	 */
-	unsigned int		mtu;
-	unsigned int		min_mtu;
-	unsigned int		max_mtu;
-	unsigned short		type;
-	unsigned short		hard_header_len;
-	unsigned char		min_header_len;
-	unsigned char		name_assign_type;
-
-	unsigned short		needed_headroom;
-	unsigned short		needed_tailroom;
-
 	/* Interface address info. */
 	unsigned char		perm_addr[MAX_ADDR_LEN];
 	unsigned char		addr_assign_type;
@@ -1962,7 +1960,10 @@ struct net_device {
 	unsigned short		neigh_priv_len;
 	unsigned short          dev_id;
 	unsigned short          dev_port;
+	unsigned short		padded;
+
 	spinlock_t		addr_list_lock;
+	int			irq;
 
 	struct netdev_hw_addr_list	uc;
 	struct netdev_hw_addr_list	mc;
-- 
cgit v1.2.3


From f8408264c77a0cebb20244d1f4750501b36abe0e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 14 Jan 2021 17:05:30 +0530
Subject: drivers: Remove CONFIG_OPROFILE support

The "oprofile" user-space tools don't use the kernel OPROFILE support
any more, and haven't in a long time. User-space has been converted to
the perf interfaces.

Remove kernel's old oprofile support.

Suggested-by: Christoph Hellwig <hch@infradead.org>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Robert Richter <rric@kernel.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org> #RCU
Acked-by: William Cohen <wcohen@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/RCU/NMI-RCU.rst                      |   3 +-
 Documentation/admin-guide/kernel-parameters.txt    |  14 -
 Documentation/process/magic-number.rst             |   1 -
 .../translations/it_IT/process/magic-number.rst    |   1 -
 .../translations/zh_CN/process/magic-number.rst    |   1 -
 MAINTAINERS                                        |  11 -
 arch/Kconfig                                       |  32 --
 drivers/oprofile/buffer_sync.c                     | 591 ---------------------
 drivers/oprofile/buffer_sync.h                     |  22 -
 drivers/oprofile/cpu_buffer.c                      | 465 ----------------
 drivers/oprofile/cpu_buffer.h                      | 121 -----
 drivers/oprofile/event_buffer.c                    | 209 --------
 drivers/oprofile/event_buffer.h                    |  40 --
 drivers/oprofile/nmi_timer_int.c                   | 157 ------
 drivers/oprofile/oprof.c                           | 286 ----------
 drivers/oprofile/oprof.h                           |  50 --
 drivers/oprofile/oprofile_files.c                  | 201 -------
 drivers/oprofile/oprofile_perf.c                   | 328 ------------
 drivers/oprofile/oprofile_stats.c                  |  84 ---
 drivers/oprofile/oprofile_stats.h                  |  33 --
 drivers/oprofile/oprofilefs.c                      | 300 -----------
 drivers/oprofile/timer_int.c                       | 122 -----
 include/linux/oprofile.h                           | 209 --------
 init/Kconfig                                       |   2 +-
 24 files changed, 2 insertions(+), 3281 deletions(-)
 delete mode 100644 drivers/oprofile/buffer_sync.c
 delete mode 100644 drivers/oprofile/buffer_sync.h
 delete mode 100644 drivers/oprofile/cpu_buffer.c
 delete mode 100644 drivers/oprofile/cpu_buffer.h
 delete mode 100644 drivers/oprofile/event_buffer.c
 delete mode 100644 drivers/oprofile/event_buffer.h
 delete mode 100644 drivers/oprofile/nmi_timer_int.c
 delete mode 100644 drivers/oprofile/oprof.c
 delete mode 100644 drivers/oprofile/oprof.h
 delete mode 100644 drivers/oprofile/oprofile_files.c
 delete mode 100644 drivers/oprofile/oprofile_perf.c
 delete mode 100644 drivers/oprofile/oprofile_stats.c
 delete mode 100644 drivers/oprofile/oprofile_stats.h
 delete mode 100644 drivers/oprofile/oprofilefs.c
 delete mode 100644 drivers/oprofile/timer_int.c
 delete mode 100644 include/linux/oprofile.h

(limited to 'include/linux')

diff --git a/Documentation/RCU/NMI-RCU.rst b/Documentation/RCU/NMI-RCU.rst
index 180958388ff9..2a92bc685ef1 100644
--- a/Documentation/RCU/NMI-RCU.rst
+++ b/Documentation/RCU/NMI-RCU.rst
@@ -8,8 +8,7 @@ Although RCU is usually used to protect read-mostly data structures,
 it is possible to use RCU to provide dynamic non-maskable interrupt
 handlers, as well as dynamic irq handlers.  This document describes
 how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer
-work in "arch/x86/oprofile/nmi_timer_int.c" and in
-"arch/x86/kernel/traps.c".
+work in "arch/x86/kernel/traps.c".
 
 The relevant pieces of code are listed below, each followed by a
 brief explanation::
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9e3cdb271d06..e860c681766b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3458,20 +3458,6 @@
 			For example, to override I2C bus2:
 			omap_mux=i2c2_scl.i2c2_scl=0x100,i2c2_sda.i2c2_sda=0x100
 
-	oprofile.timer=	[HW]
-			Use timer interrupt instead of performance counters
-
-	oprofile.cpu_type=	Force an oprofile cpu type
-			This might be useful if you have an older oprofile
-			userland or if you want common events.
-			Format: { arch_perfmon }
-			arch_perfmon: [X86] Force use of architectural
-				perfmon on Intel CPUs instead of the
-				CPU specific event set.
-			timer: [X86] Force use of architectural NMI
-				timer mode (see also oprofile.timer
-				for generic hr timer mode)
-
 	oops=panic	Always panic on oopses. Default is to just kill the
 			process, but there is a small probability of
 			deadlocking the machine.
diff --git a/Documentation/process/magic-number.rst b/Documentation/process/magic-number.rst
index e02ff5ffb653..c6dfe060ec2f 100644
--- a/Documentation/process/magic-number.rst
+++ b/Documentation/process/magic-number.rst
@@ -135,7 +135,6 @@ FW_HEADER_MAGIC       0x65726F66       fw_header                ``drivers/atm/fo
 SLOT_MAGIC            0x67267321       slot                     ``drivers/hotplug/cpqphp.h``
 SLOT_MAGIC            0x67267322       slot                     ``drivers/hotplug/acpiphp.h``
 LO_MAGIC              0x68797548       nbd_device               ``include/linux/nbd.h``
-OPROFILE_MAGIC        0x6f70726f       super_block              ``drivers/oprofile/oprofilefs.h``
 M3_STATE_MAGIC        0x734d724d       m3_state                 ``sound/oss/maestro3.c``
 VMALLOC_MAGIC         0x87654320       snd_alloc_track          ``sound/core/memory.c``
 KMALLOC_MAGIC         0x87654321       snd_alloc_track          ``sound/core/memory.c``
diff --git a/Documentation/translations/it_IT/process/magic-number.rst b/Documentation/translations/it_IT/process/magic-number.rst
index 0243d32a0b59..1af30f4228f2 100644
--- a/Documentation/translations/it_IT/process/magic-number.rst
+++ b/Documentation/translations/it_IT/process/magic-number.rst
@@ -141,7 +141,6 @@ FW_HEADER_MAGIC       0x65726F66       fw_header                ``drivers/atm/fo
 SLOT_MAGIC            0x67267321       slot                     ``drivers/hotplug/cpqphp.h``
 SLOT_MAGIC            0x67267322       slot                     ``drivers/hotplug/acpiphp.h``
 LO_MAGIC              0x68797548       nbd_device               ``include/linux/nbd.h``
-OPROFILE_MAGIC        0x6f70726f       super_block              ``drivers/oprofile/oprofilefs.h``
 M3_STATE_MAGIC        0x734d724d       m3_state                 ``sound/oss/maestro3.c``
 VMALLOC_MAGIC         0x87654320       snd_alloc_track          ``sound/core/memory.c``
 KMALLOC_MAGIC         0x87654321       snd_alloc_track          ``sound/core/memory.c``
diff --git a/Documentation/translations/zh_CN/process/magic-number.rst b/Documentation/translations/zh_CN/process/magic-number.rst
index de182bf4191c..7bb9d4165ed3 100644
--- a/Documentation/translations/zh_CN/process/magic-number.rst
+++ b/Documentation/translations/zh_CN/process/magic-number.rst
@@ -124,7 +124,6 @@ FW_HEADER_MAGIC       0x65726F66       fw_header                ``drivers/atm/fo
 SLOT_MAGIC            0x67267321       slot                     ``drivers/hotplug/cpqphp.h``
 SLOT_MAGIC            0x67267322       slot                     ``drivers/hotplug/acpiphp.h``
 LO_MAGIC              0x68797548       nbd_device               ``include/linux/nbd.h``
-OPROFILE_MAGIC        0x6f70726f       super_block              ``drivers/oprofile/oprofilefs.h``
 M3_STATE_MAGIC        0x734d724d       m3_state                 ``sound/oss/maestro3.c``
 VMALLOC_MAGIC         0x87654320       snd_alloc_track          ``sound/core/memory.c``
 KMALLOC_MAGIC         0x87654321       snd_alloc_track          ``sound/core/memory.c``
diff --git a/MAINTAINERS b/MAINTAINERS
index cc1e6a5ee6e6..ae2c2cef9d96 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1414,7 +1414,6 @@ F:	arch/arm*/include/asm/hw_breakpoint.h
 F:	arch/arm*/include/asm/perf_event.h
 F:	arch/arm*/kernel/hw_breakpoint.c
 F:	arch/arm*/kernel/perf_*
-F:	arch/arm/oprofile/common.c
 F:	drivers/perf/
 F:	include/linux/perf/arm_pmu.h
 
@@ -4083,7 +4082,6 @@ W:	http://www.ibm.com/developerworks/power/cell/
 F:	arch/powerpc/include/asm/cell*.h
 F:	arch/powerpc/include/asm/spu*.h
 F:	arch/powerpc/include/uapi/asm/spu*.h
-F:	arch/powerpc/oprofile/*cell*
 F:	arch/powerpc/platforms/cell/
 
 CELLWISE CW2015 BATTERY DRIVER
@@ -13311,15 +13309,6 @@ S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git
 F:	sound/drivers/opl4/
 
-OPROFILE
-M:	Robert Richter <rric@kernel.org>
-L:	oprofile-list@lists.sf.net
-S:	Maintained
-F:	arch/*/include/asm/oprofile*.h
-F:	arch/*/oprofile/
-F:	drivers/oprofile/
-F:	include/linux/oprofile.h
-
 ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
 M:	Mark Fasheh <mark@fasheh.com>
 M:	Joel Becker <jlbec@evilplan.org>
diff --git a/arch/Kconfig b/arch/Kconfig
index 24862d15f3a3..40277027a255 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -33,38 +33,6 @@ config HOTPLUG_SMT
 config GENERIC_ENTRY
        bool
 
-config OPROFILE
-	tristate "OProfile system profiling"
-	depends on PROFILING
-	depends on HAVE_OPROFILE
-	select RING_BUFFER
-	select RING_BUFFER_ALLOW_SWAP
-	help
-	  OProfile is a profiling system capable of profiling the
-	  whole system, include the kernel, kernel modules, libraries,
-	  and applications.
-
-	  If unsure, say N.
-
-config OPROFILE_EVENT_MULTIPLEX
-	bool "OProfile multiplexing support (EXPERIMENTAL)"
-	default n
-	depends on OPROFILE && X86
-	help
-	  The number of hardware counters is limited. The multiplexing
-	  feature enables OProfile to gather more events than counters
-	  are provided by the hardware. This is realized by switching
-	  between events at a user specified time interval.
-
-	  If unsure, say N.
-
-config HAVE_OPROFILE
-	bool
-
-config OPROFILE_NMI_TIMER
-	def_bool y
-	depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !PPC64
-
 config KPROBES
 	bool "Kprobes"
 	depends on MODULES
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
deleted file mode 100644
index cc917865f13a..000000000000
--- a/drivers/oprofile/buffer_sync.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/**
- * @file buffer_sync.c
- *
- * @remark Copyright 2002-2009 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- * @author Barry Kasindorf
- * @author Robert Richter <robert.richter@amd.com>
- *
- * This is the core of the buffer management. Each
- * CPU buffer is processed and entered into the
- * global event buffer. Such processing is necessary
- * in several circumstances, mentioned below.
- *
- * The processing does the job of converting the
- * transitory EIP value into a persistent dentry/offset
- * value that the profiler can record at its leisure.
- *
- * See fs/dcookies.c for a description of the dentry/offset
- * objects.
- */
-
-#include <linux/file.h>
-#include <linux/mm.h>
-#include <linux/workqueue.h>
-#include <linux/notifier.h>
-#include <linux/dcookies.h>
-#include <linux/profile.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/oprofile.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/sched/task.h>
-#include <linux/gfp.h>
-
-#include "oprofile_stats.h"
-#include "event_buffer.h"
-#include "cpu_buffer.h"
-#include "buffer_sync.h"
-
-static LIST_HEAD(dying_tasks);
-static LIST_HEAD(dead_tasks);
-static cpumask_var_t marked_cpus;
-static DEFINE_SPINLOCK(task_mortuary);
-static void process_task_mortuary(void);
-
-/* Take ownership of the task struct and place it on the
- * list for processing. Only after two full buffer syncs
- * does the task eventually get freed, because by then
- * we are sure we will not reference it again.
- * Can be invoked from softirq via RCU callback due to
- * call_rcu() of the task struct, hence the _irqsave.
- */
-static int
-task_free_notify(struct notifier_block *self, unsigned long val, void *data)
-{
-	unsigned long flags;
-	struct task_struct *task = data;
-	spin_lock_irqsave(&task_mortuary, flags);
-	list_add(&task->tasks, &dying_tasks);
-	spin_unlock_irqrestore(&task_mortuary, flags);
-	return NOTIFY_OK;
-}
-
-
-/* The task is on its way out. A sync of the buffer means we can catch
- * any remaining samples for this task.
- */
-static int
-task_exit_notify(struct notifier_block *self, unsigned long val, void *data)
-{
-	/* To avoid latency problems, we only process the current CPU,
-	 * hoping that most samples for the task are on this CPU
-	 */
-	sync_buffer(raw_smp_processor_id());
-	return 0;
-}
-
-
-/* The task is about to try a do_munmap(). We peek at what it's going to
- * do, and if it's an executable region, process the samples first, so
- * we don't lose any. This does not have to be exact, it's a QoI issue
- * only.
- */
-static int
-munmap_notify(struct notifier_block *self, unsigned long val, void *data)
-{
-	unsigned long addr = (unsigned long)data;
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *mpnt;
-
-	mmap_read_lock(mm);
-
-	mpnt = find_vma(mm, addr);
-	if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
-		mmap_read_unlock(mm);
-		/* To avoid latency problems, we only process the current CPU,
-		 * hoping that most samples for the task are on this CPU
-		 */
-		sync_buffer(raw_smp_processor_id());
-		return 0;
-	}
-
-	mmap_read_unlock(mm);
-	return 0;
-}
-
-
-/* We need to be told about new modules so we don't attribute to a previously
- * loaded module, or drop the samples on the floor.
- */
-static int
-module_load_notify(struct notifier_block *self, unsigned long val, void *data)
-{
-#ifdef CONFIG_MODULES
-	if (val != MODULE_STATE_COMING)
-		return NOTIFY_DONE;
-
-	/* FIXME: should we process all CPU buffers ? */
-	mutex_lock(&buffer_mutex);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(MODULE_LOADED_CODE);
-	mutex_unlock(&buffer_mutex);
-#endif
-	return NOTIFY_OK;
-}
-
-
-static struct notifier_block task_free_nb = {
-	.notifier_call	= task_free_notify,
-};
-
-static struct notifier_block task_exit_nb = {
-	.notifier_call	= task_exit_notify,
-};
-
-static struct notifier_block munmap_nb = {
-	.notifier_call	= munmap_notify,
-};
-
-static struct notifier_block module_load_nb = {
-	.notifier_call = module_load_notify,
-};
-
-static void free_all_tasks(void)
-{
-	/* make sure we don't leak task structs */
-	process_task_mortuary();
-	process_task_mortuary();
-}
-
-int sync_start(void)
-{
-	int err;
-
-	if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL))
-		return -ENOMEM;
-
-	err = task_handoff_register(&task_free_nb);
-	if (err)
-		goto out1;
-	err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
-	if (err)
-		goto out2;
-	err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
-	if (err)
-		goto out3;
-	err = register_module_notifier(&module_load_nb);
-	if (err)
-		goto out4;
-
-	start_cpu_work();
-
-out:
-	return err;
-out4:
-	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
-out3:
-	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
-out2:
-	task_handoff_unregister(&task_free_nb);
-	free_all_tasks();
-out1:
-	free_cpumask_var(marked_cpus);
-	goto out;
-}
-
-
-void sync_stop(void)
-{
-	end_cpu_work();
-	unregister_module_notifier(&module_load_nb);
-	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
-	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
-	task_handoff_unregister(&task_free_nb);
-	barrier();			/* do all of the above first */
-
-	flush_cpu_work();
-
-	free_all_tasks();
-	free_cpumask_var(marked_cpus);
-}
-
-
-/* Optimisation. We can manage without taking the dcookie sem
- * because we cannot reach this code without at least one
- * dcookie user still being registered (namely, the reader
- * of the event buffer). */
-static inline unsigned long fast_get_dcookie(const struct path *path)
-{
-	unsigned long cookie;
-
-	if (path->dentry->d_flags & DCACHE_COOKIE)
-		return (unsigned long)path->dentry;
-	get_dcookie(path, &cookie);
-	return cookie;
-}
-
-
-/* Look up the dcookie for the task's mm->exe_file,
- * which corresponds loosely to "application name". This is
- * not strictly necessary but allows oprofile to associate
- * shared-library samples with particular applications
- */
-static unsigned long get_exec_dcookie(struct mm_struct *mm)
-{
-	unsigned long cookie = NO_COOKIE;
-	struct file *exe_file;
-
-	if (!mm)
-		goto done;
-
-	exe_file = get_mm_exe_file(mm);
-	if (!exe_file)
-		goto done;
-
-	cookie = fast_get_dcookie(&exe_file->f_path);
-	fput(exe_file);
-done:
-	return cookie;
-}
-
-
-/* Convert the EIP value of a sample into a persistent dentry/offset
- * pair that can then be added to the global event buffer. We make
- * sure to do this lookup before a mm->mmap modification happens so
- * we don't lose track.
- *
- * The caller must ensure the mm is not nil (ie: not a kernel thread).
- */
-static unsigned long
-lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset)
-{
-	unsigned long cookie = NO_COOKIE;
-	struct vm_area_struct *vma;
-
-	mmap_read_lock(mm);
-	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
-
-		if (addr < vma->vm_start || addr >= vma->vm_end)
-			continue;
-
-		if (vma->vm_file) {
-			cookie = fast_get_dcookie(&vma->vm_file->f_path);
-			*offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
-				vma->vm_start;
-		} else {
-			/* must be an anonymous map */
-			*offset = addr;
-		}
-
-		break;
-	}
-
-	if (!vma)
-		cookie = INVALID_COOKIE;
-	mmap_read_unlock(mm);
-
-	return cookie;
-}
-
-static unsigned long last_cookie = INVALID_COOKIE;
-
-static void add_cpu_switch(int i)
-{
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(CPU_SWITCH_CODE);
-	add_event_entry(i);
-	last_cookie = INVALID_COOKIE;
-}
-
-static void add_kernel_ctx_switch(unsigned int in_kernel)
-{
-	add_event_entry(ESCAPE_CODE);
-	if (in_kernel)
-		add_event_entry(KERNEL_ENTER_SWITCH_CODE);
-	else
-		add_event_entry(KERNEL_EXIT_SWITCH_CODE);
-}
-
-static void
-add_user_ctx_switch(struct task_struct const *task, unsigned long cookie)
-{
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(CTX_SWITCH_CODE);
-	add_event_entry(task->pid);
-	add_event_entry(cookie);
-	/* Another code for daemon back-compat */
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(CTX_TGID_CODE);
-	add_event_entry(task->tgid);
-}
-
-
-static void add_cookie_switch(unsigned long cookie)
-{
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(COOKIE_SWITCH_CODE);
-	add_event_entry(cookie);
-}
-
-
-static void add_trace_begin(void)
-{
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(TRACE_BEGIN_CODE);
-}
-
-static void add_data(struct op_entry *entry, struct mm_struct *mm)
-{
-	unsigned long code, pc, val;
-	unsigned long cookie;
-	off_t offset;
-
-	if (!op_cpu_buffer_get_data(entry, &code))
-		return;
-	if (!op_cpu_buffer_get_data(entry, &pc))
-		return;
-	if (!op_cpu_buffer_get_size(entry))
-		return;
-
-	if (mm) {
-		cookie = lookup_dcookie(mm, pc, &offset);
-
-		if (cookie == NO_COOKIE)
-			offset = pc;
-		if (cookie == INVALID_COOKIE) {
-			atomic_inc(&oprofile_stats.sample_lost_no_mapping);
-			offset = pc;
-		}
-		if (cookie != last_cookie) {
-			add_cookie_switch(cookie);
-			last_cookie = cookie;
-		}
-	} else
-		offset = pc;
-
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(code);
-	add_event_entry(offset);	/* Offset from Dcookie */
-
-	while (op_cpu_buffer_get_data(entry, &val))
-		add_event_entry(val);
-}
-
-static inline void add_sample_entry(unsigned long offset, unsigned long event)
-{
-	add_event_entry(offset);
-	add_event_entry(event);
-}
-
-
-/*
- * Add a sample to the global event buffer. If possible the
- * sample is converted into a persistent dentry/offset pair
- * for later lookup from userspace. Return 0 on failure.
- */
-static int
-add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel)
-{
-	unsigned long cookie;
-	off_t offset;
-
-	if (in_kernel) {
-		add_sample_entry(s->eip, s->event);
-		return 1;
-	}
-
-	/* add userspace sample */
-
-	if (!mm) {
-		atomic_inc(&oprofile_stats.sample_lost_no_mm);
-		return 0;
-	}
-
-	cookie = lookup_dcookie(mm, s->eip, &offset);
-
-	if (cookie == INVALID_COOKIE) {
-		atomic_inc(&oprofile_stats.sample_lost_no_mapping);
-		return 0;
-	}
-
-	if (cookie != last_cookie) {
-		add_cookie_switch(cookie);
-		last_cookie = cookie;
-	}
-
-	add_sample_entry(offset, s->event);
-
-	return 1;
-}
-
-
-static void release_mm(struct mm_struct *mm)
-{
-	if (!mm)
-		return;
-	mmput(mm);
-}
-
-static inline int is_code(unsigned long val)
-{
-	return val == ESCAPE_CODE;
-}
-
-
-/* Move tasks along towards death. Any tasks on dead_tasks
- * will definitely have no remaining references in any
- * CPU buffers at this point, because we use two lists,
- * and to have reached the list, it must have gone through
- * one full sync already.
- */
-static void process_task_mortuary(void)
-{
-	unsigned long flags;
-	LIST_HEAD(local_dead_tasks);
-	struct task_struct *task;
-	struct task_struct *ttask;
-
-	spin_lock_irqsave(&task_mortuary, flags);
-
-	list_splice_init(&dead_tasks, &local_dead_tasks);
-	list_splice_init(&dying_tasks, &dead_tasks);
-
-	spin_unlock_irqrestore(&task_mortuary, flags);
-
-	list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
-		list_del(&task->tasks);
-		free_task(task);
-	}
-}
-
-
-static void mark_done(int cpu)
-{
-	int i;
-
-	cpumask_set_cpu(cpu, marked_cpus);
-
-	for_each_online_cpu(i) {
-		if (!cpumask_test_cpu(i, marked_cpus))
-			return;
-	}
-
-	/* All CPUs have been processed at least once,
-	 * we can process the mortuary once
-	 */
-	process_task_mortuary();
-
-	cpumask_clear(marked_cpus);
-}
-
-
-/* FIXME: this is not sufficient if we implement syscall barrier backtrace
- * traversal, the code switch to sb_sample_start at first kernel enter/exit
- * switch so we need a fifth state and some special handling in sync_buffer()
- */
-typedef enum {
-	sb_bt_ignore = -2,
-	sb_buffer_start,
-	sb_bt_start,
-	sb_sample_start,
-} sync_buffer_state;
-
-/* Sync one of the CPU's buffers into the global event buffer.
- * Here we need to go through each batch of samples punctuated
- * by context switch notes, taking the task's mmap_lock and doing
- * lookup in task->mm->mmap to convert EIP into dcookie/offset
- * value.
- */
-void sync_buffer(int cpu)
-{
-	struct mm_struct *mm = NULL;
-	struct mm_struct *oldmm;
-	unsigned long val;
-	struct task_struct *new;
-	unsigned long cookie = 0;
-	int in_kernel = 1;
-	sync_buffer_state state = sb_buffer_start;
-	unsigned int i;
-	unsigned long available;
-	unsigned long flags;
-	struct op_entry entry;
-	struct op_sample *sample;
-
-	mutex_lock(&buffer_mutex);
-
-	add_cpu_switch(cpu);
-
-	op_cpu_buffer_reset(cpu);
-	available = op_cpu_buffer_entries(cpu);
-
-	for (i = 0; i < available; ++i) {
-		sample = op_cpu_buffer_read_entry(&entry, cpu);
-		if (!sample)
-			break;
-
-		if (is_code(sample->eip)) {
-			flags = sample->event;
-			if (flags & TRACE_BEGIN) {
-				state = sb_bt_start;
-				add_trace_begin();
-			}
-			if (flags & KERNEL_CTX_SWITCH) {
-				/* kernel/userspace switch */
-				in_kernel = flags & IS_KERNEL;
-				if (state == sb_buffer_start)
-					state = sb_sample_start;
-				add_kernel_ctx_switch(flags & IS_KERNEL);
-			}
-			if (flags & USER_CTX_SWITCH
-			    && op_cpu_buffer_get_data(&entry, &val)) {
-				/* userspace context switch */
-				new = (struct task_struct *)val;
-				oldmm = mm;
-				release_mm(oldmm);
-				mm = get_task_mm(new);
-				if (mm != oldmm)
-					cookie = get_exec_dcookie(mm);
-				add_user_ctx_switch(new, cookie);
-			}
-			if (op_cpu_buffer_get_size(&entry))
-				add_data(&entry, mm);
-			continue;
-		}
-
-		if (state < sb_bt_start)
-			/* ignore sample */
-			continue;
-
-		if (add_sample(mm, sample, in_kernel))
-			continue;
-
-		/* ignore backtraces if failed to add a sample */
-		if (state == sb_bt_start) {
-			state = sb_bt_ignore;
-			atomic_inc(&oprofile_stats.bt_lost_no_mapping);
-		}
-	}
-	release_mm(mm);
-
-	mark_done(cpu);
-
-	mutex_unlock(&buffer_mutex);
-}
-
-/* The function can be used to add a buffer worth of data directly to
- * the kernel buffer. The buffer is assumed to be a circular buffer.
- * Take the entries from index start and end at index end, wrapping
- * at max_entries.
- */
-void oprofile_put_buff(unsigned long *buf, unsigned int start,
-		       unsigned int stop, unsigned int max)
-{
-	int i;
-
-	i = start;
-
-	mutex_lock(&buffer_mutex);
-	while (i != stop) {
-		add_event_entry(buf[i++]);
-
-		if (i >= max)
-			i = 0;
-	}
-
-	mutex_unlock(&buffer_mutex);
-}
-
diff --git a/drivers/oprofile/buffer_sync.h b/drivers/oprofile/buffer_sync.h
deleted file mode 100644
index 3110732c1835..000000000000
--- a/drivers/oprofile/buffer_sync.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * @file buffer_sync.h
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- */
-
-#ifndef OPROFILE_BUFFER_SYNC_H
-#define OPROFILE_BUFFER_SYNC_H
-
-/* add the necessary profiling hooks */
-int sync_start(void);
-
-/* remove the hooks */
-void sync_stop(void);
-
-/* sync the given CPU's buffer */
-void sync_buffer(int cpu);
-
-#endif /* OPROFILE_BUFFER_SYNC_H */
diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
deleted file mode 100644
index 9210a95cb4e6..000000000000
--- a/drivers/oprofile/cpu_buffer.c
+++ /dev/null
@@ -1,465 +0,0 @@
-/**
- * @file cpu_buffer.c
- *
- * @remark Copyright 2002-2009 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- * @author Barry Kasindorf <barry.kasindorf@amd.com>
- * @author Robert Richter <robert.richter@amd.com>
- *
- * Each CPU has a local buffer that stores PC value/event
- * pairs. We also log context switches when we notice them.
- * Eventually each CPU's buffer is processed into the global
- * event buffer by sync_buffer().
- *
- * We use a local buffer for two reasons: an NMI or similar
- * interrupt cannot synchronise, and high sampling rates
- * would lead to catastrophic global synchronisation if
- * a global buffer was used.
- */
-
-#include <linux/sched.h>
-#include <linux/oprofile.h>
-#include <linux/errno.h>
-
-#include <asm/ptrace.h>
-
-#include "event_buffer.h"
-#include "cpu_buffer.h"
-#include "buffer_sync.h"
-#include "oprof.h"
-
-#define OP_BUFFER_FLAGS	0
-
-static struct trace_buffer *op_ring_buffer;
-DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);
-
-static void wq_sync_buffer(struct work_struct *work);
-
-#define DEFAULT_TIMER_EXPIRE (HZ / 10)
-static int work_enabled;
-
-unsigned long oprofile_get_cpu_buffer_size(void)
-{
-	return oprofile_cpu_buffer_size;
-}
-
-void oprofile_cpu_buffer_inc_smpl_lost(void)
-{
-	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
-
-	cpu_buf->sample_lost_overflow++;
-}
-
-void free_cpu_buffers(void)
-{
-	if (op_ring_buffer)
-		ring_buffer_free(op_ring_buffer);
-	op_ring_buffer = NULL;
-}
-
-#define RB_EVENT_HDR_SIZE 4
-
-int alloc_cpu_buffers(void)
-{
-	int i;
-
-	unsigned long buffer_size = oprofile_cpu_buffer_size;
-	unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
-						 RB_EVENT_HDR_SIZE);
-
-	op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
-	if (!op_ring_buffer)
-		goto fail;
-
-	for_each_possible_cpu(i) {
-		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);
-
-		b->last_task = NULL;
-		b->last_is_kernel = -1;
-		b->tracing = 0;
-		b->buffer_size = buffer_size;
-		b->sample_received = 0;
-		b->sample_lost_overflow = 0;
-		b->backtrace_aborted = 0;
-		b->sample_invalid_eip = 0;
-		b->cpu = i;
-		INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
-	}
-	return 0;
-
-fail:
-	free_cpu_buffers();
-	return -ENOMEM;
-}
-
-void start_cpu_work(void)
-{
-	int i;
-
-	work_enabled = 1;
-
-	for_each_online_cpu(i) {
-		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);
-
-		/*
-		 * Spread the work by 1 jiffy per cpu so they dont all
-		 * fire at once.
-		 */
-		schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
-	}
-}
-
-void end_cpu_work(void)
-{
-	work_enabled = 0;
-}
-
-void flush_cpu_work(void)
-{
-	int i;
-
-	for_each_online_cpu(i) {
-		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);
-
-		/* these works are per-cpu, no need for flush_sync */
-		flush_delayed_work(&b->work);
-	}
-}
-
-/*
- * This function prepares the cpu buffer to write a sample.
- *
- * Struct op_entry is used during operations on the ring buffer while
- * struct op_sample contains the data that is stored in the ring
- * buffer. Struct entry can be uninitialized. The function reserves a
- * data array that is specified by size. Use
- * op_cpu_buffer_write_commit() after preparing the sample. In case of
- * errors a null pointer is returned, otherwise the pointer to the
- * sample.
- *
- */
-struct op_sample
-*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
-{
-	entry->event = ring_buffer_lock_reserve
-		(op_ring_buffer, sizeof(struct op_sample) +
-		 size * sizeof(entry->sample->data[0]));
-	if (!entry->event)
-		return NULL;
-	entry->sample = ring_buffer_event_data(entry->event);
-	entry->size = size;
-	entry->data = entry->sample->data;
-
-	return entry->sample;
-}
-
-int op_cpu_buffer_write_commit(struct op_entry *entry)
-{
-	return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
-}
-
-struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
-{
-	struct ring_buffer_event *e;
-	e = ring_buffer_consume(op_ring_buffer, cpu, NULL, NULL);
-	if (!e)
-		return NULL;
-
-	entry->event = e;
-	entry->sample = ring_buffer_event_data(e);
-	entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
-		/ sizeof(entry->sample->data[0]);
-	entry->data = entry->sample->data;
-	return entry->sample;
-}
-
-unsigned long op_cpu_buffer_entries(int cpu)
-{
-	return ring_buffer_entries_cpu(op_ring_buffer, cpu);
-}
-
-static int
-op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
-	    int is_kernel, struct task_struct *task)
-{
-	struct op_entry entry;
-	struct op_sample *sample;
-	unsigned long flags;
-	int size;
-
-	flags = 0;
-
-	if (backtrace)
-		flags |= TRACE_BEGIN;
-
-	/* notice a switch from user->kernel or vice versa */
-	is_kernel = !!is_kernel;
-	if (cpu_buf->last_is_kernel != is_kernel) {
-		cpu_buf->last_is_kernel = is_kernel;
-		flags |= KERNEL_CTX_SWITCH;
-		if (is_kernel)
-			flags |= IS_KERNEL;
-	}
-
-	/* notice a task switch */
-	if (cpu_buf->last_task != task) {
-		cpu_buf->last_task = task;
-		flags |= USER_CTX_SWITCH;
-	}
-
-	if (!flags)
-		/* nothing to do */
-		return 0;
-
-	if (flags & USER_CTX_SWITCH)
-		size = 1;
-	else
-		size = 0;
-
-	sample = op_cpu_buffer_write_reserve(&entry, size);
-	if (!sample)
-		return -ENOMEM;
-
-	sample->eip = ESCAPE_CODE;
-	sample->event = flags;
-
-	if (size)
-		op_cpu_buffer_add_data(&entry, (unsigned long)task);
-
-	op_cpu_buffer_write_commit(&entry);
-
-	return 0;
-}
-
-static inline int
-op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
-	      unsigned long pc, unsigned long event)
-{
-	struct op_entry entry;
-	struct op_sample *sample;
-
-	sample = op_cpu_buffer_write_reserve(&entry, 0);
-	if (!sample)
-		return -ENOMEM;
-
-	sample->eip = pc;
-	sample->event = event;
-
-	return op_cpu_buffer_write_commit(&entry);
-}
-
-/*
- * This must be safe from any context.
- *
- * is_kernel is needed because on some architectures you cannot
- * tell if you are in kernel or user space simply by looking at
- * pc. We tag this in the buffer by generating kernel enter/exit
- * events whenever is_kernel changes
- */
-static int
-log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
-	   unsigned long backtrace, int is_kernel, unsigned long event,
-	   struct task_struct *task)
-{
-	struct task_struct *tsk = task ? task : current;
-	cpu_buf->sample_received++;
-
-	if (pc == ESCAPE_CODE) {
-		cpu_buf->sample_invalid_eip++;
-		return 0;
-	}
-
-	if (op_add_code(cpu_buf, backtrace, is_kernel, tsk))
-		goto fail;
-
-	if (op_add_sample(cpu_buf, pc, event))
-		goto fail;
-
-	return 1;
-
-fail:
-	cpu_buf->sample_lost_overflow++;
-	return 0;
-}
-
-static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
-{
-	cpu_buf->tracing = 1;
-}
-
-static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
-{
-	cpu_buf->tracing = 0;
-}
-
-static inline void
-__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
-			  unsigned long event, int is_kernel,
-			  struct task_struct *task)
-{
-	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
-	unsigned long backtrace = oprofile_backtrace_depth;
-
-	/*
-	 * if log_sample() fail we can't backtrace since we lost the
-	 * source of this event
-	 */
-	if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event, task))
-		/* failed */
-		return;
-
-	if (!backtrace)
-		return;
-
-	oprofile_begin_trace(cpu_buf);
-	oprofile_ops.backtrace(regs, backtrace);
-	oprofile_end_trace(cpu_buf);
-}
-
-void oprofile_add_ext_hw_sample(unsigned long pc, struct pt_regs * const regs,
-				unsigned long event, int is_kernel,
-				struct task_struct *task)
-{
-	__oprofile_add_ext_sample(pc, regs, event, is_kernel, task);
-}
-
-void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
-			     unsigned long event, int is_kernel)
-{
-	__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
-}
-
-void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
-{
-	int is_kernel;
-	unsigned long pc;
-
-	if (likely(regs)) {
-		is_kernel = !user_mode(regs);
-		pc = profile_pc(regs);
-	} else {
-		is_kernel = 0;    /* This value will not be used */
-		pc = ESCAPE_CODE; /* as this causes an early return. */
-	}
-
-	__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
-}
-
-/*
- * Add samples with data to the ring buffer.
- *
- * Use oprofile_add_data(&entry, val) to add data and
- * oprofile_write_commit(&entry) to commit the sample.
- */
-void
-oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
-		       unsigned long pc, int code, int size)
-{
-	struct op_sample *sample;
-	int is_kernel = !user_mode(regs);
-	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
-
-	cpu_buf->sample_received++;
-
-	/* no backtraces for samples with data */
-	if (op_add_code(cpu_buf, 0, is_kernel, current))
-		goto fail;
-
-	sample = op_cpu_buffer_write_reserve(entry, size + 2);
-	if (!sample)
-		goto fail;
-	sample->eip = ESCAPE_CODE;
-	sample->event = 0;		/* no flags */
-
-	op_cpu_buffer_add_data(entry, code);
-	op_cpu_buffer_add_data(entry, pc);
-
-	return;
-
-fail:
-	entry->event = NULL;
-	cpu_buf->sample_lost_overflow++;
-}
-
-int oprofile_add_data(struct op_entry *entry, unsigned long val)
-{
-	if (!entry->event)
-		return 0;
-	return op_cpu_buffer_add_data(entry, val);
-}
-
-int oprofile_add_data64(struct op_entry *entry, u64 val)
-{
-	if (!entry->event)
-		return 0;
-	if (op_cpu_buffer_get_size(entry) < 2)
-		/*
-		 * the function returns 0 to indicate a too small
-		 * buffer, even if there is some space left
-		 */
-		return 0;
-	if (!op_cpu_buffer_add_data(entry, (u32)val))
-		return 0;
-	return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
-}
-
-int oprofile_write_commit(struct op_entry *entry)
-{
-	if (!entry->event)
-		return -EINVAL;
-	return op_cpu_buffer_write_commit(entry);
-}
-
-void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
-{
-	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
-	log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
-}
-
-void oprofile_add_trace(unsigned long pc)
-{
-	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
-
-	if (!cpu_buf->tracing)
-		return;
-
-	/*
-	 * broken frame can give an eip with the same value as an
-	 * escape code, abort the trace if we get it
-	 */
-	if (pc == ESCAPE_CODE)
-		goto fail;
-
-	if (op_add_sample(cpu_buf, pc, 0))
-		goto fail;
-
-	return;
-fail:
-	cpu_buf->tracing = 0;
-	cpu_buf->backtrace_aborted++;
-	return;
-}
-
-/*
- * This serves to avoid cpu buffer overflow, and makes sure
- * the task mortuary progresses
- *
- * By using schedule_delayed_work_on and then schedule_delayed_work
- * we guarantee this will stay on the correct cpu
- */
-static void wq_sync_buffer(struct work_struct *work)
-{
-	struct oprofile_cpu_buffer *b =
-		container_of(work, struct oprofile_cpu_buffer, work.work);
-	if (b->cpu != smp_processor_id() && !cpu_online(b->cpu)) {
-		cancel_delayed_work(&b->work);
-		return;
-	}
-	sync_buffer(b->cpu);
-
-	/* don't re-add the work if we're shutting down */
-	if (work_enabled)
-		schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
-}
diff --git a/drivers/oprofile/cpu_buffer.h b/drivers/oprofile/cpu_buffer.h
deleted file mode 100644
index 31478c0cff87..000000000000
--- a/drivers/oprofile/cpu_buffer.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/**
- * @file cpu_buffer.h
- *
- * @remark Copyright 2002-2009 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- * @author Robert Richter <robert.richter@amd.com>
- */
-
-#ifndef OPROFILE_CPU_BUFFER_H
-#define OPROFILE_CPU_BUFFER_H
-
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/workqueue.h>
-#include <linux/cache.h>
-#include <linux/sched.h>
-#include <linux/ring_buffer.h>
-
-struct task_struct;
-
-int alloc_cpu_buffers(void);
-void free_cpu_buffers(void);
-
-void start_cpu_work(void);
-void end_cpu_work(void);
-void flush_cpu_work(void);
-
-/* CPU buffer is composed of such entries (which are
- * also used for context switch notes)
- */
-struct op_sample {
-	unsigned long eip;
-	unsigned long event;
-	unsigned long data[];
-};
-
-struct op_entry;
-
-struct oprofile_cpu_buffer {
-	unsigned long buffer_size;
-	struct task_struct *last_task;
-	int last_is_kernel;
-	int tracing;
-	unsigned long sample_received;
-	unsigned long sample_lost_overflow;
-	unsigned long backtrace_aborted;
-	unsigned long sample_invalid_eip;
-	int cpu;
-	struct delayed_work work;
-};
-
-DECLARE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);
-
-/*
- * Resets the cpu buffer to a sane state.
- *
- * reset these to invalid values; the next sample collected will
- * populate the buffer with proper values to initialize the buffer
- */
-static inline void op_cpu_buffer_reset(int cpu)
-{
-	struct oprofile_cpu_buffer *cpu_buf = &per_cpu(op_cpu_buffer, cpu);
-
-	cpu_buf->last_is_kernel = -1;
-	cpu_buf->last_task = NULL;
-}
-
-/*
- * op_cpu_buffer_add_data() and op_cpu_buffer_write_commit() may be
- * called only if op_cpu_buffer_write_reserve() did not return NULL or
- * entry->event != NULL, otherwise entry->size or entry->event will be
- * used uninitialized.
- */
-
-struct op_sample
-*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size);
-int op_cpu_buffer_write_commit(struct op_entry *entry);
-struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu);
-unsigned long op_cpu_buffer_entries(int cpu);
-
-/* returns the remaining free size of data in the entry */
-static inline
-int op_cpu_buffer_add_data(struct op_entry *entry, unsigned long val)
-{
-	if (!entry->size)
-		return 0;
-	*entry->data = val;
-	entry->size--;
-	entry->data++;
-	return entry->size;
-}
-
-/* returns the size of data in the entry */
-static inline
-int op_cpu_buffer_get_size(struct op_entry *entry)
-{
-	return entry->size;
-}
-
-/* returns 0 if empty or the size of data including the current value */
-static inline
-int op_cpu_buffer_get_data(struct op_entry *entry, unsigned long *val)
-{
-	int size = entry->size;
-	if (!size)
-		return 0;
-	*val = *entry->data;
-	entry->size--;
-	entry->data++;
-	return size;
-}
-
-/* extra data flags */
-#define KERNEL_CTX_SWITCH	(1UL << 0)
-#define IS_KERNEL		(1UL << 1)
-#define TRACE_BEGIN		(1UL << 2)
-#define USER_CTX_SWITCH		(1UL << 3)
-
-#endif /* OPROFILE_CPU_BUFFER_H */
diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c
deleted file mode 100644
index 6c9edc8bbc95..000000000000
--- a/drivers/oprofile/event_buffer.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/**
- * @file event_buffer.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- *
- * This is the global event buffer that the user-space
- * daemon reads from. The event buffer is an untyped array
- * of unsigned longs. Entries are prefixed by the
- * escape value ESCAPE_CODE followed by an identifying code.
- */
-
-#include <linux/vmalloc.h>
-#include <linux/oprofile.h>
-#include <linux/sched/signal.h>
-#include <linux/capability.h>
-#include <linux/dcookies.h>
-#include <linux/fs.h>
-#include <linux/uaccess.h>
-
-#include "oprof.h"
-#include "event_buffer.h"
-#include "oprofile_stats.h"
-
-DEFINE_MUTEX(buffer_mutex);
-
-static unsigned long buffer_opened;
-static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
-static unsigned long *event_buffer;
-static unsigned long buffer_size;
-static unsigned long buffer_watershed;
-static size_t buffer_pos;
-/* atomic_t because wait_event checks it outside of buffer_mutex */
-static atomic_t buffer_ready = ATOMIC_INIT(0);
-
-/*
- * Add an entry to the event buffer. When we get near to the end we
- * wake up the process sleeping on the read() of the file. To protect
- * the event_buffer this function may only be called when buffer_mutex
- * is set.
- */
-void add_event_entry(unsigned long value)
-{
-	/*
-	 * This shouldn't happen since all workqueues or handlers are
-	 * canceled or flushed before the event buffer is freed.
-	 */
-	if (!event_buffer) {
-		WARN_ON_ONCE(1);
-		return;
-	}
-
-	if (buffer_pos == buffer_size) {
-		atomic_inc(&oprofile_stats.event_lost_overflow);
-		return;
-	}
-
-	event_buffer[buffer_pos] = value;
-	if (++buffer_pos == buffer_size - buffer_watershed) {
-		atomic_set(&buffer_ready, 1);
-		wake_up(&buffer_wait);
-	}
-}
-
-
-/* Wake up the waiting process if any. This happens
- * on "echo 0 >/dev/oprofile/enable" so the daemon
- * processes the data remaining in the event buffer.
- */
-void wake_up_buffer_waiter(void)
-{
-	mutex_lock(&buffer_mutex);
-	atomic_set(&buffer_ready, 1);
-	wake_up(&buffer_wait);
-	mutex_unlock(&buffer_mutex);
-}
-
-
-int alloc_event_buffer(void)
-{
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&oprofilefs_lock, flags);
-	buffer_size = oprofile_buffer_size;
-	buffer_watershed = oprofile_buffer_watershed;
-	raw_spin_unlock_irqrestore(&oprofilefs_lock, flags);
-
-	if (buffer_watershed >= buffer_size)
-		return -EINVAL;
-
-	buffer_pos = 0;
-	event_buffer = vmalloc(array_size(buffer_size, sizeof(unsigned long)));
-	if (!event_buffer)
-		return -ENOMEM;
-
-	return 0;
-}
-
-
-void free_event_buffer(void)
-{
-	mutex_lock(&buffer_mutex);
-	vfree(event_buffer);
-	buffer_pos = 0;
-	event_buffer = NULL;
-	mutex_unlock(&buffer_mutex);
-}
-
-
-static int event_buffer_open(struct inode *inode, struct file *file)
-{
-	int err = -EPERM;
-
-	if (!perfmon_capable())
-		return -EPERM;
-
-	if (test_and_set_bit_lock(0, &buffer_opened))
-		return -EBUSY;
-
-	/* Register as a user of dcookies
-	 * to ensure they persist for the lifetime of
-	 * the open event file
-	 */
-	err = -EINVAL;
-	file->private_data = dcookie_register();
-	if (!file->private_data)
-		goto out;
-
-	if ((err = oprofile_setup()))
-		goto fail;
-
-	/* NB: the actual start happens from userspace
-	 * echo 1 >/dev/oprofile/enable
-	 */
-
-	return nonseekable_open(inode, file);
-
-fail:
-	dcookie_unregister(file->private_data);
-out:
-	__clear_bit_unlock(0, &buffer_opened);
-	return err;
-}
-
-
-static int event_buffer_release(struct inode *inode, struct file *file)
-{
-	oprofile_stop();
-	oprofile_shutdown();
-	dcookie_unregister(file->private_data);
-	buffer_pos = 0;
-	atomic_set(&buffer_ready, 0);
-	__clear_bit_unlock(0, &buffer_opened);
-	return 0;
-}
-
-
-static ssize_t event_buffer_read(struct file *file, char __user *buf,
-				 size_t count, loff_t *offset)
-{
-	int retval = -EINVAL;
-	size_t const max = buffer_size * sizeof(unsigned long);
-
-	/* handling partial reads is more trouble than it's worth */
-	if (count != max || *offset)
-		return -EINVAL;
-
-	wait_event_interruptible(buffer_wait, atomic_read(&buffer_ready));
-
-	if (signal_pending(current))
-		return -EINTR;
-
-	/* can't currently happen */
-	if (!atomic_read(&buffer_ready))
-		return -EAGAIN;
-
-	mutex_lock(&buffer_mutex);
-
-	/* May happen if the buffer is freed during pending reads. */
-	if (!event_buffer) {
-		retval = -EINTR;
-		goto out;
-	}
-
-	atomic_set(&buffer_ready, 0);
-
-	retval = -EFAULT;
-
-	count = buffer_pos * sizeof(unsigned long);
-
-	if (copy_to_user(buf, event_buffer, count))
-		goto out;
-
-	retval = count;
-	buffer_pos = 0;
-
-out:
-	mutex_unlock(&buffer_mutex);
-	return retval;
-}
-
-const struct file_operations event_buffer_fops = {
-	.open		= event_buffer_open,
-	.release	= event_buffer_release,
-	.read		= event_buffer_read,
-	.llseek		= no_llseek,
-};
diff --git a/drivers/oprofile/event_buffer.h b/drivers/oprofile/event_buffer.h
deleted file mode 100644
index a8d5bb3cba89..000000000000
--- a/drivers/oprofile/event_buffer.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/**
- * @file event_buffer.h
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- */
-
-#ifndef EVENT_BUFFER_H
-#define EVENT_BUFFER_H
-
-#include <linux/types.h>
-#include <linux/mutex.h>
-
-int alloc_event_buffer(void);
-
-void free_event_buffer(void);
-
-/**
- * Add data to the event buffer.
- * The data passed is free-form, but typically consists of
- * file offsets, dcookies, context information, and ESCAPE codes.
- */
-void add_event_entry(unsigned long data);
-
-/* wake up the process sleeping on the event file */
-void wake_up_buffer_waiter(void);
-
-#define INVALID_COOKIE ~0UL
-#define NO_COOKIE 0UL
-
-extern const struct file_operations event_buffer_fops;
-
-/* mutex between sync_cpu_buffers() and the
- * file reading code.
- */
-extern struct mutex buffer_mutex;
-
-#endif /* EVENT_BUFFER_H */
diff --git a/drivers/oprofile/nmi_timer_int.c b/drivers/oprofile/nmi_timer_int.c
deleted file mode 100644
index f343bd96609a..000000000000
--- a/drivers/oprofile/nmi_timer_int.c
+++ /dev/null
@@ -1,157 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/**
- * @file nmi_timer_int.c
- *
- * @remark Copyright 2011 Advanced Micro Devices, Inc.
- *
- * @author Robert Richter <robert.richter@amd.com>
- */
-
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/oprofile.h>
-#include <linux/perf_event.h>
-
-#ifdef CONFIG_OPROFILE_NMI_TIMER
-
-static DEFINE_PER_CPU(struct perf_event *, nmi_timer_events);
-static int ctr_running;
-
-static struct perf_event_attr nmi_timer_attr = {
-	.type           = PERF_TYPE_HARDWARE,
-	.config         = PERF_COUNT_HW_CPU_CYCLES,
-	.size           = sizeof(struct perf_event_attr),
-	.pinned         = 1,
-	.disabled       = 1,
-};
-
-static void nmi_timer_callback(struct perf_event *event,
-			       struct perf_sample_data *data,
-			       struct pt_regs *regs)
-{
-	event->hw.interrupts = 0;       /* don't throttle interrupts */
-	oprofile_add_sample(regs, 0);
-}
-
-static int nmi_timer_start_cpu(int cpu)
-{
-	struct perf_event *event = per_cpu(nmi_timer_events, cpu);
-
-	if (!event) {
-		event = perf_event_create_kernel_counter(&nmi_timer_attr, cpu, NULL,
-							 nmi_timer_callback, NULL);
-		if (IS_ERR(event))
-			return PTR_ERR(event);
-		per_cpu(nmi_timer_events, cpu) = event;
-	}
-
-	if (event && ctr_running)
-		perf_event_enable(event);
-
-	return 0;
-}
-
-static void nmi_timer_stop_cpu(int cpu)
-{
-	struct perf_event *event = per_cpu(nmi_timer_events, cpu);
-
-	if (event && ctr_running)
-		perf_event_disable(event);
-}
-
-static int nmi_timer_cpu_online(unsigned int cpu)
-{
-	nmi_timer_start_cpu(cpu);
-	return 0;
-}
-static int nmi_timer_cpu_predown(unsigned int cpu)
-{
-	nmi_timer_stop_cpu(cpu);
-	return 0;
-}
-
-static int nmi_timer_start(void)
-{
-	int cpu;
-
-	get_online_cpus();
-	ctr_running = 1;
-	for_each_online_cpu(cpu)
-		nmi_timer_start_cpu(cpu);
-	put_online_cpus();
-
-	return 0;
-}
-
-static void nmi_timer_stop(void)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		nmi_timer_stop_cpu(cpu);
-	ctr_running = 0;
-	put_online_cpus();
-}
-
-static enum cpuhp_state hp_online;
-
-static void nmi_timer_shutdown(void)
-{
-	struct perf_event *event;
-	int cpu;
-
-	cpuhp_remove_state(hp_online);
-	for_each_possible_cpu(cpu) {
-		event = per_cpu(nmi_timer_events, cpu);
-		if (!event)
-			continue;
-		perf_event_disable(event);
-		per_cpu(nmi_timer_events, cpu) = NULL;
-		perf_event_release_kernel(event);
-	}
-}
-
-static int nmi_timer_setup(void)
-{
-	int err;
-	u64 period;
-
-	/* clock cycles per tick: */
-	period = (u64)cpu_khz * 1000;
-	do_div(period, HZ);
-	nmi_timer_attr.sample_period = period;
-
-	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "oprofile/nmi:online",
-				nmi_timer_cpu_online, nmi_timer_cpu_predown);
-	if (err < 0) {
-		nmi_timer_shutdown();
-		return err;
-	}
-	hp_online = err;
-	return 0;
-}
-
-int __init op_nmi_timer_init(struct oprofile_operations *ops)
-{
-	int err = 0;
-
-	err = nmi_timer_setup();
-	if (err)
-		return err;
-	nmi_timer_shutdown();		/* only check, don't alloc */
-
-	ops->create_files	= NULL;
-	ops->setup		= nmi_timer_setup;
-	ops->shutdown		= nmi_timer_shutdown;
-	ops->start		= nmi_timer_start;
-	ops->stop		= nmi_timer_stop;
-	ops->cpu_type		= "timer";
-
-	printk(KERN_INFO "oprofile: using NMI timer interrupt.\n");
-
-	return 0;
-}
-
-#endif
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
deleted file mode 100644
index ed2c3ec07024..000000000000
--- a/drivers/oprofile/oprof.c
+++ /dev/null
@@ -1,286 +0,0 @@
-/**
- * @file oprof.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/oprofile.h>
-#include <linux/moduleparam.h>
-#include <linux/workqueue.h>
-#include <linux/time.h>
-#include <linux/mutex.h>
-
-#include "oprof.h"
-#include "event_buffer.h"
-#include "cpu_buffer.h"
-#include "buffer_sync.h"
-#include "oprofile_stats.h"
-
-struct oprofile_operations oprofile_ops;
-
-unsigned long oprofile_started;
-unsigned long oprofile_backtrace_depth;
-static unsigned long is_setup;
-static DEFINE_MUTEX(start_mutex);
-
-/* timer
-   0 - use performance monitoring hardware if available
-   1 - use the timer int mechanism regardless
- */
-static int timer = 0;
-
-int oprofile_setup(void)
-{
-	int err;
-
-	mutex_lock(&start_mutex);
-
-	if ((err = alloc_cpu_buffers()))
-		goto out;
-
-	if ((err = alloc_event_buffer()))
-		goto out1;
-
-	if (oprofile_ops.setup && (err = oprofile_ops.setup()))
-		goto out2;
-
-	/* Note even though this starts part of the
-	 * profiling overhead, it's necessary to prevent
-	 * us missing task deaths and eventually oopsing
-	 * when trying to process the event buffer.
-	 */
-	if (oprofile_ops.sync_start) {
-		int sync_ret = oprofile_ops.sync_start();
-		switch (sync_ret) {
-		case 0:
-			goto post_sync;
-		case 1:
-			goto do_generic;
-		case -1:
-			goto out3;
-		default:
-			goto out3;
-		}
-	}
-do_generic:
-	if ((err = sync_start()))
-		goto out3;
-
-post_sync:
-	is_setup = 1;
-	mutex_unlock(&start_mutex);
-	return 0;
-
-out3:
-	if (oprofile_ops.shutdown)
-		oprofile_ops.shutdown();
-out2:
-	free_event_buffer();
-out1:
-	free_cpu_buffers();
-out:
-	mutex_unlock(&start_mutex);
-	return err;
-}
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static void switch_worker(struct work_struct *work);
-static DECLARE_DELAYED_WORK(switch_work, switch_worker);
-
-static void start_switch_worker(void)
-{
-	if (oprofile_ops.switch_events)
-		schedule_delayed_work(&switch_work, oprofile_time_slice);
-}
-
-static void stop_switch_worker(void)
-{
-	cancel_delayed_work_sync(&switch_work);
-}
-
-static void switch_worker(struct work_struct *work)
-{
-	if (oprofile_ops.switch_events())
-		return;
-
-	atomic_inc(&oprofile_stats.multiplex_counter);
-	start_switch_worker();
-}
-
-/* User inputs in ms, converts to jiffies */
-int oprofile_set_timeout(unsigned long val_msec)
-{
-	int err = 0;
-	unsigned long time_slice;
-
-	mutex_lock(&start_mutex);
-
-	if (oprofile_started) {
-		err = -EBUSY;
-		goto out;
-	}
-
-	if (!oprofile_ops.switch_events) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	time_slice = msecs_to_jiffies(val_msec);
-	if (time_slice == MAX_JIFFY_OFFSET) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	oprofile_time_slice = time_slice;
-
-out:
-	mutex_unlock(&start_mutex);
-	return err;
-
-}
-
-#else
-
-static inline void start_switch_worker(void) { }
-static inline void stop_switch_worker(void) { }
-
-#endif
-
-/* Actually start profiling (echo 1>/dev/oprofile/enable) */
-int oprofile_start(void)
-{
-	int err = -EINVAL;
-
-	mutex_lock(&start_mutex);
-
-	if (!is_setup)
-		goto out;
-
-	err = 0;
-
-	if (oprofile_started)
-		goto out;
-
-	oprofile_reset_stats();
-
-	if ((err = oprofile_ops.start()))
-		goto out;
-
-	start_switch_worker();
-
-	oprofile_started = 1;
-out:
-	mutex_unlock(&start_mutex);
-	return err;
-}
-
-
-/* echo 0>/dev/oprofile/enable */
-void oprofile_stop(void)
-{
-	mutex_lock(&start_mutex);
-	if (!oprofile_started)
-		goto out;
-	oprofile_ops.stop();
-	oprofile_started = 0;
-
-	stop_switch_worker();
-
-	/* wake up the daemon to read what remains */
-	wake_up_buffer_waiter();
-out:
-	mutex_unlock(&start_mutex);
-}
-
-
-void oprofile_shutdown(void)
-{
-	mutex_lock(&start_mutex);
-	if (oprofile_ops.sync_stop) {
-		int sync_ret = oprofile_ops.sync_stop();
-		switch (sync_ret) {
-		case 0:
-			goto post_sync;
-		case 1:
-			goto do_generic;
-		default:
-			goto post_sync;
-		}
-	}
-do_generic:
-	sync_stop();
-post_sync:
-	if (oprofile_ops.shutdown)
-		oprofile_ops.shutdown();
-	is_setup = 0;
-	free_event_buffer();
-	free_cpu_buffers();
-	mutex_unlock(&start_mutex);
-}
-
-int oprofile_set_ulong(unsigned long *addr, unsigned long val)
-{
-	int err = -EBUSY;
-
-	mutex_lock(&start_mutex);
-	if (!oprofile_started) {
-		*addr = val;
-		err = 0;
-	}
-	mutex_unlock(&start_mutex);
-
-	return err;
-}
-
-static int timer_mode;
-
-static int __init oprofile_init(void)
-{
-	int err;
-
-	/* always init architecture to setup backtrace support */
-	timer_mode = 0;
-	err = oprofile_arch_init(&oprofile_ops);
-	if (!err) {
-		if (!timer && !oprofilefs_register())
-			return 0;
-		oprofile_arch_exit();
-	}
-
-	/* setup timer mode: */
-	timer_mode = 1;
-	/* no nmi timer mode if oprofile.timer is set */
-	if (timer || op_nmi_timer_init(&oprofile_ops)) {
-		err = oprofile_timer_init(&oprofile_ops);
-		if (err)
-			return err;
-	}
-
-	return oprofilefs_register();
-}
-
-
-static void __exit oprofile_exit(void)
-{
-	oprofilefs_unregister();
-	if (!timer_mode)
-		oprofile_arch_exit();
-}
-
-
-module_init(oprofile_init);
-module_exit(oprofile_exit);
-
-module_param_named(timer, timer, int, 0644);
-MODULE_PARM_DESC(timer, "force use of timer interrupt");
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("John Levon <levon@movementarian.org>");
-MODULE_DESCRIPTION("OProfile system profiler");
diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h
deleted file mode 100644
index d5412060ab0f..000000000000
--- a/drivers/oprofile/oprof.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * @file oprof.h
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- */
-
-#ifndef OPROF_H
-#define OPROF_H
-
-int oprofile_setup(void);
-void oprofile_shutdown(void);
-
-int oprofilefs_register(void);
-void oprofilefs_unregister(void);
-
-int oprofile_start(void);
-void oprofile_stop(void);
-
-struct oprofile_operations;
-
-extern unsigned long oprofile_buffer_size;
-extern unsigned long oprofile_cpu_buffer_size;
-extern unsigned long oprofile_buffer_watershed;
-extern unsigned long oprofile_time_slice;
-
-extern struct oprofile_operations oprofile_ops;
-extern unsigned long oprofile_started;
-extern unsigned long oprofile_backtrace_depth;
-
-struct dentry;
-
-void oprofile_create_files(struct dentry *root);
-int oprofile_timer_init(struct oprofile_operations *ops);
-#ifdef CONFIG_OPROFILE_NMI_TIMER
-int op_nmi_timer_init(struct oprofile_operations *ops);
-#else
-static inline int op_nmi_timer_init(struct oprofile_operations *ops)
-{
-	return -ENODEV;
-}
-#endif
-
-
-int oprofile_set_ulong(unsigned long *addr, unsigned long val);
-int oprofile_set_timeout(unsigned long time);
-
-#endif /* OPROF_H */
diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c
deleted file mode 100644
index ee2cfce358b9..000000000000
--- a/drivers/oprofile/oprofile_files.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/**
- * @file oprofile_files.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- */
-
-#include <linux/fs.h>
-#include <linux/oprofile.h>
-#include <linux/jiffies.h>
-
-#include "event_buffer.h"
-#include "oprofile_stats.h"
-#include "oprof.h"
-
-#define BUFFER_SIZE_DEFAULT		131072
-#define CPU_BUFFER_SIZE_DEFAULT		8192
-#define BUFFER_WATERSHED_DEFAULT	32768	/* FIXME: tune */
-#define TIME_SLICE_DEFAULT		1
-
-unsigned long oprofile_buffer_size;
-unsigned long oprofile_cpu_buffer_size;
-unsigned long oprofile_buffer_watershed;
-unsigned long oprofile_time_slice;
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static ssize_t timeout_read(struct file *file, char __user *buf,
-		size_t count, loff_t *offset)
-{
-	return oprofilefs_ulong_to_user(jiffies_to_msecs(oprofile_time_slice),
-					buf, count, offset);
-}
-
-
-static ssize_t timeout_write(struct file *file, char const __user *buf,
-		size_t count, loff_t *offset)
-{
-	unsigned long val;
-	int retval;
-
-	if (*offset)
-		return -EINVAL;
-
-	retval = oprofilefs_ulong_from_user(&val, buf, count);
-	if (retval <= 0)
-		return retval;
-
-	retval = oprofile_set_timeout(val);
-
-	if (retval)
-		return retval;
-	return count;
-}
-
-
-static const struct file_operations timeout_fops = {
-	.read		= timeout_read,
-	.write		= timeout_write,
-	.llseek		= default_llseek,
-};
-
-#endif
-
-
-static ssize_t depth_read(struct file *file, char __user *buf, size_t count, loff_t *offset)
-{
-	return oprofilefs_ulong_to_user(oprofile_backtrace_depth, buf, count,
-					offset);
-}
-
-
-static ssize_t depth_write(struct file *file, char const __user *buf, size_t count, loff_t *offset)
-{
-	unsigned long val;
-	int retval;
-
-	if (*offset)
-		return -EINVAL;
-
-	if (!oprofile_ops.backtrace)
-		return -EINVAL;
-
-	retval = oprofilefs_ulong_from_user(&val, buf, count);
-	if (retval <= 0)
-		return retval;
-
-	retval = oprofile_set_ulong(&oprofile_backtrace_depth, val);
-	if (retval)
-		return retval;
-
-	return count;
-}
-
-
-static const struct file_operations depth_fops = {
-	.read		= depth_read,
-	.write		= depth_write,
-	.llseek		= default_llseek,
-};
-
-
-static ssize_t pointer_size_read(struct file *file, char __user *buf, size_t count, loff_t *offset)
-{
-	return oprofilefs_ulong_to_user(sizeof(void *), buf, count, offset);
-}
-
-
-static const struct file_operations pointer_size_fops = {
-	.read		= pointer_size_read,
-	.llseek		= default_llseek,
-};
-
-
-static ssize_t cpu_type_read(struct file *file, char __user *buf, size_t count, loff_t *offset)
-{
-	return oprofilefs_str_to_user(oprofile_ops.cpu_type, buf, count, offset);
-}
-
-
-static const struct file_operations cpu_type_fops = {
-	.read		= cpu_type_read,
-	.llseek		= default_llseek,
-};
-
-
-static ssize_t enable_read(struct file *file, char __user *buf, size_t count, loff_t *offset)
-{
-	return oprofilefs_ulong_to_user(oprofile_started, buf, count, offset);
-}
-
-
-static ssize_t enable_write(struct file *file, char const __user *buf, size_t count, loff_t *offset)
-{
-	unsigned long val;
-	int retval;
-
-	if (*offset)
-		return -EINVAL;
-
-	retval = oprofilefs_ulong_from_user(&val, buf, count);
-	if (retval <= 0)
-		return retval;
-
-	retval = 0;
-	if (val)
-		retval = oprofile_start();
-	else
-		oprofile_stop();
-
-	if (retval)
-		return retval;
-	return count;
-}
-
-
-static const struct file_operations enable_fops = {
-	.read		= enable_read,
-	.write		= enable_write,
-	.llseek		= default_llseek,
-};
-
-
-static ssize_t dump_write(struct file *file, char const __user *buf, size_t count, loff_t *offset)
-{
-	wake_up_buffer_waiter();
-	return count;
-}
-
-
-static const struct file_operations dump_fops = {
-	.write		= dump_write,
-	.llseek		= noop_llseek,
-};
-
-void oprofile_create_files(struct dentry *root)
-{
-	/* reinitialize default values */
-	oprofile_buffer_size =		BUFFER_SIZE_DEFAULT;
-	oprofile_cpu_buffer_size =	CPU_BUFFER_SIZE_DEFAULT;
-	oprofile_buffer_watershed =	BUFFER_WATERSHED_DEFAULT;
-	oprofile_time_slice =		msecs_to_jiffies(TIME_SLICE_DEFAULT);
-
-	oprofilefs_create_file(root, "enable", &enable_fops);
-	oprofilefs_create_file_perm(root, "dump", &dump_fops, 0666);
-	oprofilefs_create_file(root, "buffer", &event_buffer_fops);
-	oprofilefs_create_ulong(root, "buffer_size", &oprofile_buffer_size);
-	oprofilefs_create_ulong(root, "buffer_watershed", &oprofile_buffer_watershed);
-	oprofilefs_create_ulong(root, "cpu_buffer_size", &oprofile_cpu_buffer_size);
-	oprofilefs_create_file(root, "cpu_type", &cpu_type_fops);
-	oprofilefs_create_file(root, "backtrace_depth", &depth_fops);
-	oprofilefs_create_file(root, "pointer_size", &pointer_size_fops);
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-	oprofilefs_create_file(root, "time_slice", &timeout_fops);
-#endif
-	oprofile_create_stats_files(root);
-	if (oprofile_ops.create_files)
-		oprofile_ops.create_files(root);
-}
diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c
deleted file mode 100644
index 98a63a5f8763..000000000000
--- a/drivers/oprofile/oprofile_perf.c
+++ /dev/null
@@ -1,328 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2010 ARM Ltd.
- * Copyright 2012 Advanced Micro Devices, Inc., Robert Richter
- *
- * Perf-events backend for OProfile.
- */
-#include <linux/perf_event.h>
-#include <linux/platform_device.h>
-#include <linux/oprofile.h>
-#include <linux/slab.h>
-
-/*
- * Per performance monitor configuration as set via oprofilefs.
- */
-struct op_counter_config {
-	unsigned long count;
-	unsigned long enabled;
-	unsigned long event;
-	unsigned long unit_mask;
-	unsigned long kernel;
-	unsigned long user;
-	struct perf_event_attr attr;
-};
-
-static int oprofile_perf_enabled;
-static DEFINE_MUTEX(oprofile_perf_mutex);
-
-static struct op_counter_config *counter_config;
-static DEFINE_PER_CPU(struct perf_event **, perf_events);
-static int num_counters;
-
-/*
- * Overflow callback for oprofile.
- */
-static void op_overflow_handler(struct perf_event *event,
-			struct perf_sample_data *data, struct pt_regs *regs)
-{
-	int id;
-	u32 cpu = smp_processor_id();
-
-	for (id = 0; id < num_counters; ++id)
-		if (per_cpu(perf_events, cpu)[id] == event)
-			break;
-
-	if (id != num_counters)
-		oprofile_add_sample(regs, id);
-	else
-		pr_warn("oprofile: ignoring spurious overflow on cpu %u\n",
-			cpu);
-}
-
-/*
- * Called by oprofile_perf_setup to create perf attributes to mirror the oprofile
- * settings in counter_config. Attributes are created as `pinned' events and
- * so are permanently scheduled on the PMU.
- */
-static void op_perf_setup(void)
-{
-	int i;
-	u32 size = sizeof(struct perf_event_attr);
-	struct perf_event_attr *attr;
-
-	for (i = 0; i < num_counters; ++i) {
-		attr = &counter_config[i].attr;
-		memset(attr, 0, size);
-		attr->type		= PERF_TYPE_RAW;
-		attr->size		= size;
-		attr->config		= counter_config[i].event;
-		attr->sample_period	= counter_config[i].count;
-		attr->pinned		= 1;
-	}
-}
-
-static int op_create_counter(int cpu, int event)
-{
-	struct perf_event *pevent;
-
-	if (!counter_config[event].enabled || per_cpu(perf_events, cpu)[event])
-		return 0;
-
-	pevent = perf_event_create_kernel_counter(&counter_config[event].attr,
-						  cpu, NULL,
-						  op_overflow_handler, NULL);
-
-	if (IS_ERR(pevent))
-		return PTR_ERR(pevent);
-
-	if (pevent->state != PERF_EVENT_STATE_ACTIVE) {
-		perf_event_release_kernel(pevent);
-		pr_warn("oprofile: failed to enable event %d on CPU %d\n",
-			event, cpu);
-		return -EBUSY;
-	}
-
-	per_cpu(perf_events, cpu)[event] = pevent;
-
-	return 0;
-}
-
-static void op_destroy_counter(int cpu, int event)
-{
-	struct perf_event *pevent = per_cpu(perf_events, cpu)[event];
-
-	if (pevent) {
-		perf_event_release_kernel(pevent);
-		per_cpu(perf_events, cpu)[event] = NULL;
-	}
-}
-
-/*
- * Called by oprofile_perf_start to create active perf events based on the
- * perviously configured attributes.
- */
-static int op_perf_start(void)
-{
-	int cpu, event, ret = 0;
-
-	for_each_online_cpu(cpu) {
-		for (event = 0; event < num_counters; ++event) {
-			ret = op_create_counter(cpu, event);
-			if (ret)
-				return ret;
-		}
-	}
-
-	return ret;
-}
-
-/*
- * Called by oprofile_perf_stop at the end of a profiling run.
- */
-static void op_perf_stop(void)
-{
-	int cpu, event;
-
-	for_each_online_cpu(cpu)
-		for (event = 0; event < num_counters; ++event)
-			op_destroy_counter(cpu, event);
-}
-
-static int oprofile_perf_create_files(struct dentry *root)
-{
-	unsigned int i;
-
-	for (i = 0; i < num_counters; i++) {
-		struct dentry *dir;
-		char buf[4];
-
-		snprintf(buf, sizeof buf, "%d", i);
-		dir = oprofilefs_mkdir(root, buf);
-		oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled);
-		oprofilefs_create_ulong(dir, "event", &counter_config[i].event);
-		oprofilefs_create_ulong(dir, "count", &counter_config[i].count);
-		oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask);
-		oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel);
-		oprofilefs_create_ulong(dir, "user", &counter_config[i].user);
-	}
-
-	return 0;
-}
-
-static int oprofile_perf_setup(void)
-{
-	raw_spin_lock(&oprofilefs_lock);
-	op_perf_setup();
-	raw_spin_unlock(&oprofilefs_lock);
-	return 0;
-}
-
-static int oprofile_perf_start(void)
-{
-	int ret = -EBUSY;
-
-	mutex_lock(&oprofile_perf_mutex);
-	if (!oprofile_perf_enabled) {
-		ret = 0;
-		op_perf_start();
-		oprofile_perf_enabled = 1;
-	}
-	mutex_unlock(&oprofile_perf_mutex);
-	return ret;
-}
-
-static void oprofile_perf_stop(void)
-{
-	mutex_lock(&oprofile_perf_mutex);
-	if (oprofile_perf_enabled)
-		op_perf_stop();
-	oprofile_perf_enabled = 0;
-	mutex_unlock(&oprofile_perf_mutex);
-}
-
-#ifdef CONFIG_PM
-
-static int oprofile_perf_suspend(struct platform_device *dev, pm_message_t state)
-{
-	mutex_lock(&oprofile_perf_mutex);
-	if (oprofile_perf_enabled)
-		op_perf_stop();
-	mutex_unlock(&oprofile_perf_mutex);
-	return 0;
-}
-
-static int oprofile_perf_resume(struct platform_device *dev)
-{
-	mutex_lock(&oprofile_perf_mutex);
-	if (oprofile_perf_enabled && op_perf_start())
-		oprofile_perf_enabled = 0;
-	mutex_unlock(&oprofile_perf_mutex);
-	return 0;
-}
-
-static struct platform_driver oprofile_driver = {
-	.driver		= {
-		.name		= "oprofile-perf",
-	},
-	.resume		= oprofile_perf_resume,
-	.suspend	= oprofile_perf_suspend,
-};
-
-static struct platform_device *oprofile_pdev;
-
-static int __init init_driverfs(void)
-{
-	int ret;
-
-	ret = platform_driver_register(&oprofile_driver);
-	if (ret)
-		return ret;
-
-	oprofile_pdev =	platform_device_register_simple(
-				oprofile_driver.driver.name, 0, NULL, 0);
-	if (IS_ERR(oprofile_pdev)) {
-		ret = PTR_ERR(oprofile_pdev);
-		platform_driver_unregister(&oprofile_driver);
-	}
-
-	return ret;
-}
-
-static void exit_driverfs(void)
-{
-	platform_device_unregister(oprofile_pdev);
-	platform_driver_unregister(&oprofile_driver);
-}
-
-#else
-
-static inline int  init_driverfs(void) { return 0; }
-static inline void exit_driverfs(void) { }
-
-#endif /* CONFIG_PM */
-
-void oprofile_perf_exit(void)
-{
-	int cpu, id;
-	struct perf_event *event;
-
-	for_each_possible_cpu(cpu) {
-		for (id = 0; id < num_counters; ++id) {
-			event = per_cpu(perf_events, cpu)[id];
-			if (event)
-				perf_event_release_kernel(event);
-		}
-
-		kfree(per_cpu(perf_events, cpu));
-	}
-
-	kfree(counter_config);
-	exit_driverfs();
-}
-
-int __init oprofile_perf_init(struct oprofile_operations *ops)
-{
-	int cpu, ret = 0;
-
-	ret = init_driverfs();
-	if (ret)
-		return ret;
-
-	num_counters = perf_num_counters();
-	if (num_counters <= 0) {
-		pr_info("oprofile: no performance counters\n");
-		ret = -ENODEV;
-		goto out;
-	}
-
-	counter_config = kcalloc(num_counters,
-			sizeof(struct op_counter_config), GFP_KERNEL);
-
-	if (!counter_config) {
-		pr_info("oprofile: failed to allocate %d "
-				"counters\n", num_counters);
-		ret = -ENOMEM;
-		num_counters = 0;
-		goto out;
-	}
-
-	for_each_possible_cpu(cpu) {
-		per_cpu(perf_events, cpu) = kcalloc(num_counters,
-				sizeof(struct perf_event *), GFP_KERNEL);
-		if (!per_cpu(perf_events, cpu)) {
-			pr_info("oprofile: failed to allocate %d perf events "
-					"for cpu %d\n", num_counters, cpu);
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
-
-	ops->create_files	= oprofile_perf_create_files;
-	ops->setup		= oprofile_perf_setup;
-	ops->start		= oprofile_perf_start;
-	ops->stop		= oprofile_perf_stop;
-	ops->shutdown		= oprofile_perf_stop;
-	ops->cpu_type		= op_name_from_perf_id();
-
-	if (!ops->cpu_type)
-		ret = -ENODEV;
-	else
-		pr_info("oprofile: using %s\n", ops->cpu_type);
-
-out:
-	if (ret)
-		oprofile_perf_exit();
-
-	return ret;
-}
diff --git a/drivers/oprofile/oprofile_stats.c b/drivers/oprofile/oprofile_stats.c
deleted file mode 100644
index 59659cea4582..000000000000
--- a/drivers/oprofile/oprofile_stats.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * @file oprofile_stats.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- */
-
-#include <linux/oprofile.h>
-#include <linux/smp.h>
-#include <linux/cpumask.h>
-#include <linux/threads.h>
-
-#include "oprofile_stats.h"
-#include "cpu_buffer.h"
-
-struct oprofile_stat_struct oprofile_stats;
-
-void oprofile_reset_stats(void)
-{
-	struct oprofile_cpu_buffer *cpu_buf;
-	int i;
-
-	for_each_possible_cpu(i) {
-		cpu_buf = &per_cpu(op_cpu_buffer, i);
-		cpu_buf->sample_received = 0;
-		cpu_buf->sample_lost_overflow = 0;
-		cpu_buf->backtrace_aborted = 0;
-		cpu_buf->sample_invalid_eip = 0;
-	}
-
-	atomic_set(&oprofile_stats.sample_lost_no_mm, 0);
-	atomic_set(&oprofile_stats.sample_lost_no_mapping, 0);
-	atomic_set(&oprofile_stats.event_lost_overflow, 0);
-	atomic_set(&oprofile_stats.bt_lost_no_mapping, 0);
-	atomic_set(&oprofile_stats.multiplex_counter, 0);
-}
-
-
-void oprofile_create_stats_files(struct dentry *root)
-{
-	struct oprofile_cpu_buffer *cpu_buf;
-	struct dentry *cpudir;
-	struct dentry *dir;
-	char buf[10];
-	int i;
-
-	dir = oprofilefs_mkdir(root, "stats");
-	if (!dir)
-		return;
-
-	for_each_possible_cpu(i) {
-		cpu_buf = &per_cpu(op_cpu_buffer, i);
-		snprintf(buf, 10, "cpu%d", i);
-		cpudir = oprofilefs_mkdir(dir, buf);
-
-		/* Strictly speaking access to these ulongs is racy,
-		 * but we can't simply lock them, and they are
-		 * informational only.
-		 */
-		oprofilefs_create_ro_ulong(cpudir, "sample_received",
-			&cpu_buf->sample_received);
-		oprofilefs_create_ro_ulong(cpudir, "sample_lost_overflow",
-			&cpu_buf->sample_lost_overflow);
-		oprofilefs_create_ro_ulong(cpudir, "backtrace_aborted",
-			&cpu_buf->backtrace_aborted);
-		oprofilefs_create_ro_ulong(cpudir, "sample_invalid_eip",
-			&cpu_buf->sample_invalid_eip);
-	}
-
-	oprofilefs_create_ro_atomic(dir, "sample_lost_no_mm",
-		&oprofile_stats.sample_lost_no_mm);
-	oprofilefs_create_ro_atomic(dir, "sample_lost_no_mapping",
-		&oprofile_stats.sample_lost_no_mapping);
-	oprofilefs_create_ro_atomic(dir, "event_lost_overflow",
-		&oprofile_stats.event_lost_overflow);
-	oprofilefs_create_ro_atomic(dir, "bt_lost_no_mapping",
-		&oprofile_stats.bt_lost_no_mapping);
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-	oprofilefs_create_ro_atomic(dir, "multiplex_counter",
-		&oprofile_stats.multiplex_counter);
-#endif
-}
diff --git a/drivers/oprofile/oprofile_stats.h b/drivers/oprofile/oprofile_stats.h
deleted file mode 100644
index 1fc622bd1834..000000000000
--- a/drivers/oprofile/oprofile_stats.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * @file oprofile_stats.h
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- */
-
-#ifndef OPROFILE_STATS_H
-#define OPROFILE_STATS_H
-
-#include <linux/atomic.h>
-
-struct oprofile_stat_struct {
-	atomic_t sample_lost_no_mm;
-	atomic_t sample_lost_no_mapping;
-	atomic_t bt_lost_no_mapping;
-	atomic_t event_lost_overflow;
-	atomic_t multiplex_counter;
-};
-
-extern struct oprofile_stat_struct oprofile_stats;
-
-/* reset all stats to zero */
-void oprofile_reset_stats(void);
-
-struct dentry;
-
-/* create the stats/ dir */
-void oprofile_create_stats_files(struct dentry *root);
-
-#endif /* OPROFILE_STATS_H */
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
deleted file mode 100644
index 0875f2f122b3..000000000000
--- a/drivers/oprofile/oprofilefs.c
+++ /dev/null
@@ -1,300 +0,0 @@
-/**
- * @file oprofilefs.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- *
- * A simple filesystem for configuration and
- * access of oprofile.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/oprofile.h>
-#include <linux/fs.h>
-#include <linux/fs_context.h>
-#include <linux/pagemap.h>
-#include <linux/uaccess.h>
-
-#include "oprof.h"
-
-#define OPROFILEFS_MAGIC 0x6f70726f
-
-DEFINE_RAW_SPINLOCK(oprofilefs_lock);
-
-static struct inode *oprofilefs_get_inode(struct super_block *sb, int mode)
-{
-	struct inode *inode = new_inode(sb);
-
-	if (inode) {
-		inode->i_ino = get_next_ino();
-		inode->i_mode = mode;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
-	}
-	return inode;
-}
-
-
-static const struct super_operations s_ops = {
-	.statfs		= simple_statfs,
-	.drop_inode 	= generic_delete_inode,
-};
-
-
-ssize_t oprofilefs_str_to_user(char const *str, char __user *buf, size_t count, loff_t *offset)
-{
-	return simple_read_from_buffer(buf, count, offset, str, strlen(str));
-}
-
-
-#define TMPBUFSIZE 50
-
-ssize_t oprofilefs_ulong_to_user(unsigned long val, char __user *buf, size_t count, loff_t *offset)
-{
-	char tmpbuf[TMPBUFSIZE];
-	size_t maxlen = snprintf(tmpbuf, TMPBUFSIZE, "%lu\n", val);
-	if (maxlen > TMPBUFSIZE)
-		maxlen = TMPBUFSIZE;
-	return simple_read_from_buffer(buf, count, offset, tmpbuf, maxlen);
-}
-
-
-/*
- * Note: If oprofilefs_ulong_from_user() returns 0, then *val remains
- * unchanged and might be uninitialized. This follows write syscall
- * implementation when count is zero: "If count is zero ... [and if]
- * no errors are detected, 0 will be returned without causing any
- * other effect." (man 2 write)
- */
-int oprofilefs_ulong_from_user(unsigned long *val, char const __user *buf, size_t count)
-{
-	char tmpbuf[TMPBUFSIZE];
-	unsigned long flags;
-
-	if (!count)
-		return 0;
-
-	if (count > TMPBUFSIZE - 1)
-		return -EINVAL;
-
-	memset(tmpbuf, 0x0, TMPBUFSIZE);
-
-	if (copy_from_user(tmpbuf, buf, count))
-		return -EFAULT;
-
-	raw_spin_lock_irqsave(&oprofilefs_lock, flags);
-	*val = simple_strtoul(tmpbuf, NULL, 0);
-	raw_spin_unlock_irqrestore(&oprofilefs_lock, flags);
-	return count;
-}
-
-
-static ssize_t ulong_read_file(struct file *file, char __user *buf, size_t count, loff_t *offset)
-{
-	unsigned long *val = file->private_data;
-	return oprofilefs_ulong_to_user(*val, buf, count, offset);
-}
-
-
-static ssize_t ulong_write_file(struct file *file, char const __user *buf, size_t count, loff_t *offset)
-{
-	unsigned long value;
-	int retval;
-
-	if (*offset)
-		return -EINVAL;
-
-	retval = oprofilefs_ulong_from_user(&value, buf, count);
-	if (retval <= 0)
-		return retval;
-
-	retval = oprofile_set_ulong(file->private_data, value);
-	if (retval)
-		return retval;
-
-	return count;
-}
-
-
-static const struct file_operations ulong_fops = {
-	.read		= ulong_read_file,
-	.write		= ulong_write_file,
-	.open		= simple_open,
-	.llseek		= default_llseek,
-};
-
-
-static const struct file_operations ulong_ro_fops = {
-	.read		= ulong_read_file,
-	.open		= simple_open,
-	.llseek		= default_llseek,
-};
-
-
-static int __oprofilefs_create_file(struct dentry *root, char const *name,
-	const struct file_operations *fops, int perm, void *priv)
-{
-	struct dentry *dentry;
-	struct inode *inode;
-
-	if (!root)
-		return -ENOMEM;
-
-	inode_lock(d_inode(root));
-	dentry = d_alloc_name(root, name);
-	if (!dentry) {
-		inode_unlock(d_inode(root));
-		return -ENOMEM;
-	}
-	inode = oprofilefs_get_inode(root->d_sb, S_IFREG | perm);
-	if (!inode) {
-		dput(dentry);
-		inode_unlock(d_inode(root));
-		return -ENOMEM;
-	}
-	inode->i_fop = fops;
-	inode->i_private = priv;
-	d_add(dentry, inode);
-	inode_unlock(d_inode(root));
-	return 0;
-}
-
-
-int oprofilefs_create_ulong(struct dentry *root,
-	char const *name, unsigned long *val)
-{
-	return __oprofilefs_create_file(root, name,
-					&ulong_fops, 0644, val);
-}
-
-
-int oprofilefs_create_ro_ulong(struct dentry *root,
-	char const *name, unsigned long *val)
-{
-	return __oprofilefs_create_file(root, name,
-					&ulong_ro_fops, 0444, val);
-}
-
-
-static ssize_t atomic_read_file(struct file *file, char __user *buf, size_t count, loff_t *offset)
-{
-	atomic_t *val = file->private_data;
-	return oprofilefs_ulong_to_user(atomic_read(val), buf, count, offset);
-}
-
-
-static const struct file_operations atomic_ro_fops = {
-	.read		= atomic_read_file,
-	.open		= simple_open,
-	.llseek		= default_llseek,
-};
-
-
-int oprofilefs_create_ro_atomic(struct dentry *root,
-	char const *name, atomic_t *val)
-{
-	return __oprofilefs_create_file(root, name,
-					&atomic_ro_fops, 0444, val);
-}
-
-
-int oprofilefs_create_file(struct dentry *root,
-	char const *name, const struct file_operations *fops)
-{
-	return __oprofilefs_create_file(root, name, fops, 0644, NULL);
-}
-
-
-int oprofilefs_create_file_perm(struct dentry *root,
-	char const *name, const struct file_operations *fops, int perm)
-{
-	return __oprofilefs_create_file(root, name, fops, perm, NULL);
-}
-
-
-struct dentry *oprofilefs_mkdir(struct dentry *parent, char const *name)
-{
-	struct dentry *dentry;
-	struct inode *inode;
-
-	inode_lock(d_inode(parent));
-	dentry = d_alloc_name(parent, name);
-	if (!dentry) {
-		inode_unlock(d_inode(parent));
-		return NULL;
-	}
-	inode = oprofilefs_get_inode(parent->d_sb, S_IFDIR | 0755);
-	if (!inode) {
-		dput(dentry);
-		inode_unlock(d_inode(parent));
-		return NULL;
-	}
-	inode->i_op = &simple_dir_inode_operations;
-	inode->i_fop = &simple_dir_operations;
-	d_add(dentry, inode);
-	inode_unlock(d_inode(parent));
-	return dentry;
-}
-
-
-static int oprofilefs_fill_super(struct super_block *sb, struct fs_context *fc)
-{
-	struct inode *root_inode;
-
-	sb->s_blocksize = PAGE_SIZE;
-	sb->s_blocksize_bits = PAGE_SHIFT;
-	sb->s_magic = OPROFILEFS_MAGIC;
-	sb->s_op = &s_ops;
-	sb->s_time_gran = 1;
-
-	root_inode = oprofilefs_get_inode(sb, S_IFDIR | 0755);
-	if (!root_inode)
-		return -ENOMEM;
-	root_inode->i_op = &simple_dir_inode_operations;
-	root_inode->i_fop = &simple_dir_operations;
-	sb->s_root = d_make_root(root_inode);
-	if (!sb->s_root)
-		return -ENOMEM;
-
-	oprofile_create_files(sb->s_root);
-
-	// FIXME: verify kill_litter_super removes our dentries
-	return 0;
-}
-
-static int oprofilefs_get_tree(struct fs_context *fc)
-{
-	return get_tree_single(fc, oprofilefs_fill_super);
-}
-
-static const struct fs_context_operations oprofilefs_context_ops = {
-	.get_tree	= oprofilefs_get_tree,
-};
-
-static int oprofilefs_init_fs_context(struct fs_context *fc)
-{
-	fc->ops = &oprofilefs_context_ops;
-	return 0;
-}
-
-static struct file_system_type oprofilefs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "oprofilefs",
-	.init_fs_context = oprofilefs_init_fs_context,
-	.kill_sb	= kill_litter_super,
-};
-MODULE_ALIAS_FS("oprofilefs");
-
-
-int __init oprofilefs_register(void)
-{
-	return register_filesystem(&oprofilefs_type);
-}
-
-
-void __exit oprofilefs_unregister(void)
-{
-	unregister_filesystem(&oprofilefs_type);
-}
diff --git a/drivers/oprofile/timer_int.c b/drivers/oprofile/timer_int.c
deleted file mode 100644
index 2498a6cd7c24..000000000000
--- a/drivers/oprofile/timer_int.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * @file timer_int.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/notifier.h>
-#include <linux/smp.h>
-#include <linux/oprofile.h>
-#include <linux/profile.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/hrtimer.h>
-#include <asm/irq_regs.h>
-#include <asm/ptrace.h>
-
-#include "oprof.h"
-
-static DEFINE_PER_CPU(struct hrtimer, oprofile_hrtimer);
-static int ctr_running;
-
-static enum hrtimer_restart oprofile_hrtimer_notify(struct hrtimer *hrtimer)
-{
-	oprofile_add_sample(get_irq_regs(), 0);
-	hrtimer_forward_now(hrtimer, ns_to_ktime(TICK_NSEC));
-	return HRTIMER_RESTART;
-}
-
-static void __oprofile_hrtimer_start(void *unused)
-{
-	struct hrtimer *hrtimer = this_cpu_ptr(&oprofile_hrtimer);
-
-	if (!ctr_running)
-		return;
-
-	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hrtimer->function = oprofile_hrtimer_notify;
-
-	hrtimer_start(hrtimer, ns_to_ktime(TICK_NSEC),
-		      HRTIMER_MODE_REL_PINNED);
-}
-
-static int oprofile_hrtimer_start(void)
-{
-	get_online_cpus();
-	ctr_running = 1;
-	on_each_cpu(__oprofile_hrtimer_start, NULL, 1);
-	put_online_cpus();
-	return 0;
-}
-
-static void __oprofile_hrtimer_stop(int cpu)
-{
-	struct hrtimer *hrtimer = &per_cpu(oprofile_hrtimer, cpu);
-
-	if (!ctr_running)
-		return;
-
-	hrtimer_cancel(hrtimer);
-}
-
-static void oprofile_hrtimer_stop(void)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		__oprofile_hrtimer_stop(cpu);
-	ctr_running = 0;
-	put_online_cpus();
-}
-
-static int oprofile_timer_online(unsigned int cpu)
-{
-	local_irq_disable();
-	__oprofile_hrtimer_start(NULL);
-	local_irq_enable();
-	return 0;
-}
-
-static int oprofile_timer_prep_down(unsigned int cpu)
-{
-	__oprofile_hrtimer_stop(cpu);
-	return 0;
-}
-
-static enum cpuhp_state hp_online;
-
-static int oprofile_hrtimer_setup(void)
-{
-	int ret;
-
-	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
-					"oprofile/timer:online",
-					oprofile_timer_online,
-					oprofile_timer_prep_down);
-	if (ret < 0)
-		return ret;
-	hp_online = ret;
-	return 0;
-}
-
-static void oprofile_hrtimer_shutdown(void)
-{
-	cpuhp_remove_state_nocalls(hp_online);
-}
-
-int oprofile_timer_init(struct oprofile_operations *ops)
-{
-	ops->create_files	= NULL;
-	ops->setup		= oprofile_hrtimer_setup;
-	ops->shutdown		= oprofile_hrtimer_shutdown;
-	ops->start		= oprofile_hrtimer_start;
-	ops->stop		= oprofile_hrtimer_stop;
-	ops->cpu_type		= "timer";
-	printk(KERN_INFO "oprofile: using timer interrupt.\n");
-	return 0;
-}
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
deleted file mode 100644
index b2a0f15f11fe..000000000000
--- a/include/linux/oprofile.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/**
- * @file oprofile.h
- *
- * API for machine-specific interrupts to interface
- * to oprofile.
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- */
-
-#ifndef OPROFILE_H
-#define OPROFILE_H
-
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/printk.h>
-#include <linux/atomic.h>
- 
-/* Each escaped entry is prefixed by ESCAPE_CODE
- * then one of the following codes, then the
- * relevant data.
- * These #defines live in this file so that arch-specific
- * buffer sync'ing code can access them.
- */
-#define ESCAPE_CODE			~0UL
-#define CTX_SWITCH_CODE			1
-#define CPU_SWITCH_CODE			2
-#define COOKIE_SWITCH_CODE		3
-#define KERNEL_ENTER_SWITCH_CODE	4
-#define KERNEL_EXIT_SWITCH_CODE		5
-#define MODULE_LOADED_CODE		6
-#define CTX_TGID_CODE			7
-#define TRACE_BEGIN_CODE		8
-#define TRACE_END_CODE			9
-#define XEN_ENTER_SWITCH_CODE		10
-#define SPU_PROFILING_CODE		11
-#define SPU_CTX_SWITCH_CODE		12
-#define IBS_FETCH_CODE			13
-#define IBS_OP_CODE			14
-
-struct dentry;
-struct file_operations;
-struct pt_regs;
- 
-/* Operations structure to be filled in */
-struct oprofile_operations {
-	/* create any necessary configuration files in the oprofile fs.
-	 * Optional. */
-	int (*create_files)(struct dentry * root);
-	/* Do any necessary interrupt setup. Optional. */
-	int (*setup)(void);
-	/* Do any necessary interrupt shutdown. Optional. */
-	void (*shutdown)(void);
-	/* Start delivering interrupts. */
-	int (*start)(void);
-	/* Stop delivering interrupts. */
-	void (*stop)(void);
-	/* Arch-specific buffer sync functions.
-	 * Return value = 0:  Success
-	 * Return value = -1: Failure
-	 * Return value = 1:  Run generic sync function
-	 */
-	int (*sync_start)(void);
-	int (*sync_stop)(void);
-
-	/* Initiate a stack backtrace. Optional. */
-	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
-
-	/* Multiplex between different events. Optional. */
-	int (*switch_events)(void);
-	/* CPU identification string. */
-	char * cpu_type;
-};
-
-/**
- * One-time initialisation. *ops must be set to a filled-in
- * operations structure. This is called even in timer interrupt
- * mode so an arch can set a backtrace callback.
- *
- * If an error occurs, the fields should be left untouched.
- */
-int oprofile_arch_init(struct oprofile_operations * ops);
- 
-/**
- * One-time exit/cleanup for the arch.
- */
-void oprofile_arch_exit(void);
-
-/**
- * Add a sample. This may be called from any context.
- */
-void oprofile_add_sample(struct pt_regs * const regs, unsigned long event);
-
-/**
- * Add an extended sample.  Use this when the PC is not from the regs, and
- * we cannot determine if we're in kernel mode from the regs.
- *
- * This function does perform a backtrace.
- *
- */
-void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
-				unsigned long event, int is_kernel);
-
-/**
- * Add an hardware sample.
- */
-void oprofile_add_ext_hw_sample(unsigned long pc, struct pt_regs * const regs,
-	unsigned long event, int is_kernel,
-	struct task_struct *task);
-
-/* Use this instead when the PC value is not from the regs. Doesn't
- * backtrace. */
-void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event);
-
-/* add a backtrace entry, to be called from the ->backtrace callback */
-void oprofile_add_trace(unsigned long eip);
-
-
-/**
- * Create a file of the given name as a child of the given root, with
- * the specified file operations.
- */
-int oprofilefs_create_file(struct dentry * root,
-	char const * name, const struct file_operations * fops);
-
-int oprofilefs_create_file_perm(struct dentry * root,
-	char const * name, const struct file_operations * fops, int perm);
- 
-/** Create a file for read/write access to an unsigned long. */
-int oprofilefs_create_ulong(struct dentry * root,
-	char const * name, ulong * val);
- 
-/** Create a file for read-only access to an unsigned long. */
-int oprofilefs_create_ro_ulong(struct dentry * root,
-	char const * name, ulong * val);
- 
-/** Create a file for read-only access to an atomic_t. */
-int oprofilefs_create_ro_atomic(struct dentry * root,
-	char const * name, atomic_t * val);
- 
-/** create a directory */
-struct dentry *oprofilefs_mkdir(struct dentry *parent, char const *name);
-
-/**
- * Write the given asciz string to the given user buffer @buf, updating *offset
- * appropriately. Returns bytes written or -EFAULT.
- */
-ssize_t oprofilefs_str_to_user(char const * str, char __user * buf, size_t count, loff_t * offset);
-
-/**
- * Convert an unsigned long value into ASCII and copy it to the user buffer @buf,
- * updating *offset appropriately. Returns bytes written or -EFAULT.
- */
-ssize_t oprofilefs_ulong_to_user(unsigned long val, char __user * buf, size_t count, loff_t * offset);
-
-/**
- * Read an ASCII string for a number from a userspace buffer and fill *val on success.
- * Returns 0 on success, < 0 on error.
- */
-int oprofilefs_ulong_from_user(unsigned long * val, char const __user * buf, size_t count);
-
-/** lock for read/write safety */
-extern raw_spinlock_t oprofilefs_lock;
-
-/**
- * Add the contents of a circular buffer to the event buffer.
- */
-void oprofile_put_buff(unsigned long *buf, unsigned int start,
-			unsigned int stop, unsigned int max);
-
-unsigned long oprofile_get_cpu_buffer_size(void);
-void oprofile_cpu_buffer_inc_smpl_lost(void);
- 
-/* cpu buffer functions */
-
-struct op_sample;
-
-struct op_entry {
-	struct ring_buffer_event *event;
-	struct op_sample *sample;
-	unsigned long size;
-	unsigned long *data;
-};
-
-void oprofile_write_reserve(struct op_entry *entry,
-			    struct pt_regs * const regs,
-			    unsigned long pc, int code, int size);
-int oprofile_add_data(struct op_entry *entry, unsigned long val);
-int oprofile_add_data64(struct op_entry *entry, u64 val);
-int oprofile_write_commit(struct op_entry *entry);
-
-#ifdef CONFIG_HW_PERF_EVENTS
-int __init oprofile_perf_init(struct oprofile_operations *ops);
-void oprofile_perf_exit(void);
-char *op_name_from_perf_id(void);
-#else
-static inline int __init oprofile_perf_init(struct oprofile_operations *ops)
-{
-	pr_info("oprofile: hardware counters not available\n");
-	return -ENODEV;
-}
-static inline void oprofile_perf_exit(void) { }
-#endif /* CONFIG_HW_PERF_EVENTS */
-
-#endif /* OPROFILE_H */
diff --git a/init/Kconfig b/init/Kconfig
index b77c60f8b963..03eb36256405 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2024,7 +2024,7 @@ config PROFILING
 	bool "Profiling support"
 	help
 	  Say Y here to enable the extended profiling support mechanisms used
-	  by profilers such as OProfile.
+	  by profilers.
 
 #
 # Place an empty function call at each tracepoint site. Can be
-- 
cgit v1.2.3


From be65de6b03aa638c46ea51e9d11a92e4914d8103 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 14 Jan 2021 17:05:31 +0530
Subject: fs: Remove dcookies support

The dcookies stuff was only used by the kernel's old oprofile code. Now
that oprofile's support is removed from the kernel, there is no need for
dcookies as well. Remove it.

Suggested-by: Christoph Hellwig <hch@infradead.org>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Robert Richter <rric@kernel.org>
Acked-by: William Cohen <wcohen@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/Makefile              |   1 -
 fs/dcookies.c            | 356 -----------------------------------------------
 include/linux/dcookies.h |  69 ---------
 kernel/sys.c             |   1 -
 4 files changed, 427 deletions(-)
 delete mode 100644 fs/dcookies.c
 delete mode 100644 include/linux/dcookies.h

(limited to 'include/linux')

diff --git a/fs/Makefile b/fs/Makefile
index 999d1a23f036..3215fe205256 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -64,7 +64,6 @@ obj-$(CONFIG_SYSFS)		+= sysfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
-obj-$(CONFIG_PROFILING)		+= dcookies.o
 obj-$(CONFIG_DLM)		+= dlm/
  
 # Do not add any filesystems before this line
diff --git a/fs/dcookies.c b/fs/dcookies.c
deleted file mode 100644
index 6eeb61100a09..000000000000
--- a/fs/dcookies.c
+++ /dev/null
@@ -1,356 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * dcookies.c
- *
- * Copyright 2002 John Levon <levon@movementarian.org>
- *
- * Persistent cookie-path mappings. These are used by
- * profilers to convert a per-task EIP value into something
- * non-transitory that can be processed at a later date.
- * This is done by locking the dentry/vfsmnt pair in the
- * kernel until released by the tasks needing the persistent
- * objects. The tag is simply an unsigned long that refers
- * to the pair and can be looked up from userspace.
- */
-
-#include <linux/syscalls.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/mount.h>
-#include <linux/capability.h>
-#include <linux/dcache.h>
-#include <linux/mm.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/dcookies.h>
-#include <linux/mutex.h>
-#include <linux/path.h>
-#include <linux/compat.h>
-#include <linux/uaccess.h>
-
-/* The dcookies are allocated from a kmem_cache and
- * hashed onto a small number of lists. None of the
- * code here is particularly performance critical
- */
-struct dcookie_struct {
-	struct path path;
-	struct list_head hash_list;
-};
-
-static LIST_HEAD(dcookie_users);
-static DEFINE_MUTEX(dcookie_mutex);
-static struct kmem_cache *dcookie_cache __read_mostly;
-static struct list_head *dcookie_hashtable __read_mostly;
-static size_t hash_size __read_mostly;
-
-static inline int is_live(void)
-{
-	return !(list_empty(&dcookie_users));
-}
-
-
-/* The dentry is locked, its address will do for the cookie */
-static inline unsigned long dcookie_value(struct dcookie_struct * dcs)
-{
-	return (unsigned long)dcs->path.dentry;
-}
-
-
-static size_t dcookie_hash(unsigned long dcookie)
-{
-	return (dcookie >> L1_CACHE_SHIFT) & (hash_size - 1);
-}
-
-
-static struct dcookie_struct * find_dcookie(unsigned long dcookie)
-{
-	struct dcookie_struct *found = NULL;
-	struct dcookie_struct * dcs;
-	struct list_head * pos;
-	struct list_head * list;
-
-	list = dcookie_hashtable + dcookie_hash(dcookie);
-
-	list_for_each(pos, list) {
-		dcs = list_entry(pos, struct dcookie_struct, hash_list);
-		if (dcookie_value(dcs) == dcookie) {
-			found = dcs;
-			break;
-		}
-	}
-
-	return found;
-}
-
-
-static void hash_dcookie(struct dcookie_struct * dcs)
-{
-	struct list_head * list = dcookie_hashtable + dcookie_hash(dcookie_value(dcs));
-	list_add(&dcs->hash_list, list);
-}
-
-
-static struct dcookie_struct *alloc_dcookie(const struct path *path)
-{
-	struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
-							GFP_KERNEL);
-	struct dentry *d;
-	if (!dcs)
-		return NULL;
-
-	d = path->dentry;
-	spin_lock(&d->d_lock);
-	d->d_flags |= DCACHE_COOKIE;
-	spin_unlock(&d->d_lock);
-
-	dcs->path = *path;
-	path_get(path);
-	hash_dcookie(dcs);
-	return dcs;
-}
-
-
-/* This is the main kernel-side routine that retrieves the cookie
- * value for a dentry/vfsmnt pair.
- */
-int get_dcookie(const struct path *path, unsigned long *cookie)
-{
-	int err = 0;
-	struct dcookie_struct * dcs;
-
-	mutex_lock(&dcookie_mutex);
-
-	if (!is_live()) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	if (path->dentry->d_flags & DCACHE_COOKIE) {
-		dcs = find_dcookie((unsigned long)path->dentry);
-	} else {
-		dcs = alloc_dcookie(path);
-		if (!dcs) {
-			err = -ENOMEM;
-			goto out;
-		}
-	}
-
-	*cookie = dcookie_value(dcs);
-
-out:
-	mutex_unlock(&dcookie_mutex);
-	return err;
-}
-
-
-/* And here is where the userspace process can look up the cookie value
- * to retrieve the path.
- */
-static int do_lookup_dcookie(u64 cookie64, char __user *buf, size_t len)
-{
-	unsigned long cookie = (unsigned long)cookie64;
-	int err = -EINVAL;
-	char * kbuf;
-	char * path;
-	size_t pathlen;
-	struct dcookie_struct * dcs;
-
-	/* we could leak path information to users
-	 * without dir read permission without this
-	 */
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	mutex_lock(&dcookie_mutex);
-
-	if (!is_live()) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	if (!(dcs = find_dcookie(cookie)))
-		goto out;
-
-	err = -ENOMEM;
-	kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!kbuf)
-		goto out;
-
-	/* FIXME: (deleted) ? */
-	path = d_path(&dcs->path, kbuf, PAGE_SIZE);
-
-	mutex_unlock(&dcookie_mutex);
-
-	if (IS_ERR(path)) {
-		err = PTR_ERR(path);
-		goto out_free;
-	}
-
-	err = -ERANGE;
- 
-	pathlen = kbuf + PAGE_SIZE - path;
-	if (pathlen <= len) {
-		err = pathlen;
-		if (copy_to_user(buf, path, pathlen))
-			err = -EFAULT;
-	}
-
-out_free:
-	kfree(kbuf);
-	return err;
-out:
-	mutex_unlock(&dcookie_mutex);
-	return err;
-}
-
-SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len)
-{
-	return do_lookup_dcookie(cookie64, buf, len);
-}
-
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len)
-{
-#ifdef __BIG_ENDIAN
-	return do_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
-#else
-	return do_lookup_dcookie(((u64)w1 << 32) | w0, buf, len);
-#endif
-}
-#endif
-
-static int dcookie_init(void)
-{
-	struct list_head * d;
-	unsigned int i, hash_bits;
-	int err = -ENOMEM;
-
-	dcookie_cache = kmem_cache_create("dcookie_cache",
-		sizeof(struct dcookie_struct),
-		0, 0, NULL);
-
-	if (!dcookie_cache)
-		goto out;
-
-	dcookie_hashtable = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!dcookie_hashtable)
-		goto out_kmem;
-
-	err = 0;
-
-	/*
-	 * Find the power-of-two list-heads that can fit into the allocation..
-	 * We don't guarantee that "sizeof(struct list_head)" is necessarily
-	 * a power-of-two.
-	 */
-	hash_size = PAGE_SIZE / sizeof(struct list_head);
-	hash_bits = 0;
-	do {
-		hash_bits++;
-	} while ((hash_size >> hash_bits) != 0);
-	hash_bits--;
-
-	/*
-	 * Re-calculate the actual number of entries and the mask
-	 * from the number of bits we can fit.
-	 */
-	hash_size = 1UL << hash_bits;
-
-	/* And initialize the newly allocated array */
-	d = dcookie_hashtable;
-	i = hash_size;
-	do {
-		INIT_LIST_HEAD(d);
-		d++;
-		i--;
-	} while (i);
-
-out:
-	return err;
-out_kmem:
-	kmem_cache_destroy(dcookie_cache);
-	goto out;
-}
-
-
-static void free_dcookie(struct dcookie_struct * dcs)
-{
-	struct dentry *d = dcs->path.dentry;
-
-	spin_lock(&d->d_lock);
-	d->d_flags &= ~DCACHE_COOKIE;
-	spin_unlock(&d->d_lock);
-
-	path_put(&dcs->path);
-	kmem_cache_free(dcookie_cache, dcs);
-}
-
-
-static void dcookie_exit(void)
-{
-	struct list_head * list;
-	struct list_head * pos;
-	struct list_head * pos2;
-	struct dcookie_struct * dcs;
-	size_t i;
-
-	for (i = 0; i < hash_size; ++i) {
-		list = dcookie_hashtable + i;
-		list_for_each_safe(pos, pos2, list) {
-			dcs = list_entry(pos, struct dcookie_struct, hash_list);
-			list_del(&dcs->hash_list);
-			free_dcookie(dcs);
-		}
-	}
-
-	kfree(dcookie_hashtable);
-	kmem_cache_destroy(dcookie_cache);
-}
-
-
-struct dcookie_user {
-	struct list_head next;
-};
- 
-struct dcookie_user * dcookie_register(void)
-{
-	struct dcookie_user * user;
-
-	mutex_lock(&dcookie_mutex);
-
-	user = kmalloc(sizeof(struct dcookie_user), GFP_KERNEL);
-	if (!user)
-		goto out;
-
-	if (!is_live() && dcookie_init())
-		goto out_free;
-
-	list_add(&user->next, &dcookie_users);
-
-out:
-	mutex_unlock(&dcookie_mutex);
-	return user;
-out_free:
-	kfree(user);
-	user = NULL;
-	goto out;
-}
-
-
-void dcookie_unregister(struct dcookie_user * user)
-{
-	mutex_lock(&dcookie_mutex);
-
-	list_del(&user->next);
-	kfree(user);
-
-	if (!is_live())
-		dcookie_exit();
-
-	mutex_unlock(&dcookie_mutex);
-}
-
-EXPORT_SYMBOL_GPL(dcookie_register);
-EXPORT_SYMBOL_GPL(dcookie_unregister);
-EXPORT_SYMBOL_GPL(get_dcookie);
diff --git a/include/linux/dcookies.h b/include/linux/dcookies.h
deleted file mode 100644
index ddfdac20cad0..000000000000
--- a/include/linux/dcookies.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * dcookies.h
- *
- * Persistent cookie-path mappings
- *
- * Copyright 2002 John Levon <levon@movementarian.org>
- */
-
-#ifndef DCOOKIES_H
-#define DCOOKIES_H
- 
-
-#ifdef CONFIG_PROFILING
- 
-#include <linux/dcache.h>
-#include <linux/types.h>
- 
-struct dcookie_user;
-struct path;
- 
-/**
- * dcookie_register - register a user of dcookies
- *
- * Register as a dcookie user. Returns %NULL on failure.
- */
-struct dcookie_user * dcookie_register(void);
-
-/**
- * dcookie_unregister - unregister a user of dcookies
- *
- * Unregister as a dcookie user. This may invalidate
- * any dcookie values returned from get_dcookie().
- */
-void dcookie_unregister(struct dcookie_user * user);
-  
-/**
- * get_dcookie - acquire a dcookie
- *
- * Convert the given dentry/vfsmount pair into
- * a cookie value.
- *
- * Returns -EINVAL if no living task has registered as a
- * dcookie user.
- *
- * Returns 0 on success, with *cookie filled in
- */
-int get_dcookie(const struct path *path, unsigned long *cookie);
-
-#else
-
-static inline struct dcookie_user * dcookie_register(void)
-{
-	return NULL;
-}
-
-static inline void dcookie_unregister(struct dcookie_user * user)
-{
-	return;
-}
-
-static inline int get_dcookie(const struct path *path, unsigned long *cookie)
-{
-	return -ENOSYS;
-}
-
-#endif /* CONFIG_PROFILING */
-
-#endif /* DCOOKIES_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 51f00fe20e4d..6928d23c46ea 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -24,7 +24,6 @@
 #include <linux/times.h>
 #include <linux/posix-timers.h>
 #include <linux/security.h>
-#include <linux/dcookies.h>
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
-- 
cgit v1.2.3


From 1d3f9bb1be8556bce237934ae82b3cdeda3242fd Mon Sep 17 00:00:00 2001
From: dingsenjie <dingsenjie@yulong.com>
Date: Wed, 27 Jan 2021 10:28:01 +0800
Subject: linux/qed: fix spelling typo in qed_chain.h

allocted -> allocated

Signed-off-by: dingsenjie <dingsenjie@yulong.com>
Link: https://lore.kernel.org/r/20210127022801.8028-1-dingsenjie@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/qed/qed_chain.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h
index 4d58dc8943f0..e339b48de32d 100644
--- a/include/linux/qed/qed_chain.h
+++ b/include/linux/qed/qed_chain.h
@@ -470,7 +470,7 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain)
 /**
  * @brief qed_chain_reset - Resets the chain to its start state
  *
- * @param p_chain pointer to a previously allocted chain
+ * @param p_chain pointer to a previously allocated chain
  */
 static inline void qed_chain_reset(struct qed_chain *p_chain)
 {
-- 
cgit v1.2.3


From ae8eba8b5d723a4ca543024b6e51f4d0f4fb6b6b Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 27 Jan 2021 23:53:43 +0000
Subject: tlb: mmu_gather: Remove unused start/end arguments from
 tlb_finish_mmu()

Since commit 7a30df49f63a ("mm: mmu_gather: remove __tlb_reset_range()
for force flush"), the 'start' and 'end' arguments to tlb_finish_mmu()
are no longer used, since we flush the whole mm in case of a nested
invalidation.

Remove the unused arguments and update all callers.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Yu Zhao <yuzhao@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lkml.kernel.org/r/20210127235347.1402-3-will@kernel.org
---
 arch/ia64/include/asm/tlb.h | 2 +-
 arch/x86/kernel/ldt.c       | 2 +-
 fs/exec.c                   | 2 +-
 include/linux/mm_types.h    | 3 +--
 mm/hugetlb.c                | 2 +-
 mm/madvise.c                | 6 +++---
 mm/memory.c                 | 4 ++--
 mm/mmap.c                   | 4 ++--
 mm/mmu_gather.c             | 5 +----
 mm/oom_kill.c               | 4 ++--
 10 files changed, 15 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index 8d9da6f08a62..7059eb2e867a 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -36,7 +36,7 @@
  *	    tlb_end_vma(tlb, vma);
  *	  }
  *	}
- *	tlb_finish_mmu(tlb, start, end);	// finish unmap for address space MM
+ *	tlb_finish_mmu(tlb);				// finish unmap for address space MM
  */
 #include <linux/mm.h>
 #include <linux/pagemap.h>
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b8aee71840ae..0d4e1253c9c9 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -400,7 +400,7 @@ static void free_ldt_pgtables(struct mm_struct *mm)
 
 	tlb_gather_mmu(&tlb, mm, start, end);
 	free_pgd_range(&tlb, start, end, start, end);
-	tlb_finish_mmu(&tlb, start, end);
+	tlb_finish_mmu(&tlb);
 #endif
 }
 
diff --git a/fs/exec.c b/fs/exec.c
index 5d4d52039105..69d89a0c35e9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -725,7 +725,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		free_pgd_range(&tlb, old_start, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
 	}
-	tlb_finish_mmu(&tlb, old_start, old_end);
+	tlb_finish_mmu(&tlb);
 
 	/*
 	 * Shrink the vma to just the new range.  Always succeeds.
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 07d9acb5b19c..1fe6a51298a6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -590,8 +590,7 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 				unsigned long start, unsigned long end);
-extern void tlb_finish_mmu(struct mmu_gather *tlb,
-				unsigned long start, unsigned long end);
+extern void tlb_finish_mmu(struct mmu_gather *tlb);
 
 static inline void init_tlb_flush_pending(struct mm_struct *mm)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 18f6ee317900..33db4fa62c7b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3985,7 +3985,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
 	tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
-	tlb_finish_mmu(&tlb, tlb_start, tlb_end);
+	tlb_finish_mmu(&tlb);
 }
 
 /*
diff --git a/mm/madvise.c b/mm/madvise.c
index 6a660858784b..1b68520ea3f4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -508,7 +508,7 @@ static long madvise_cold(struct vm_area_struct *vma,
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
 	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
-	tlb_finish_mmu(&tlb, start_addr, end_addr);
+	tlb_finish_mmu(&tlb);
 
 	return 0;
 }
@@ -560,7 +560,7 @@ static long madvise_pageout(struct vm_area_struct *vma,
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
 	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
-	tlb_finish_mmu(&tlb, start_addr, end_addr);
+	tlb_finish_mmu(&tlb);
 
 	return 0;
 }
@@ -732,7 +732,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 			&madvise_free_walk_ops, &tlb);
 	tlb_end_vma(&tlb, vma);
 	mmu_notifier_invalidate_range_end(&range);
-	tlb_finish_mmu(&tlb, range.start, range.end);
+	tlb_finish_mmu(&tlb);
 
 	return 0;
 }
diff --git a/mm/memory.c b/mm/memory.c
index feff48e1465a..7bd3f122bd10 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1540,7 +1540,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
 		unmap_single_vma(&tlb, vma, start, range.end, NULL);
 	mmu_notifier_invalidate_range_end(&range);
-	tlb_finish_mmu(&tlb, start, range.end);
+	tlb_finish_mmu(&tlb);
 }
 
 /**
@@ -1566,7 +1566,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
 	mmu_notifier_invalidate_range_start(&range);
 	unmap_single_vma(&tlb, vma, address, range.end, details);
 	mmu_notifier_invalidate_range_end(&range);
-	tlb_finish_mmu(&tlb, address, range.end);
+	tlb_finish_mmu(&tlb);
 }
 
 /**
diff --git a/mm/mmap.c b/mm/mmap.c
index dc7206032387..7a9f493a4b83 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2676,7 +2676,7 @@ static void unmap_region(struct mm_struct *mm,
 	unmap_vmas(&tlb, vma, start, end);
 	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
 				 next ? next->vm_start : USER_PGTABLES_CEILING);
-	tlb_finish_mmu(&tlb, start, end);
+	tlb_finish_mmu(&tlb);
 }
 
 /*
@@ -3219,7 +3219,7 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
-	tlb_finish_mmu(&tlb, 0, -1);
+	tlb_finish_mmu(&tlb);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 03c33c93a582..b0be5a7aa08f 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -290,14 +290,11 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 /**
  * tlb_finish_mmu - finish an mmu_gather structure
  * @tlb: the mmu_gather structure to finish
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
  *
  * Called at the end of the shootdown operation to free up any resources that
  * were required.
  */
-void tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end)
+void tlb_finish_mmu(struct mmu_gather *tlb)
 {
 	/*
 	 * If there are parallel threads are doing PTE changes on same range
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 04b19b7b5435..757e557211fb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -548,13 +548,13 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 						vma->vm_end);
 			tlb_gather_mmu(&tlb, mm, range.start, range.end);
 			if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
-				tlb_finish_mmu(&tlb, range.start, range.end);
+				tlb_finish_mmu(&tlb);
 				ret = false;
 				continue;
 			}
 			unmap_page_range(&tlb, vma, range.start, range.end, NULL);
 			mmu_notifier_invalidate_range_end(&range);
-			tlb_finish_mmu(&tlb, range.start, range.end);
+			tlb_finish_mmu(&tlb);
 		}
 	}
 
-- 
cgit v1.2.3


From d8b450530b90f8845ab962af18b8a10ed77fc889 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 27 Jan 2021 23:53:44 +0000
Subject: tlb: mmu_gather: Introduce tlb_gather_mmu_fullmm()

Passing the range '0, -1' to tlb_gather_mmu() sets the 'fullmm' flag,
which indicates that the mm_struct being operated on is going away. In
this case, some architectures (such as arm64) can elide TLB invalidation
by ensuring that the TLB tag (ASID) associated with this mm is not
immediately reclaimed. Although this behaviour is documented in
asm-generic/tlb.h, it's subtle and easily missed.

Introduce tlb_gather_mmu_fullmm() to make it clearer that this is for the
entire mm and WARN() if tlb_gather_mmu() is called with the 'fullmm'
address range.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Yu Zhao <yuzhao@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lkml.kernel.org/r/20210127235347.1402-4-will@kernel.org
---
 include/asm-generic/tlb.h |  6 ++++--
 include/linux/mm_types.h  |  1 +
 mm/mmap.c                 |  2 +-
 mm/mmu_gather.c           | 16 ++++++++++++++--
 4 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 6661ee1cff47..2c68a545ffa7 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -46,7 +46,9 @@
  *
  * The mmu_gather API consists of:
  *
- *  - tlb_gather_mmu() / tlb_finish_mmu(); start and finish a mmu_gather
+ *  - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu()
+ *
+ *    start and finish a mmu_gather
  *
  *    Finish in particular will issue a (final) TLB invalidate and free
  *    all (remaining) queued pages.
@@ -91,7 +93,7 @@
  *
  *  - mmu_gather::fullmm
  *
- *    A flag set by tlb_gather_mmu() to indicate we're going to free
+ *    A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free
  *    the entire mm; this allows a number of optimizations.
  *
  *    - We can ignore tlb_{start,end}_vma(); because we don't
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1fe6a51298a6..e49868bc12a7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -590,6 +590,7 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 				unsigned long start, unsigned long end);
+extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_finish_mmu(struct mmu_gather *tlb);
 
 static inline void init_tlb_flush_pending(struct mm_struct *mm)
diff --git a/mm/mmap.c b/mm/mmap.c
index 7a9f493a4b83..4eac7c63edbe 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3214,7 +3214,7 @@ void exit_mmap(struct mm_struct *mm)
 
 	lru_add_drain();
 	flush_cache_mm(mm);
-	tlb_gather_mmu(&tlb, mm, 0, -1);
+	tlb_gather_mmu_fullmm(&tlb, mm);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	unmap_vmas(&tlb, vma, 0, -1);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index b0be5a7aa08f..5f5e45d9eb50 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -261,8 +261,8 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
  * respectively when @mm is without users and we're going to destroy
  * the full address space (exit/execve).
  */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
-			unsigned long start, unsigned long end)
+static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+			     unsigned long start, unsigned long end)
 {
 	tlb->mm = mm;
 
@@ -287,6 +287,18 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 	inc_tlb_flush_pending(tlb->mm);
 }
 
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+		    unsigned long start, unsigned long end)
+{
+	WARN_ON(!(start | (end + 1))); /* Use _fullmm() instead */
+	__tlb_gather_mmu(tlb, mm, start, end);
+}
+
+void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
+{
+	__tlb_gather_mmu(tlb, mm, 0, -1);
+}
+
 /**
  * tlb_finish_mmu - finish an mmu_gather structure
  * @tlb: the mmu_gather structure to finish
-- 
cgit v1.2.3


From a72afd873089c697053e9daa85ff343b3140d2e7 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 27 Jan 2021 23:53:45 +0000
Subject: tlb: mmu_gather: Remove start/end arguments from tlb_gather_mmu()

The 'start' and 'end' arguments to tlb_gather_mmu() are no longer
needed now that there is a separate function for 'fullmm' flushing.

Remove the unused arguments and update all callers.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Yu Zhao <yuzhao@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/CAHk-=wjQWa14_4UpfDf=fiineNP+RH74kZeDMo_f1D35xNzq9w@mail.gmail.com
---
 arch/ia64/include/asm/tlb.h |  2 +-
 arch/x86/kernel/ldt.c       |  2 +-
 fs/exec.c                   |  2 +-
 include/linux/mm_types.h    |  3 +--
 mm/hugetlb.c                | 16 +---------------
 mm/madvise.c                |  6 +++---
 mm/memory.c                 |  4 ++--
 mm/mmap.c                   |  2 +-
 mm/mmu_gather.c             | 22 ++++++++--------------
 mm/oom_kill.c               |  2 +-
 10 files changed, 20 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index 7059eb2e867a..a15fe0809aae 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -23,7 +23,7 @@
  * unmapping a portion of the virtual address space, these hooks are called according to
  * the following template:
  *
- *	tlb <- tlb_gather_mmu(mm, start, end);		// start unmap for address space MM
+ *	tlb <- tlb_gather_mmu(mm);			// start unmap for address space MM
  *	{
  *	  for each vma that needs a shootdown do {
  *	    tlb_start_vma(tlb, vma);
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0d4e1253c9c9..7ad9834e0d95 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -398,7 +398,7 @@ static void free_ldt_pgtables(struct mm_struct *mm)
 	if (!boot_cpu_has(X86_FEATURE_PTI))
 		return;
 
-	tlb_gather_mmu(&tlb, mm, start, end);
+	tlb_gather_mmu(&tlb, mm);
 	free_pgd_range(&tlb, start, end, start, end);
 	tlb_finish_mmu(&tlb);
 #endif
diff --git a/fs/exec.c b/fs/exec.c
index 69d89a0c35e9..5a853f03c233 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -708,7 +708,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		return -ENOMEM;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, old_start, old_end);
+	tlb_gather_mmu(&tlb, mm);
 	if (new_end > old_start) {
 		/*
 		 * when the old and new regions overlap clear from new_end.
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e49868bc12a7..0974ad501a47 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -588,8 +588,7 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 }
 
 struct mmu_gather;
-extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
-				unsigned long start, unsigned long end);
+extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_finish_mmu(struct mmu_gather *tlb);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 33db4fa62c7b..89635f407232 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3967,23 +3967,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page)
 {
-	struct mm_struct *mm;
 	struct mmu_gather tlb;
-	unsigned long tlb_start = start;
-	unsigned long tlb_end = end;
 
-	/*
-	 * If shared PMDs were possibly used within this vma range, adjust
-	 * start/end for worst case tlb flushing.
-	 * Note that we can not be sure if PMDs are shared until we try to
-	 * unmap pages.  However, we want to make sure TLB flushing covers
-	 * the largest possible range.
-	 */
-	adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
-
-	mm = vma->vm_mm;
-
-	tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
+	tlb_gather_mmu(&tlb, vma->vm_mm);
 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
 	tlb_finish_mmu(&tlb);
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 1b68520ea3f4..0938fd3ad228 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -506,7 +506,7 @@ static long madvise_cold(struct vm_area_struct *vma,
 		return -EINVAL;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	tlb_gather_mmu(&tlb, mm);
 	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
 	tlb_finish_mmu(&tlb);
 
@@ -558,7 +558,7 @@ static long madvise_pageout(struct vm_area_struct *vma,
 		return 0;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	tlb_gather_mmu(&tlb, mm);
 	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
 	tlb_finish_mmu(&tlb);
 
@@ -723,7 +723,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 				range.start, range.end);
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, range.start, range.end);
+	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
 
 	mmu_notifier_invalidate_range_start(&range);
diff --git a/mm/memory.c b/mm/memory.c
index 7bd3f122bd10..9e8576a83147 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1534,7 +1534,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 	lru_add_drain();
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
 				start, start + size);
-	tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
+	tlb_gather_mmu(&tlb, vma->vm_mm);
 	update_hiwater_rss(vma->vm_mm);
 	mmu_notifier_invalidate_range_start(&range);
 	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
@@ -1561,7 +1561,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
 	lru_add_drain();
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
 				address, address + size);
-	tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
+	tlb_gather_mmu(&tlb, vma->vm_mm);
 	update_hiwater_rss(vma->vm_mm);
 	mmu_notifier_invalidate_range_start(&range);
 	unmap_single_vma(&tlb, vma, address, range.end, details);
diff --git a/mm/mmap.c b/mm/mmap.c
index 4eac7c63edbe..90673febce6a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2671,7 +2671,7 @@ static void unmap_region(struct mm_struct *mm,
 	struct mmu_gather tlb;
 
 	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, start, end);
+	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, vma, start, end);
 	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 5f5e45d9eb50..0dc7149b0c61 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -253,21 +253,17 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
  * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
  * @tlb: the mmu_gather structure to initialize
  * @mm: the mm_struct of the target address space
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
+ * @fullmm: @mm is without users and we're going to destroy the full address
+ *	    space (exit/execve)
  *
  * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @start and @end are set to 0 and -1
- * respectively when @mm is without users and we're going to destroy
- * the full address space (exit/execve).
+ * tear-down from @mm.
  */
 static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
-			     unsigned long start, unsigned long end)
+			     bool fullmm)
 {
 	tlb->mm = mm;
-
-	/* Is it from 0 to ~0? */
-	tlb->fullmm     = !(start | (end+1));
+	tlb->fullmm = fullmm;
 
 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 	tlb->need_flush_all = 0;
@@ -287,16 +283,14 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 	inc_tlb_flush_pending(tlb->mm);
 }
 
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
-		    unsigned long start, unsigned long end)
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
 {
-	WARN_ON(!(start | (end + 1))); /* Use _fullmm() instead */
-	__tlb_gather_mmu(tlb, mm, start, end);
+	__tlb_gather_mmu(tlb, mm, false);
 }
 
 void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
 {
-	__tlb_gather_mmu(tlb, mm, 0, -1);
+	__tlb_gather_mmu(tlb, mm, true);
 }
 
 /**
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 757e557211fb..c9a33ffe38b7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -546,7 +546,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 			mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
 						vma, mm, vma->vm_start,
 						vma->vm_end);
-			tlb_gather_mmu(&tlb, mm, range.start, range.end);
+			tlb_gather_mmu(&tlb, mm);
 			if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
 				tlb_finish_mmu(&tlb);
 				ret = false;
-- 
cgit v1.2.3


From 97c753e62e6c31a404183898d950d8c08d752dbd Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 28 Jan 2021 00:37:51 +0900
Subject: tracing/kprobe: Fix to support kretprobe events on unloaded modules

Fix kprobe_on_func_entry() returns error code instead of false so that
register_kretprobe() can return an appropriate error code.

append_trace_kprobe() expects the kprobe registration returns -ENOENT
when the target symbol is not found, and it checks whether the target
module is unloaded or not. If the target module doesn't exist, it
defers to probe the target symbol until the module is loaded.

However, since register_kretprobe() returns -EINVAL instead of -ENOENT
in that case, it always fail on putting the kretprobe event on unloaded
modules. e.g.

Kprobe event:
/sys/kernel/debug/tracing # echo p xfs:xfs_end_io >> kprobe_events
[   16.515574] trace_kprobe: This probe might be able to register after target module is loaded. Continue.

Kretprobe event: (p -> r)
/sys/kernel/debug/tracing # echo r xfs:xfs_end_io >> kprobe_events
sh: write error: Invalid argument
/sys/kernel/debug/tracing # cat error_log
[   41.122514] trace_kprobe: error: Failed to register probe event
  Command: r xfs:xfs_end_io
             ^

To fix this bug, change kprobe_on_func_entry() to detect symbol lookup
failure and return -ENOENT in that case. Otherwise it returns -EINVAL
or 0 (succeeded, given address is on the entry).

Link: https://lkml.kernel.org/r/161176187132.1067016.8118042342894378981.stgit@devnote2

Cc: stable@vger.kernel.org
Fixes: 59158ec4aef7 ("tracing/kprobes: Check the probe on unloaded module correctly")
Reported-by: Jianlin Lv <Jianlin.Lv@arm.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/kprobes.h     |  2 +-
 kernel/kprobes.c            | 34 +++++++++++++++++++++++++---------
 kernel/trace/trace_kprobe.c | 10 ++++++----
 3 files changed, 32 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index b3a36b0cfc81..1883a4a9f16a 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -266,7 +266,7 @@ extern void kprobes_inc_nmissed_count(struct kprobe *p);
 extern bool arch_within_kprobe_blacklist(unsigned long addr);
 extern int arch_populate_kprobe_blacklist(void);
 extern bool arch_kprobe_on_func_entry(unsigned long offset);
-extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
+extern int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
 extern int kprobe_add_ksym_blacklist(unsigned long entry);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f7fb5d135930..1a5bc321e0a5 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1954,29 +1954,45 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset)
 	return !offset;
 }
 
-bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+/**
+ * kprobe_on_func_entry() -- check whether given address is function entry
+ * @addr: Target address
+ * @sym:  Target symbol name
+ * @offset: The offset from the symbol or the address
+ *
+ * This checks whether the given @addr+@offset or @sym+@offset is on the
+ * function entry address or not.
+ * This returns 0 if it is the function entry, or -EINVAL if it is not.
+ * And also it returns -ENOENT if it fails the symbol or address lookup.
+ * Caller must pass @addr or @sym (either one must be NULL), or this
+ * returns -EINVAL.
+ */
+int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
 {
 	kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
 
 	if (IS_ERR(kp_addr))
-		return false;
+		return PTR_ERR(kp_addr);
 
-	if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
-						!arch_kprobe_on_func_entry(offset))
-		return false;
+	if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset))
+		return -ENOENT;
 
-	return true;
+	if (!arch_kprobe_on_func_entry(offset))
+		return -EINVAL;
+
+	return 0;
 }
 
 int register_kretprobe(struct kretprobe *rp)
 {
-	int ret = 0;
+	int ret;
 	struct kretprobe_instance *inst;
 	int i;
 	void *addr;
 
-	if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
-		return -EINVAL;
+	ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset);
+	if (ret)
+		return ret;
 
 	if (kretprobe_blacklist_size) {
 		addr = kprobe_addr(&rp->kp);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index e6fba1798771..56c7fbff7bd7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -221,9 +221,9 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call)
 {
 	struct trace_kprobe *tk = trace_kprobe_primary_from_call(call);
 
-	return tk ? kprobe_on_func_entry(tk->rp.kp.addr,
+	return tk ? (kprobe_on_func_entry(tk->rp.kp.addr,
 			tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
-			tk->rp.kp.addr ? 0 : tk->rp.kp.offset) : false;
+			tk->rp.kp.addr ? 0 : tk->rp.kp.offset) == 0) : false;
 }
 
 bool trace_kprobe_error_injectable(struct trace_event_call *call)
@@ -828,9 +828,11 @@ static int trace_kprobe_create(int argc, const char *argv[])
 		}
 		if (is_return)
 			flags |= TPARG_FL_RETURN;
-		if (kprobe_on_func_entry(NULL, symbol, offset))
+		ret = kprobe_on_func_entry(NULL, symbol, offset);
+		if (ret == 0)
 			flags |= TPARG_FL_FENTRY;
-		if (offset && is_return && !(flags & TPARG_FL_FENTRY)) {
+		/* Defer the ENOENT case until register kprobe */
+		if (ret == -EINVAL && is_return) {
 			trace_probe_log_err(0, BAD_RETPROBE);
 			goto parse_error;
 		}
-- 
cgit v1.2.3


From 9c7caf28068421c9e0d1faea437e35e6b8983ac6 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 29 Jan 2021 02:59:59 +0200
Subject: net: dsa: tag_8021q: add helpers to deduce whether a VLAN ID is RX or
 TX VLAN

The sja1105 implementation can be blind about this, but the felix driver
doesn't do exactly what it's being told, so it needs to know whether it
is a TX or an RX VLAN, so it can install the appropriate type of TCAM
rule.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dsa/8021q.h | 14 ++++++++++++++
 net/dsa/tag_8021q.c       | 15 +++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
index 88cd72dfa4e0..b12b05f1c8b4 100644
--- a/include/linux/dsa/8021q.h
+++ b/include/linux/dsa/8021q.h
@@ -64,6 +64,10 @@ int dsa_8021q_rx_source_port(u16 vid);
 
 u16 dsa_8021q_rx_subvlan(u16 vid);
 
+bool vid_is_dsa_8021q_rxvlan(u16 vid);
+
+bool vid_is_dsa_8021q_txvlan(u16 vid);
+
 bool vid_is_dsa_8021q(u16 vid);
 
 #else
@@ -123,6 +127,16 @@ u16 dsa_8021q_rx_subvlan(u16 vid)
 	return 0;
 }
 
+bool vid_is_dsa_8021q_rxvlan(u16 vid)
+{
+	return false;
+}
+
+bool vid_is_dsa_8021q_txvlan(u16 vid)
+{
+	return false;
+}
+
 bool vid_is_dsa_8021q(u16 vid)
 {
 	return false;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 8e3e8a5b8559..008c1ec6e20c 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -133,10 +133,21 @@ u16 dsa_8021q_rx_subvlan(u16 vid)
 }
 EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan);
 
+bool vid_is_dsa_8021q_rxvlan(u16 vid)
+{
+	return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX;
+}
+EXPORT_SYMBOL_GPL(vid_is_dsa_8021q_rxvlan);
+
+bool vid_is_dsa_8021q_txvlan(u16 vid)
+{
+	return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX;
+}
+EXPORT_SYMBOL_GPL(vid_is_dsa_8021q_txvlan);
+
 bool vid_is_dsa_8021q(u16 vid)
 {
-	return ((vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX ||
-		(vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX);
+	return vid_is_dsa_8021q_rxvlan(vid) || vid_is_dsa_8021q_txvlan(vid);
 }
 EXPORT_SYMBOL_GPL(vid_is_dsa_8021q);
 
-- 
cgit v1.2.3


From de5774d192ba15539191ed8b0c79f3d52464b8e3 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Tue, 26 Jan 2021 12:45:39 +0000
Subject: clk: imx: Move 'imx6sl_set_wait_clk()'s prototype out to accessible
 header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes the following W=1 kernel build warning(s):

 drivers/clk/imx/clk-imx6sl.c:156:6: warning: no previous prototype for ‘imx6sl_set_wait_clk’ [-Wmissing-prototypes]

Cc: Russell King <linux@armlinux.org.uk>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Sascha Hauer <s.hauer@pengutronix.de>
Cc: Pengutronix Kernel Team <kernel@pengutronix.de>
Cc: Fabio Estevam <festevam@gmail.com>
Cc: NXP Linux Team <linux-imx@nxp.com>
Cc: Ahmad Fatoum <a.fatoum@pengutronix.de>
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Shawn Guo <shawnguo@kernel.org>
---
 arch/arm/mach-imx/common.h         |  1 -
 arch/arm/mach-imx/cpuidle-imx6sl.c |  1 +
 arch/arm/mach-imx/pm-imx6.c        |  1 +
 drivers/clk/imx/clk-imx6sl.c       |  1 +
 include/linux/clk/imx.h            | 15 +++++++++++++++
 5 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/clk/imx.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-imx/common.h b/arch/arm/mach-imx/common.h
index 2d76e2c6c99e..2b004cc4f95e 100644
--- a/arch/arm/mach-imx/common.h
+++ b/arch/arm/mach-imx/common.h
@@ -85,7 +85,6 @@ void imx_anatop_pre_suspend(void);
 void imx_anatop_post_resume(void);
 int imx6_set_lpm(enum mxc_cpu_pwr_mode mode);
 void imx6_set_int_mem_clk_lpm(bool enable);
-void imx6sl_set_wait_clk(bool enter);
 int imx_mmdc_get_ddr_type(void);
 int imx7ulp_set_lpm(enum ulp_cpu_pwr_mode mode);
 
diff --git a/arch/arm/mach-imx/cpuidle-imx6sl.c b/arch/arm/mach-imx/cpuidle-imx6sl.c
index 4521e5352bf6..b86ffbeb28e4 100644
--- a/arch/arm/mach-imx/cpuidle-imx6sl.c
+++ b/arch/arm/mach-imx/cpuidle-imx6sl.c
@@ -3,6 +3,7 @@
  * Copyright (C) 2014 Freescale Semiconductor, Inc.
  */
 
+#include <linux/clk/imx.h>
 #include <linux/cpuidle.h>
 #include <linux/module.h>
 #include <asm/cpuidle.h>
diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c
index 40c74b4c4d73..9244437cb1b9 100644
--- a/arch/arm/mach-imx/pm-imx6.c
+++ b/arch/arm/mach-imx/pm-imx6.c
@@ -4,6 +4,7 @@
  * Copyright 2011 Linaro Ltd.
  */
 
+#include <linux/clk/imx.h>
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/io.h>
diff --git a/drivers/clk/imx/clk-imx6sl.c b/drivers/clk/imx/clk-imx6sl.c
index 2f9361946a0e..29eab05c9068 100644
--- a/drivers/clk/imx/clk-imx6sl.c
+++ b/drivers/clk/imx/clk-imx6sl.c
@@ -6,6 +6,7 @@
 #include <linux/bits.h>
 #include <linux/clk.h>
 #include <linux/clkdev.h>
+#include <linux/clk/imx.h>
 #include <linux/err.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
diff --git a/include/linux/clk/imx.h b/include/linux/clk/imx.h
new file mode 100644
index 000000000000..75a0d9696552
--- /dev/null
+++ b/include/linux/clk/imx.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Freescale Semiconductor, Inc.
+ *
+ * Author: Lee Jones <lee.jones@linaro.org>
+ */
+
+#ifndef __LINUX_CLK_IMX_H
+#define __LINUX_CLK_IMX_H
+
+#include <linux/types.h>
+
+void imx6sl_set_wait_clk(bool enter);
+
+#endif
-- 
cgit v1.2.3


From 3cc55f4434b421d37300aa9a167ace7d60b45ccf Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 29 Jan 2021 14:26:29 -0500
Subject: nfs: use change attribute for NFS re-exports

When exporting NFS, we may as well use the real change attribute
returned by the original server instead of faking up a change attribute
from the ctime.

Note we can't do that by setting I_VERSION--that would also turn on the
logic in iversion.h which treats the lower bit specially, and that
doesn't make sense for NFS.

So instead we define a new export operation for filesystems like NFS
that want to manage the change attribute themselves.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs/export.c          | 18 ++++++++++++++++++
 fs/nfsd/nfsfh.h          |  5 ++++-
 include/linux/exportfs.h |  1 +
 3 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index 7412bb164fa7..f2b34cfe286c 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -167,10 +167,28 @@ out:
 	return parent;
 }
 
+static u64 nfs_fetch_iversion(struct inode *inode)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+
+	/* Is this the right call?: */
+	nfs_revalidate_inode(server, inode);
+	/*
+	 * Also, note we're ignoring any returned error.  That seems to be
+	 * the practice for cache consistency information elsewhere in
+	 * the server, but I'm not sure why.
+	 */
+	if (server->nfs_client->rpc_ops->version >= 4)
+		return inode_peek_iversion_raw(inode);
+	else
+		return time_to_chattr(&inode->i_ctime);
+}
+
 const struct export_operations nfs_export_ops = {
 	.encode_fh = nfs_encode_fh,
 	.fh_to_dentry = nfs_fh_to_dentry,
 	.get_parent = nfs_get_parent,
+	.fetch_iversion = nfs_fetch_iversion,
 	.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
 		EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
 		EXPORT_OP_NOATOMIC_ATTR,
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cb20c2cd3469..f58933519f38 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -12,6 +12,7 @@
 #include <linux/sunrpc/svc.h>
 #include <uapi/linux/nfsd/nfsfh.h>
 #include <linux/iversion.h>
+#include <linux/exportfs.h>
 
 static inline __u32 ino_t_to_u32(ino_t ino)
 {
@@ -264,7 +265,9 @@ fh_clear_wcc(struct svc_fh *fhp)
 static inline u64 nfsd4_change_attribute(struct kstat *stat,
 					 struct inode *inode)
 {
-	if (IS_I_VERSION(inode)) {
+	if (inode->i_sb->s_export_op->fetch_iversion)
+		return inode->i_sb->s_export_op->fetch_iversion(inode);
+	else if (IS_I_VERSION(inode)) {
 		u64 chattr;
 
 		chattr =  stat->ctime.tv_sec;
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 9f4d4bcbf251..fe848901fcc3 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -213,6 +213,7 @@ struct export_operations {
 			  bool write, u32 *device_generation);
 	int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
 			     int nr_iomaps, struct iattr *iattr);
+	u64 (*fetch_iversion)(struct inode *);
 #define	EXPORT_OP_NOWCC			(0x1) /* don't collect v3 wcc data */
 #define	EXPORT_OP_NOSUBTREECHK		(0x2) /* no subtree checking */
 #define	EXPORT_OP_CLOSE_BEFORE_UNLINK	(0x4) /* close files before unlink */
-- 
cgit v1.2.3


From 207f13b419a60c56fb75baeb3d668de080514354 Mon Sep 17 00:00:00 2001
From: Fabien Parent <fparent@baylibre.com>
Date: Wed, 9 Dec 2020 14:32:37 +0100
Subject: soc: mediatek: pm-domains: Add support for mt8167

Add the needed board data to support mt8167 SoC.

Signed-off-by: Fabien Parent <fparent@baylibre.com>
Reviewed-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Link: https://lore.kernel.org/r/20201209133238.384030-2-fparent@baylibre.com
Signed-off-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 drivers/soc/mediatek/mt8167-pm-domains.h | 86 ++++++++++++++++++++++++++++++++
 drivers/soc/mediatek/mtk-pm-domains.c    |  5 ++
 drivers/soc/mediatek/mtk-pm-domains.h    |  1 +
 include/linux/soc/mediatek/infracfg.h    |  8 +++
 4 files changed, 100 insertions(+)
 create mode 100644 drivers/soc/mediatek/mt8167-pm-domains.h

(limited to 'include/linux')

diff --git a/drivers/soc/mediatek/mt8167-pm-domains.h b/drivers/soc/mediatek/mt8167-pm-domains.h
new file mode 100644
index 000000000000..ad0b8dfa0527
--- /dev/null
+++ b/drivers/soc/mediatek/mt8167-pm-domains.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __SOC_MEDIATEK_MT8167_PM_DOMAINS_H
+#define __SOC_MEDIATEK_MT8167_PM_DOMAINS_H
+
+#include "mtk-pm-domains.h"
+#include <dt-bindings/power/mt8167-power.h>
+
+#define MT8167_PWR_STATUS_MFG_2D	BIT(24)
+#define MT8167_PWR_STATUS_MFG_ASYNC	BIT(25)
+
+/*
+ * MT8167 power domain support
+ */
+
+static const struct scpsys_domain_data scpsys_domain_data_mt8167[] = {
+	[MT8167_POWER_DOMAIN_MM] = {
+		.sta_mask = PWR_STATUS_DISP,
+		.ctl_offs = SPM_DIS_PWR_CON,
+		.sram_pdn_bits = GENMASK(11, 8),
+		.sram_pdn_ack_bits = GENMASK(12, 12),
+		.bp_infracfg = {
+			BUS_PROT_UPDATE_TOPAXI(MT8167_TOP_AXI_PROT_EN_MM_EMI |
+					       MT8167_TOP_AXI_PROT_EN_MCU_MM),
+		},
+		.caps = MTK_SCPD_ACTIVE_WAKEUP,
+	},
+	[MT8167_POWER_DOMAIN_VDEC] = {
+		.sta_mask = PWR_STATUS_VDEC,
+		.ctl_offs = SPM_VDE_PWR_CON,
+		.sram_pdn_bits = GENMASK(8, 8),
+		.sram_pdn_ack_bits = GENMASK(12, 12),
+		.caps = MTK_SCPD_ACTIVE_WAKEUP,
+	},
+	[MT8167_POWER_DOMAIN_ISP] = {
+		.sta_mask = PWR_STATUS_ISP,
+		.ctl_offs = SPM_ISP_PWR_CON,
+		.sram_pdn_bits = GENMASK(11, 8),
+		.sram_pdn_ack_bits = GENMASK(13, 12),
+		.caps = MTK_SCPD_ACTIVE_WAKEUP,
+	},
+	[MT8167_POWER_DOMAIN_MFG_ASYNC] = {
+		.sta_mask = MT8167_PWR_STATUS_MFG_ASYNC,
+		.ctl_offs = SPM_MFG_ASYNC_PWR_CON,
+		.sram_pdn_bits = 0,
+		.sram_pdn_ack_bits = 0,
+		.bp_infracfg = {
+			BUS_PROT_UPDATE_TOPAXI(MT8167_TOP_AXI_PROT_EN_MCU_MFG |
+					       MT8167_TOP_AXI_PROT_EN_MFG_EMI),
+		},
+	},
+	[MT8167_POWER_DOMAIN_MFG_2D] = {
+		.sta_mask = MT8167_PWR_STATUS_MFG_2D,
+		.ctl_offs = SPM_MFG_2D_PWR_CON,
+		.sram_pdn_bits = GENMASK(11, 8),
+		.sram_pdn_ack_bits = GENMASK(15, 12),
+	},
+	[MT8167_POWER_DOMAIN_MFG] = {
+		.sta_mask = PWR_STATUS_MFG,
+		.ctl_offs = SPM_MFG_PWR_CON,
+		.sram_pdn_bits = GENMASK(11, 8),
+		.sram_pdn_ack_bits = GENMASK(15, 12),
+	},
+	[MT8167_POWER_DOMAIN_CONN] = {
+		.sta_mask = PWR_STATUS_CONN,
+		.ctl_offs = SPM_CONN_PWR_CON,
+		.sram_pdn_bits = GENMASK(8, 8),
+		.sram_pdn_ack_bits = 0,
+		.caps = MTK_SCPD_ACTIVE_WAKEUP,
+		.bp_infracfg = {
+			BUS_PROT_UPDATE_TOPAXI(MT8167_TOP_AXI_PROT_EN_CONN_EMI |
+					       MT8167_TOP_AXI_PROT_EN_CONN_MCU |
+					       MT8167_TOP_AXI_PROT_EN_MCU_CONN),
+		},
+	},
+};
+
+static const struct scpsys_soc_data mt8167_scpsys_data = {
+	.domains_data = scpsys_domain_data_mt8167,
+	.num_domains = ARRAY_SIZE(scpsys_domain_data_mt8167),
+	.pwr_sta_offs = SPM_PWR_STATUS,
+	.pwr_sta2nd_offs = SPM_PWR_STATUS_2ND,
+};
+
+#endif /* __SOC_MEDIATEK_MT8167_PM_DOMAINS_H */
+
diff --git a/drivers/soc/mediatek/mtk-pm-domains.c b/drivers/soc/mediatek/mtk-pm-domains.c
index fb70cb3b07b3..2d0d50ff35f0 100644
--- a/drivers/soc/mediatek/mtk-pm-domains.c
+++ b/drivers/soc/mediatek/mtk-pm-domains.c
@@ -15,6 +15,7 @@
 #include <linux/regmap.h>
 #include <linux/soc/mediatek/infracfg.h>
 
+#include "mt8167-pm-domains.h"
 #include "mt8173-pm-domains.h"
 #include "mt8183-pm-domains.h"
 #include "mt8192-pm-domains.h"
@@ -514,6 +515,10 @@ static void scpsys_domain_cleanup(struct scpsys *scpsys)
 }
 
 static const struct of_device_id scpsys_of_match[] = {
+	{
+		.compatible = "mediatek,mt8167-power-controller",
+		.data = &mt8167_scpsys_data,
+	},
 	{
 		.compatible = "mediatek,mt8173-power-controller",
 		.data = &mt8173_scpsys_data,
diff --git a/drivers/soc/mediatek/mtk-pm-domains.h b/drivers/soc/mediatek/mtk-pm-domains.h
index a2f4d8f97e05..88f5835e1648 100644
--- a/drivers/soc/mediatek/mtk-pm-domains.h
+++ b/drivers/soc/mediatek/mtk-pm-domains.h
@@ -14,6 +14,7 @@
 #define SPM_VEN_PWR_CON			0x0230
 #define SPM_ISP_PWR_CON			0x0238
 #define SPM_DIS_PWR_CON			0x023c
+#define SPM_CONN_PWR_CON		0x0280
 #define SPM_VEN2_PWR_CON		0x0298
 #define SPM_AUDIO_PWR_CON		0x029c
 #define SPM_MFG_2D_PWR_CON		0x02c0
diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h
index e7842debc05d..4615a228da51 100644
--- a/include/linux/soc/mediatek/infracfg.h
+++ b/include/linux/soc/mediatek/infracfg.h
@@ -123,6 +123,14 @@
 #define MT8173_TOP_AXI_PROT_EN_MFG_M1		BIT(22)
 #define MT8173_TOP_AXI_PROT_EN_MFG_SNOOP_OUT	BIT(23)
 
+#define MT8167_TOP_AXI_PROT_EN_MM_EMI		BIT(1)
+#define MT8167_TOP_AXI_PROT_EN_MCU_MFG		BIT(2)
+#define MT8167_TOP_AXI_PROT_EN_CONN_EMI		BIT(4)
+#define MT8167_TOP_AXI_PROT_EN_MFG_EMI		BIT(5)
+#define MT8167_TOP_AXI_PROT_EN_CONN_MCU		BIT(8)
+#define MT8167_TOP_AXI_PROT_EN_MCU_CONN		BIT(9)
+#define MT8167_TOP_AXI_PROT_EN_MCU_MM		BIT(11)
+
 #define MT2701_TOP_AXI_PROT_EN_MM_M0		BIT(1)
 #define MT2701_TOP_AXI_PROT_EN_CONN_M		BIT(2)
 #define MT2701_TOP_AXI_PROT_EN_CONN_S		BIT(8)
-- 
cgit v1.2.3


From 1570db1da9f57dfbe5dfa3010233c98947497466 Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Thu, 3 Dec 2020 07:58:55 +0800
Subject: soc: mediatek: cmdq: Remove cmdq_pkt_flush()

rx_callback is a standard mailbox callback mechanism and could
cover the function of proprietary cmdq_task_cb, so it is better
to use the standard one instead of the proprietary one. But
register rx_callback should before mbox_request_channel(),
so remove cmdq_pkt_flush() and let client driver implement
its own synchronous flush.

Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Link: https://lore.kernel.org/r/20201202235856.7652-1-chunkuang.hu@kernel.org
Signed-off-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 drivers/soc/mediatek/mtk-cmdq-helper.c | 32 --------------------------------
 include/linux/soc/mediatek/mtk-cmdq.h  | 12 ------------
 2 files changed, 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c
index 280d3bd9f675..3c8e4212d941 100644
--- a/drivers/soc/mediatek/mtk-cmdq-helper.c
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -463,36 +463,4 @@ int cmdq_pkt_flush_async(struct cmdq_pkt *pkt, cmdq_async_flush_cb cb,
 }
 EXPORT_SYMBOL(cmdq_pkt_flush_async);
 
-struct cmdq_flush_completion {
-	struct completion cmplt;
-	bool err;
-};
-
-static void cmdq_pkt_flush_cb(struct cmdq_cb_data data)
-{
-	struct cmdq_flush_completion *cmplt;
-
-	cmplt = (struct cmdq_flush_completion *)data.data;
-	if (data.sta != CMDQ_CB_NORMAL)
-		cmplt->err = true;
-	else
-		cmplt->err = false;
-	complete(&cmplt->cmplt);
-}
-
-int cmdq_pkt_flush(struct cmdq_pkt *pkt)
-{
-	struct cmdq_flush_completion cmplt;
-	int err;
-
-	init_completion(&cmplt.cmplt);
-	err = cmdq_pkt_flush_async(pkt, cmdq_pkt_flush_cb, &cmplt);
-	if (err < 0)
-		return err;
-	wait_for_completion(&cmplt.cmplt);
-
-	return cmplt.err ? -EFAULT : 0;
-}
-EXPORT_SYMBOL(cmdq_pkt_flush);
-
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index 8e9996610978..ac6b5f3cba95 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -280,16 +280,4 @@ int cmdq_pkt_finalize(struct cmdq_pkt *pkt);
 int cmdq_pkt_flush_async(struct cmdq_pkt *pkt, cmdq_async_flush_cb cb,
 			 void *data);
 
-/**
- * cmdq_pkt_flush() - trigger CMDQ to execute the CMDQ packet
- * @pkt:	the CMDQ packet
- *
- * Return: 0 for success; else the error code is returned
- *
- * Trigger CMDQ to execute the CMDQ packet. Note that this is a
- * synchronous flush function. When the function returned, the recorded
- * commands have been done.
- */
-int cmdq_pkt_flush(struct cmdq_pkt *pkt);
-
 #endif	/* __MTK_CMDQ_H__ */
-- 
cgit v1.2.3


From fb02e3ebfb2da27172da47f79a5481c4fe83d751 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Fri, 29 Jan 2021 20:51:47 +0100
Subject: staging: hikey9xx: spmi driver: convert to regmap

Instead of doing its own SPMI I/O implementation, use the
already-existing regmap one.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/7084885f3007ca5daf0d5bc85d038e26ee82dc0d.1611949675.git.mchehab+huawei@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/hikey9xx/Kconfig                |   2 +
 drivers/staging/hikey9xx/hi6421-spmi-pmic.c     | 115 ++++++++----------------
 drivers/staging/hikey9xx/hi6421v600-regulator.c |  26 +++---
 include/linux/mfd/hi6421-spmi-pmic.h            |   7 +-
 4 files changed, 54 insertions(+), 96 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/hikey9xx/Kconfig b/drivers/staging/hikey9xx/Kconfig
index 2e48ded92a7e..82bb4a22b286 100644
--- a/drivers/staging/hikey9xx/Kconfig
+++ b/drivers/staging/hikey9xx/Kconfig
@@ -29,6 +29,7 @@ config MFD_HI6421_SPMI
 	depends on OF
 	depends on SPMI
 	select MFD_CORE
+	select REGMAP_SPMI
 	help
 	  Add support for HiSilicon Hi6421v600 SPMI PMIC. Hi6421 includes
 	  multi-functions, such as regulators, RTC, codec, Coulomb counter,
@@ -44,6 +45,7 @@ config REGULATOR_HI6421V600
 	tristate "HiSilicon Hi6421v600 PMIC voltage regulator support"
 	depends on MFD_HI6421_SPMI && OF
 	depends on REGULATOR
+	select REGMAP
 	help
 	  This driver provides support for the voltage regulators on
 	  HiSilicon Hi6421v600 PMU / Codec IC.
diff --git a/drivers/staging/hikey9xx/hi6421-spmi-pmic.c b/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
index 69570876f93e..3d612bd46231 100644
--- a/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
+++ b/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
@@ -41,81 +41,22 @@ static const struct mfd_cell hi6421v600_devs[] = {
 	{ .name = "hi6421v600-regulator", },
 };
 
-/*
- * The PMIC register is only 8-bit.
- * Hisilicon SoC use hardware to map PMIC register into SoC mapping.
- * At here, we are accessing SoC register with 32-bit.
- */
-int hi6421_spmi_pmic_read(struct hi6421_spmi_pmic *pmic, int reg)
+static irqreturn_t hi6421_spmi_irq_handler(int irq, void *priv)
 {
-	struct spmi_device *pdev;
-	u8 read_value = 0;
-	u32 ret;
-
-	pdev = to_spmi_device(pmic->dev);
-	if (!pdev) {
-		pr_err("%s: pdev get failed!\n", __func__);
-		return -ENODEV;
-	}
-
-	ret = spmi_ext_register_readl(pdev, reg, &read_value, 1);
-	if (ret) {
-		pr_err("%s: spmi_ext_register_readl failed!\n", __func__);
-		return ret;
-	}
-	return read_value;
-}
-EXPORT_SYMBOL(hi6421_spmi_pmic_read);
-
-int hi6421_spmi_pmic_write(struct hi6421_spmi_pmic *pmic, int reg, u32 val)
-{
-	struct spmi_device *pdev;
-	u32 ret;
-
-	pdev = to_spmi_device(pmic->dev);
-	if (!pdev) {
-		pr_err("%s: pdev get failed!\n", __func__);
-		return -ENODEV;
-	}
-
-	ret = spmi_ext_register_writel(pdev, reg, (unsigned char *)&val, 1);
-	if (ret)
-		pr_err("%s: spmi_ext_register_writel failed!\n", __func__);
-
-	return ret;
-}
-EXPORT_SYMBOL(hi6421_spmi_pmic_write);
-
-int hi6421_spmi_pmic_rmw(struct hi6421_spmi_pmic *pmic, int reg,
-			 u32 mask, u32 bits)
-{
-	unsigned long flags;
-	u32 data;
-	int ret;
-
-	spin_lock_irqsave(&pmic->lock, flags);
-	data = hi6421_spmi_pmic_read(pmic, reg) & ~mask;
-	data |= mask & bits;
-	ret = hi6421_spmi_pmic_write(pmic, reg, data);
-	spin_unlock_irqrestore(&pmic->lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL(hi6421_spmi_pmic_rmw);
-
-static irqreturn_t hi6421_spmi_irq_handler(int irq, void *data)
-{
-	struct hi6421_spmi_pmic *pmic = (struct hi6421_spmi_pmic *)data;
+	struct hi6421_spmi_pmic *pmic = (struct hi6421_spmi_pmic *)priv;
 	unsigned long pending;
+	unsigned int data;
 	int i, offset;
 
 	for (i = 0; i < HISI_IRQ_ARRAY; i++) {
-		pending = hi6421_spmi_pmic_read(pmic, (i + SOC_PMIC_IRQ0_ADDR));
-		pending &= HISI_MASK_FIELD;
-		if (pending != 0)
-			pr_debug("pending[%d]=0x%lx\n\r", i, pending);
+		regmap_read(pmic->map, offset, &data);
+		data &= HISI_MASK_FIELD;
+		if (data != 0)
+			pr_debug("data[%d]=0x%d\n\r", i, data);
+		regmap_write(pmic->map, i + SOC_PMIC_IRQ0_ADDR, data);
 
-		hi6421_spmi_pmic_write(pmic, (i + SOC_PMIC_IRQ0_ADDR), pending);
+		/* for_each_set_bit() macro requires unsigned long */
+		pending = data;
 
 		/* solve powerkey order */
 		if ((i == HISI_IRQ_KEY_NUM) &&
@@ -137,16 +78,18 @@ static irqreturn_t hi6421_spmi_irq_handler(int irq, void *data)
 static void hi6421_spmi_irq_mask(struct irq_data *d)
 {
 	struct hi6421_spmi_pmic *pmic = irq_data_get_irq_chip_data(d);
-	u32 data, offset;
 	unsigned long flags;
+	unsigned int data;
+	u32 offset;
 
 	offset = (irqd_to_hwirq(d) >> 3);
 	offset += SOC_PMIC_IRQ_MASK_0_ADDR;
 
 	spin_lock_irqsave(&pmic->lock, flags);
-	data = hi6421_spmi_pmic_read(pmic, offset);
+
+	regmap_read(pmic->map, offset, &data);
 	data |= (1 << (irqd_to_hwirq(d) & 0x07));
-	hi6421_spmi_pmic_write(pmic, offset, data);
+	regmap_write(pmic->map, offset, data);
 	spin_unlock_irqrestore(&pmic->lock, flags);
 }
 
@@ -160,9 +103,9 @@ static void hi6421_spmi_irq_unmask(struct irq_data *d)
 	offset += SOC_PMIC_IRQ_MASK_0_ADDR;
 
 	spin_lock_irqsave(&pmic->lock, flags);
-	data = hi6421_spmi_pmic_read(pmic, offset);
+	regmap_read(pmic->map, offset, &data);
 	data &= ~(1 << (irqd_to_hwirq(d) & 0x07));
-	hi6421_spmi_pmic_write(pmic, offset, data);
+	regmap_write(pmic->map, offset, data);
 	spin_unlock_irqrestore(&pmic->lock, flags);
 }
 
@@ -194,27 +137,36 @@ static const struct irq_domain_ops hi6421_spmi_domain_ops = {
 
 static void hi6421_spmi_pmic_irq_prc(struct hi6421_spmi_pmic *pmic)
 {
-	int i, pending;
+	int i;
+	unsigned int pending;
 
 	for (i = 0 ; i < HISI_IRQ_ARRAY; i++)
-		hi6421_spmi_pmic_write(pmic, SOC_PMIC_IRQ_MASK_0_ADDR + i,
+		regmap_write(pmic->map, SOC_PMIC_IRQ_MASK_0_ADDR + i,
 				       HISI_MASK_STATE);
 
 	for (i = 0 ; i < HISI_IRQ_ARRAY; i++) {
-		pending = hi6421_spmi_pmic_read(pmic, SOC_PMIC_IRQ0_ADDR + i);
+		regmap_read(pmic->map, SOC_PMIC_IRQ0_ADDR + i, &pending);
 
 		pr_debug("PMU IRQ address value:irq[0x%x] = 0x%x\n",
 			 SOC_PMIC_IRQ0_ADDR + i, pending);
-		hi6421_spmi_pmic_write(pmic, SOC_PMIC_IRQ0_ADDR + i,
-				       HISI_MASK_STATE);
+		regmap_write(pmic->map, SOC_PMIC_IRQ0_ADDR + i,
+			     HISI_MASK_STATE);
 	}
 }
 
+static const struct regmap_config spmi_regmap_config = {
+	.reg_bits		= 16,
+	.val_bits		= 8,
+	.max_register		= 0xffff,
+	.fast_io		= true
+};
+
 static int hi6421_spmi_pmic_probe(struct spmi_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct device_node *np = dev->of_node;
 	struct hi6421_spmi_pmic *pmic;
+	struct regmap *map;
 	unsigned int virq;
 	int ret, i;
 
@@ -222,9 +174,14 @@ static int hi6421_spmi_pmic_probe(struct spmi_device *pdev)
 	if (!pmic)
 		return -ENOMEM;
 
+	map = devm_regmap_init_spmi_ext(pdev, &spmi_regmap_config);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
 	spin_lock_init(&pmic->lock);
 
 	pmic->dev = dev;
+	pmic->map = map;
 
 	pmic->gpio = of_get_gpio(np, 0);
 	if (pmic->gpio < 0)
diff --git a/drivers/staging/hikey9xx/hi6421v600-regulator.c b/drivers/staging/hikey9xx/hi6421v600-regulator.c
index 382a0b21643e..9e319fa11137 100644
--- a/drivers/staging/hikey9xx/hi6421v600-regulator.c
+++ b/drivers/staging/hikey9xx/hi6421v600-regulator.c
@@ -12,6 +12,7 @@
 #include <linux/mfd/hi6421-spmi-pmic.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/regmap.h>
 #include <linux/regulator/driver.h>
 #include <linux/spmi.h>
 
@@ -100,7 +101,7 @@ static int hi6421_spmi_regulator_is_enabled(struct regulator_dev *rdev)
 	struct hi6421_spmi_pmic *pmic = sreg->pmic;
 	u32 reg_val;
 
-	reg_val = hi6421_spmi_pmic_read(pmic, rdev->desc->enable_reg);
+	regmap_read(pmic->map, rdev->desc->enable_reg, &reg_val);
 
 	return ((reg_val & rdev->desc->enable_mask) != 0);
 }
@@ -114,9 +115,9 @@ static int hi6421_spmi_regulator_enable(struct regulator_dev *rdev)
 	/* cannot enable more than one regulator at one time */
 	mutex_lock(&sreg->enable_mutex);
 
-	ret = hi6421_spmi_pmic_rmw(pmic, rdev->desc->enable_reg,
-				   rdev->desc->enable_mask,
-				   rdev->desc->enable_mask);
+	ret = regmap_update_bits(pmic->map, rdev->desc->enable_reg,
+				 rdev->desc->enable_mask,
+			         rdev->desc->enable_mask);
 
 	/* Avoid powering up multiple devices at the same time */
 	usleep_range(rdev->desc->off_on_delay, rdev->desc->off_on_delay + 60);
@@ -131,8 +132,8 @@ static int hi6421_spmi_regulator_disable(struct regulator_dev *rdev)
 	struct hi6421_spmi_reg_info *sreg = rdev_get_drvdata(rdev);
 	struct hi6421_spmi_pmic *pmic = sreg->pmic;
 
-	return hi6421_spmi_pmic_rmw(pmic, rdev->desc->enable_reg,
-				    rdev->desc->enable_mask, 0);
+	return regmap_update_bits(pmic->map, rdev->desc->enable_reg,
+				  rdev->desc->enable_mask, 0);
 }
 
 static int hi6421_spmi_regulator_get_voltage_sel(struct regulator_dev *rdev)
@@ -141,7 +142,7 @@ static int hi6421_spmi_regulator_get_voltage_sel(struct regulator_dev *rdev)
 	struct hi6421_spmi_pmic *pmic = sreg->pmic;
 	u32 reg_val;
 
-	reg_val = hi6421_spmi_pmic_read(pmic, rdev->desc->vsel_reg);
+	regmap_read(pmic->map, rdev->desc->vsel_reg, &reg_val);
 
 	return (reg_val & rdev->desc->vsel_mask) >> (ffs(rdev->desc->vsel_mask) - 1);
 }
@@ -159,8 +160,8 @@ static int hi6421_spmi_regulator_set_voltage_sel(struct regulator_dev *rdev,
 	reg_val = selector << (ffs(rdev->desc->vsel_mask) - 1);
 
 	/* set voltage selector */
-	return hi6421_spmi_pmic_rmw(pmic, rdev->desc->vsel_reg,
-				    rdev->desc->vsel_mask, reg_val);
+	return regmap_update_bits(pmic->map, rdev->desc->vsel_reg,
+				  rdev->desc->vsel_mask, reg_val);
 }
 
 static unsigned int hi6421_spmi_regulator_get_mode(struct regulator_dev *rdev)
@@ -169,7 +170,7 @@ static unsigned int hi6421_spmi_regulator_get_mode(struct regulator_dev *rdev)
 	struct hi6421_spmi_pmic *pmic = sreg->pmic;
 	u32 reg_val;
 
-	reg_val = hi6421_spmi_pmic_read(pmic, rdev->desc->enable_reg);
+	regmap_read(pmic->map, rdev->desc->enable_reg, &reg_val);
 
 	if (reg_val & sreg->eco_mode_mask)
 		return REGULATOR_MODE_IDLE;
@@ -195,8 +196,8 @@ static int hi6421_spmi_regulator_set_mode(struct regulator_dev *rdev,
 		return -EINVAL;
 	}
 
-	return hi6421_spmi_pmic_rmw(pmic, rdev->desc->enable_reg,
-				    sreg->eco_mode_mask, val);
+	return regmap_update_bits(pmic->map, rdev->desc->enable_reg,
+				  sreg->eco_mode_mask, val);
 }
 
 static unsigned int
@@ -304,6 +305,7 @@ static int hi6421_spmi_regulator_probe(struct platform_device *pdev)
 
 		config.dev = pdev->dev.parent;
 		config.driver_data = sreg;
+		config.regmap = pmic->map;
 
 		rdev = devm_regulator_register(dev, &info->desc, &config);
 		if (IS_ERR(rdev)) {
diff --git a/include/linux/mfd/hi6421-spmi-pmic.h b/include/linux/mfd/hi6421-spmi-pmic.h
index 0c2214612c4e..c5a4eac1fdc0 100644
--- a/include/linux/mfd/hi6421-spmi-pmic.h
+++ b/include/linux/mfd/hi6421-spmi-pmic.h
@@ -12,6 +12,7 @@
 #define	__HISI_PMIC_H
 
 #include <linux/irqdomain.h>
+#include <linux/regmap.h>
 
 #define HISI_ECO_MODE_ENABLE		(1)
 #define HISI_ECO_MODE_DISABLE		(0)
@@ -25,13 +26,9 @@ struct hi6421_spmi_pmic {
 	int					irq;
 	int					gpio;
 	unsigned int				*irqs;
+	struct regmap				*map;
 };
 
-int hi6421_spmi_pmic_read(struct hi6421_spmi_pmic *pmic, int reg);
-int hi6421_spmi_pmic_write(struct hi6421_spmi_pmic *pmic, int reg, u32 val);
-int hi6421_spmi_pmic_rmw(struct hi6421_spmi_pmic *pmic, int reg,
-			 u32 mask, u32 bits);
-
 enum hi6421_spmi_pmic_irq_list {
 	OTMP = 0,
 	VBUS_CONNECT,
-- 
cgit v1.2.3


From fcd732406c5d65ba92515af81ca281a4f8348687 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Fri, 29 Jan 2021 20:51:49 +0100
Subject: staging: hikey9xx: hi6421-spmi-pmic: rename some vars

- When referring to regmap, rename map to regmap
- inside hi6421-spmi-pmic, call private data struct as
  ddata.

No functional changes.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/d23592b11ac606e3b9a3ff95a754cb75921e60aa.1611949675.git.mchehab+huawei@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/hikey9xx/hi6421-spmi-pmic.c     | 104 ++++++++++++------------
 drivers/staging/hikey9xx/hi6421v600-regulator.c |  10 +--
 include/linux/mfd/hi6421-spmi-pmic.h            |   2 +-
 3 files changed, 58 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/hikey9xx/hi6421-spmi-pmic.c b/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
index 3d612bd46231..a4ffeb06ed6a 100644
--- a/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
+++ b/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
@@ -43,17 +43,17 @@ static const struct mfd_cell hi6421v600_devs[] = {
 
 static irqreturn_t hi6421_spmi_irq_handler(int irq, void *priv)
 {
-	struct hi6421_spmi_pmic *pmic = (struct hi6421_spmi_pmic *)priv;
+	struct hi6421_spmi_pmic *ddata = (struct hi6421_spmi_pmic *)priv;
 	unsigned long pending;
 	unsigned int data;
 	int i, offset;
 
 	for (i = 0; i < HISI_IRQ_ARRAY; i++) {
-		regmap_read(pmic->map, offset, &data);
+		regmap_read(ddata->regmap, offset, &data);
 		data &= HISI_MASK_FIELD;
 		if (data != 0)
 			pr_debug("data[%d]=0x%d\n\r", i, data);
-		regmap_write(pmic->map, i + SOC_PMIC_IRQ0_ADDR, data);
+		regmap_write(ddata->regmap, i + SOC_PMIC_IRQ0_ADDR, data);
 
 		/* for_each_set_bit() macro requires unsigned long */
 		pending = data;
@@ -61,14 +61,14 @@ static irqreturn_t hi6421_spmi_irq_handler(int irq, void *priv)
 		/* solve powerkey order */
 		if ((i == HISI_IRQ_KEY_NUM) &&
 		    ((pending & HISI_IRQ_KEY_VALUE) == HISI_IRQ_KEY_VALUE)) {
-			generic_handle_irq(pmic->irqs[HISI_IRQ_KEY_DOWN]);
-			generic_handle_irq(pmic->irqs[HISI_IRQ_KEY_UP]);
+			generic_handle_irq(ddata->irqs[HISI_IRQ_KEY_DOWN]);
+			generic_handle_irq(ddata->irqs[HISI_IRQ_KEY_UP]);
 			pending &= (~HISI_IRQ_KEY_VALUE);
 		}
 
 		if (pending) {
 			for_each_set_bit(offset, &pending, HISI_BITS)
-				generic_handle_irq(pmic->irqs[offset + i * HISI_BITS]);
+				generic_handle_irq(ddata->irqs[offset + i * HISI_BITS]);
 		}
 	}
 
@@ -77,7 +77,7 @@ static irqreturn_t hi6421_spmi_irq_handler(int irq, void *priv)
 
 static void hi6421_spmi_irq_mask(struct irq_data *d)
 {
-	struct hi6421_spmi_pmic *pmic = irq_data_get_irq_chip_data(d);
+	struct hi6421_spmi_pmic *ddata = irq_data_get_irq_chip_data(d);
 	unsigned long flags;
 	unsigned int data;
 	u32 offset;
@@ -85,28 +85,28 @@ static void hi6421_spmi_irq_mask(struct irq_data *d)
 	offset = (irqd_to_hwirq(d) >> 3);
 	offset += SOC_PMIC_IRQ_MASK_0_ADDR;
 
-	spin_lock_irqsave(&pmic->lock, flags);
+	spin_lock_irqsave(&ddata->lock, flags);
 
-	regmap_read(pmic->map, offset, &data);
+	regmap_read(ddata->regmap, offset, &data);
 	data |= (1 << (irqd_to_hwirq(d) & 0x07));
-	regmap_write(pmic->map, offset, data);
-	spin_unlock_irqrestore(&pmic->lock, flags);
+	regmap_write(ddata->regmap, offset, data);
+	spin_unlock_irqrestore(&ddata->lock, flags);
 }
 
 static void hi6421_spmi_irq_unmask(struct irq_data *d)
 {
-	struct hi6421_spmi_pmic *pmic = irq_data_get_irq_chip_data(d);
+	struct hi6421_spmi_pmic *ddata = irq_data_get_irq_chip_data(d);
 	u32 data, offset;
 	unsigned long flags;
 
 	offset = (irqd_to_hwirq(d) >> 3);
 	offset += SOC_PMIC_IRQ_MASK_0_ADDR;
 
-	spin_lock_irqsave(&pmic->lock, flags);
-	regmap_read(pmic->map, offset, &data);
+	spin_lock_irqsave(&ddata->lock, flags);
+	regmap_read(ddata->regmap, offset, &data);
 	data &= ~(1 << (irqd_to_hwirq(d) & 0x07));
-	regmap_write(pmic->map, offset, data);
-	spin_unlock_irqrestore(&pmic->lock, flags);
+	regmap_write(ddata->regmap, offset, data);
+	spin_unlock_irqrestore(&ddata->lock, flags);
 }
 
 static struct irq_chip hi6421_spmi_pmu_irqchip = {
@@ -120,11 +120,11 @@ static struct irq_chip hi6421_spmi_pmu_irqchip = {
 static int hi6421_spmi_irq_map(struct irq_domain *d, unsigned int virq,
 			       irq_hw_number_t hw)
 {
-	struct hi6421_spmi_pmic *pmic = d->host_data;
+	struct hi6421_spmi_pmic *ddata = d->host_data;
 
 	irq_set_chip_and_handler_name(virq, &hi6421_spmi_pmu_irqchip,
 				      handle_simple_irq, "hisi");
-	irq_set_chip_data(virq, pmic);
+	irq_set_chip_data(virq, ddata);
 	irq_set_irq_type(virq, IRQ_TYPE_NONE);
 
 	return 0;
@@ -135,21 +135,21 @@ static const struct irq_domain_ops hi6421_spmi_domain_ops = {
 	.xlate	= irq_domain_xlate_twocell,
 };
 
-static void hi6421_spmi_pmic_irq_prc(struct hi6421_spmi_pmic *pmic)
+static void hi6421_spmi_pmic_irq_prc(struct hi6421_spmi_pmic *ddata)
 {
 	int i;
 	unsigned int pending;
 
 	for (i = 0 ; i < HISI_IRQ_ARRAY; i++)
-		regmap_write(pmic->map, SOC_PMIC_IRQ_MASK_0_ADDR + i,
+		regmap_write(ddata->regmap, SOC_PMIC_IRQ_MASK_0_ADDR + i,
 				       HISI_MASK_STATE);
 
 	for (i = 0 ; i < HISI_IRQ_ARRAY; i++) {
-		regmap_read(pmic->map, SOC_PMIC_IRQ0_ADDR + i, &pending);
+		regmap_read(ddata->regmap, SOC_PMIC_IRQ0_ADDR + i, &pending);
 
 		pr_debug("PMU IRQ address value:irq[0x%x] = 0x%x\n",
 			 SOC_PMIC_IRQ0_ADDR + i, pending);
-		regmap_write(pmic->map, SOC_PMIC_IRQ0_ADDR + i,
+		regmap_write(ddata->regmap, SOC_PMIC_IRQ0_ADDR + i,
 			     HISI_MASK_STATE);
 	}
 }
@@ -165,79 +165,79 @@ static int hi6421_spmi_pmic_probe(struct spmi_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct device_node *np = dev->of_node;
-	struct hi6421_spmi_pmic *pmic;
+	struct hi6421_spmi_pmic *ddata;
 	struct regmap *map;
 	unsigned int virq;
 	int ret, i;
 
-	pmic = devm_kzalloc(dev, sizeof(*pmic), GFP_KERNEL);
-	if (!pmic)
+	ddata = devm_kzalloc(dev, sizeof(*ddata), GFP_KERNEL);
+	if (!ddata)
 		return -ENOMEM;
 
 	map = devm_regmap_init_spmi_ext(pdev, &spmi_regmap_config);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	spin_lock_init(&pmic->lock);
+	spin_lock_init(&ddata->lock);
 
-	pmic->dev = dev;
-	pmic->map = map;
+	ddata->dev = dev;
+	ddata->regmap = map;
 
-	pmic->gpio = of_get_gpio(np, 0);
-	if (pmic->gpio < 0)
-		return pmic->gpio;
+	ddata->gpio = of_get_gpio(np, 0);
+	if (ddata->gpio < 0)
+		return ddata->gpio;
 
-	if (!gpio_is_valid(pmic->gpio))
+	if (!gpio_is_valid(ddata->gpio))
 		return -EINVAL;
 
-	ret = devm_gpio_request_one(dev, pmic->gpio, GPIOF_IN, "pmic");
+	ret = devm_gpio_request_one(dev, ddata->gpio, GPIOF_IN, "pmic");
 	if (ret < 0) {
-		dev_err(dev, "failed to request gpio%d\n", pmic->gpio);
+		dev_err(dev, "failed to request gpio%d\n", ddata->gpio);
 		return ret;
 	}
 
-	pmic->irq = gpio_to_irq(pmic->gpio);
+	ddata->irq = gpio_to_irq(ddata->gpio);
 
-	hi6421_spmi_pmic_irq_prc(pmic);
+	hi6421_spmi_pmic_irq_prc(ddata);
 
-	pmic->irqs = devm_kzalloc(dev, HISI_IRQ_NUM * sizeof(int), GFP_KERNEL);
-	if (!pmic->irqs) {
+	ddata->irqs = devm_kzalloc(dev, HISI_IRQ_NUM * sizeof(int), GFP_KERNEL);
+	if (!ddata->irqs) {
 		ret = -ENOMEM;
 		goto irq_malloc;
 	}
 
-	pmic->domain = irq_domain_add_simple(np, HISI_IRQ_NUM, 0,
-					     &hi6421_spmi_domain_ops, pmic);
-	if (!pmic->domain) {
+	ddata->domain = irq_domain_add_simple(np, HISI_IRQ_NUM, 0,
+					     &hi6421_spmi_domain_ops, ddata);
+	if (!ddata->domain) {
 		dev_err(dev, "failed irq domain add simple!\n");
 		ret = -ENODEV;
 		goto irq_malloc;
 	}
 
 	for (i = 0; i < HISI_IRQ_NUM; i++) {
-		virq = irq_create_mapping(pmic->domain, i);
+		virq = irq_create_mapping(ddata->domain, i);
 		if (!virq) {
 			dev_err(dev, "Failed mapping hwirq\n");
 			ret = -ENOSPC;
 			goto irq_malloc;
 		}
-		pmic->irqs[i] = virq;
-		dev_dbg(dev, "%s: pmic->irqs[%d] = %d\n",
-			__func__, i, pmic->irqs[i]);
+		ddata->irqs[i] = virq;
+		dev_dbg(dev, "%s: ddata->irqs[%d] = %d\n",
+			__func__, i, ddata->irqs[i]);
 	}
 
-	ret = request_threaded_irq(pmic->irq, hi6421_spmi_irq_handler, NULL,
+	ret = request_threaded_irq(ddata->irq, hi6421_spmi_irq_handler, NULL,
 				   IRQF_TRIGGER_LOW | IRQF_SHARED | IRQF_NO_SUSPEND,
-				   "pmic", pmic);
+				   "pmic", ddata);
 	if (ret < 0) {
 		dev_err(dev, "could not claim pmic IRQ: error %d\n", ret);
 		goto irq_malloc;
 	}
 
-	dev_set_drvdata(&pdev->dev, pmic);
+	dev_set_drvdata(&pdev->dev, ddata);
 
 	/*
-	 * The logic below will rely that the pmic is already stored at
+	 * The logic below will rely that the ddata is already stored at
 	 * drvdata.
 	 */
 	dev_dbg(&pdev->dev, "SPMI-PMIC: adding children for %pOF\n",
@@ -251,16 +251,16 @@ static int hi6421_spmi_pmic_probe(struct spmi_device *pdev)
 	dev_err(dev, "Failed to add child devices: %d\n", ret);
 
 irq_malloc:
-	free_irq(pmic->irq, pmic);
+	free_irq(ddata->irq, ddata);
 
 	return ret;
 }
 
 static void hi6421_spmi_pmic_remove(struct spmi_device *pdev)
 {
-	struct hi6421_spmi_pmic *pmic = dev_get_drvdata(&pdev->dev);
+	struct hi6421_spmi_pmic *ddata = dev_get_drvdata(&pdev->dev);
 
-	free_irq(pmic->irq, pmic);
+	free_irq(ddata->irq, ddata);
 }
 
 static const struct of_device_id pmic_spmi_id_table[] = {
diff --git a/drivers/staging/hikey9xx/hi6421v600-regulator.c b/drivers/staging/hikey9xx/hi6421v600-regulator.c
index 7090107b9ec2..c801bb840962 100644
--- a/drivers/staging/hikey9xx/hi6421v600-regulator.c
+++ b/drivers/staging/hikey9xx/hi6421v600-regulator.c
@@ -104,7 +104,7 @@ static int hi6421_spmi_regulator_enable(struct regulator_dev *rdev)
 	/* cannot enable more than one regulator at one time */
 	mutex_lock(&sreg->enable_mutex);
 
-	ret = regmap_update_bits(pmic->map, rdev->desc->enable_reg,
+	ret = regmap_update_bits(pmic->regmap, rdev->desc->enable_reg,
 				 rdev->desc->enable_mask,
 			         rdev->desc->enable_mask);
 
@@ -121,7 +121,7 @@ static int hi6421_spmi_regulator_disable(struct regulator_dev *rdev)
 	struct hi6421_spmi_reg_info *sreg = rdev_get_drvdata(rdev);
 	struct hi6421_spmi_pmic *pmic = sreg->pmic;
 
-	return regmap_update_bits(pmic->map, rdev->desc->enable_reg,
+	return regmap_update_bits(pmic->regmap, rdev->desc->enable_reg,
 				  rdev->desc->enable_mask, 0);
 }
 
@@ -131,7 +131,7 @@ static unsigned int hi6421_spmi_regulator_get_mode(struct regulator_dev *rdev)
 	struct hi6421_spmi_pmic *pmic = sreg->pmic;
 	u32 reg_val;
 
-	regmap_read(pmic->map, rdev->desc->enable_reg, &reg_val);
+	regmap_read(pmic->regmap, rdev->desc->enable_reg, &reg_val);
 
 	if (reg_val & sreg->eco_mode_mask)
 		return REGULATOR_MODE_IDLE;
@@ -157,7 +157,7 @@ static int hi6421_spmi_regulator_set_mode(struct regulator_dev *rdev,
 		return -EINVAL;
 	}
 
-	return regmap_update_bits(pmic->map, rdev->desc->enable_reg,
+	return regmap_update_bits(pmic->regmap, rdev->desc->enable_reg,
 				  sreg->eco_mode_mask, val);
 }
 
@@ -266,7 +266,7 @@ static int hi6421_spmi_regulator_probe(struct platform_device *pdev)
 
 		config.dev = pdev->dev.parent;
 		config.driver_data = sreg;
-		config.regmap = pmic->map;
+		config.regmap = pmic->regmap;
 
 		rdev = devm_regulator_register(dev, &info->desc, &config);
 		if (IS_ERR(rdev)) {
diff --git a/include/linux/mfd/hi6421-spmi-pmic.h b/include/linux/mfd/hi6421-spmi-pmic.h
index c5a4eac1fdc0..aa8d5382f559 100644
--- a/include/linux/mfd/hi6421-spmi-pmic.h
+++ b/include/linux/mfd/hi6421-spmi-pmic.h
@@ -26,7 +26,7 @@ struct hi6421_spmi_pmic {
 	int					irq;
 	int					gpio;
 	unsigned int				*irqs;
-	struct regmap				*map;
+	struct regmap				*regmap;
 };
 
 enum hi6421_spmi_pmic_irq_list {
-- 
cgit v1.2.3


From 9d8dbe989029e6d767ed56773ac65409da625381 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Fri, 29 Jan 2021 20:51:51 +0100
Subject: staging: hikey9xx: hi6421-spmi-pmic: cleanup header file

Remove the IRQ list from the header, as this is used only
inside the driver itself. Also, get rid of two unused
defines.

The net result is that only struct hi6421_spmi_pmic remains
on it, as this is used by the regulator driver.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/138c3a11e4de0ebabdf27932957852136c2f7510.1611949675.git.mchehab+huawei@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/hikey9xx/hi6421-spmi-pmic.c | 17 +++++++++++++++++
 include/linux/mfd/hi6421-spmi-pmic.h        | 20 --------------------
 2 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/hikey9xx/hi6421-spmi-pmic.c b/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
index c8e55b7b08e2..909f7b106af4 100644
--- a/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
+++ b/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
@@ -17,6 +17,23 @@
 #include <linux/slab.h>
 #include <linux/spmi.h>
 
+enum hi6421_spmi_pmic_irq_list {
+	OTMP = 0,
+	VBUS_CONNECT,
+	VBUS_DISCONNECT,
+	ALARMON_R,
+	HOLD_6S,
+	HOLD_1S,
+	POWERKEY_UP,
+	POWERKEY_DOWN,
+	OCP_SCP_R,
+	COUL_R,
+	SIM0_HPD_R,
+	SIM0_HPD_F,
+	SIM1_HPD_R,
+	SIM1_HPD_F,
+	PMIC_IRQ_LIST_MAX,
+};
 /* 8-bit register offset in PMIC */
 #define HISI_MASK_STATE			0xff
 
diff --git a/include/linux/mfd/hi6421-spmi-pmic.h b/include/linux/mfd/hi6421-spmi-pmic.h
index aa8d5382f559..4d61cb266a18 100644
--- a/include/linux/mfd/hi6421-spmi-pmic.h
+++ b/include/linux/mfd/hi6421-spmi-pmic.h
@@ -14,9 +14,6 @@
 #include <linux/irqdomain.h>
 #include <linux/regmap.h>
 
-#define HISI_ECO_MODE_ENABLE		(1)
-#define HISI_ECO_MODE_DISABLE		(0)
-
 struct hi6421_spmi_pmic {
 	struct resource				*res;
 	struct device				*dev;
@@ -29,21 +26,4 @@ struct hi6421_spmi_pmic {
 	struct regmap				*regmap;
 };
 
-enum hi6421_spmi_pmic_irq_list {
-	OTMP = 0,
-	VBUS_CONNECT,
-	VBUS_DISCONNECT,
-	ALARMON_R,
-	HOLD_6S,
-	HOLD_1S,
-	POWERKEY_UP,
-	POWERKEY_DOWN,
-	OCP_SCP_R,
-	COUL_R,
-	SIM0_HPD_R,
-	SIM0_HPD_F,
-	SIM1_HPD_R,
-	SIM1_HPD_F,
-	PMIC_IRQ_LIST_MAX,
-};
 #endif		/* __HISI_PMIC_H */
-- 
cgit v1.2.3


From fb7ba1870d5fdf45513fc04af8e46492eb65a5e3 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Fri, 29 Jan 2021 20:51:55 +0100
Subject: staging: hikey9xx: hi6421-spmi-pmic: update copyright notes

At PMIC subsystem, C89 comments are preferred over C99.
While here, also update the copyrights of the header file.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/5a86478c8ccb93d3105485b5f16e20e9c12e2196.1611949675.git.mchehab+huawei@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/hikey9xx/hi6421-spmi-pmic.c | 14 +++++++-------
 include/linux/mfd/hi6421-spmi-pmic.h        |  1 +
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/hikey9xx/hi6421-spmi-pmic.c b/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
index 9c10f7c4e7c9..2301f4fcd48d 100644
--- a/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
+++ b/drivers/staging/hikey9xx/hi6421-spmi-pmic.c
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
-//
-// Device driver for regulators in HISI PMIC IC
-//
-// Copyright (c) 2013 Linaro Ltd.
-// Copyright (c) 2011 Hisilicon.
-//
-// Copyright (c) 2020-2021 Huawei Technologies Co., Ltd
+/*
+ * Device driver for regulators in HISI PMIC IC
+ *
+ * Copyright (c) 2013 Linaro Ltd.
+ * Copyright (c) 2011 Hisilicon.
+ * Copyright (c) 2020-2021 Huawei Technologies Co., Ltd
+ */
 
 #include <linux/bitops.h>
 #include <linux/interrupt.h>
diff --git a/include/linux/mfd/hi6421-spmi-pmic.h b/include/linux/mfd/hi6421-spmi-pmic.h
index 4d61cb266a18..2660226138b8 100644
--- a/include/linux/mfd/hi6421-spmi-pmic.h
+++ b/include/linux/mfd/hi6421-spmi-pmic.h
@@ -4,6 +4,7 @@
  *
  * Copyright (c) 2013 Linaro Ltd.
  * Copyright (C) 2011 Hisilicon.
+ * Copyright (c) 2020-2021 Huawei Technologies Co., Ltd
  *
  * Guodong Xu <guodong.xu@linaro.org>
  */
-- 
cgit v1.2.3


From a0f2a1cb65c9d8a66853e1b67184022663950f6d Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Fri, 29 Jan 2021 21:31:17 +0200
Subject: dmaengine: ti: k3-psil: optimize struct psil_endpoint_config for size

Optimize struct psil_endpoint_config for size by
- reordering fields
- grouping bitfields
- change mapped_channel_id type to s16 (32K channel is enough)
- default_flow_id type to s16 as it's assigned to -1

before:
text            data     bss    dec	        hex	filename
12654100	5211472	 666904	18532476	11ac87c	vmlinux

after:
12654100	5208528	 666904	18529532	11abcfc	vmlinux

diff: 2944 bytes

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Link: https://lore.kernel.org/r/20210129193117.28833-1-grygorii.strashko@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/k3-psil.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma/k3-psil.h b/include/linux/dma/k3-psil.h
index 36e22c5a0f29..5f106d852f1c 100644
--- a/include/linux/dma/k3-psil.h
+++ b/include/linux/dma/k3-psil.h
@@ -42,14 +42,14 @@ enum psil_endpoint_type {
 /**
  * struct psil_endpoint_config - PSI-L Endpoint configuration
  * @ep_type:		PSI-L endpoint type
+ * @channel_tpl:	Desired throughput level for the channel
  * @pkt_mode:		If set, the channel must be in Packet mode, otherwise in
  *			TR mode
  * @notdpkt:		TDCM must be suppressed on the TX channel
  * @needs_epib:		Endpoint needs EPIB
- * @psd_size:		If set, PSdata is used by the endpoint
- * @channel_tpl:	Desired throughput level for the channel
  * @pdma_acc32:		ACC32 must be enabled on the PDMA side
  * @pdma_burst:		BURST must be enabled on the PDMA side
+ * @psd_size:		If set, PSdata is used by the endpoint
  * @mapped_channel_id:	PKTDMA thread to channel mapping for mapped channels.
  *			The thread must be serviced by the specified channel if
  *			mapped_channel_id is >= 0 in case of PKTDMA
@@ -62,23 +62,22 @@ enum psil_endpoint_type {
  */
 struct psil_endpoint_config {
 	enum psil_endpoint_type ep_type;
+	enum udma_tp_level channel_tpl;
 
 	unsigned pkt_mode:1;
 	unsigned notdpkt:1;
 	unsigned needs_epib:1;
-	u32 psd_size;
-	enum udma_tp_level channel_tpl;
-
 	/* PDMA properties, valid for PSIL_EP_PDMA_* */
 	unsigned pdma_acc32:1;
 	unsigned pdma_burst:1;
 
+	u32 psd_size;
 	/* PKDMA mapped channel */
-	int mapped_channel_id;
+	s16 mapped_channel_id;
 	/* PKTDMA tflow and rflow ranges for mapped channel */
 	u16 flow_start;
 	u16 flow_num;
-	u16 default_flow_id;
+	s16 default_flow_id;
 };
 
 int psil_set_new_ep_config(struct device *dev, const char *name,
-- 
cgit v1.2.3


From 67a73230fbaed9ff4902389d4f040abe57c5783c Mon Sep 17 00:00:00 2001
From: "H. Nikolaus Schaller" <hns@goldelico.com>
Date: Wed, 23 Dec 2020 10:01:51 +0100
Subject: mmc: omap: remove unused struct component card_detect_irq

I have not found any user for this struct component.

Signed-off-by: H. Nikolaus Schaller <hns@goldelico.com>
Link: https://lore.kernel.org/r/b6f2168b863e4273c6bca5a22fbd4a3a8ddf68d6.1608714110.git.hns@goldelico.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/platform_data/mmc-omap.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/mmc-omap.h b/include/linux/platform_data/mmc-omap.h
index f0b8947e6b07..91051e9907f3 100644
--- a/include/linux/platform_data/mmc-omap.h
+++ b/include/linux/platform_data/mmc-omap.h
@@ -108,8 +108,7 @@ struct omap_mmc_platform_data {
 		const char *name;
 		u32 ocr_mask;
 
-		/* Card detection IRQs */
-		int card_detect_irq;
+		/* Card detection */
 		int (*card_detect)(struct device *dev, int slot);
 
 		unsigned int ban_openended:1;
-- 
cgit v1.2.3


From 3561afa02605b398d1b98e1ce913ea6411cdc5dd Mon Sep 17 00:00:00 2001
From: Andrew Jeffery <andrew@aj.id.au>
Date: Thu, 14 Jan 2021 13:44:28 +1030
Subject: mmc: core: Add helper for parsing clock phase properties

Drivers for MMC hosts that accept phase corrections can take advantage
of the helper by embedding an instance of struct mmc_clk_phase_map in
their private data and invoking mmc_of_parse_clk_phase() to extract
phase parameters. It is the responsibility of the host driver to
translate and apply the extracted values to hardware as required.

Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
Link: https://lore.kernel.org/r/20210114031433.2388532-2-andrew@aj.id.au
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/host.c  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/host.h | 13 +++++++++++++
 2 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 96b2ca1f1b06..74e853bb6948 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -163,6 +163,50 @@ static void mmc_retune_timer(struct timer_list *t)
 	mmc_retune_needed(host);
 }
 
+static void mmc_of_parse_timing_phase(struct device *dev, const char *prop,
+				      struct mmc_clk_phase *phase)
+{
+	int degrees[2] = {0};
+	int rc;
+
+	rc = device_property_read_u32_array(dev, prop, degrees, 2);
+	phase->valid = !rc;
+	if (phase->valid) {
+		phase->in_deg = degrees[0];
+		phase->out_deg = degrees[1];
+	}
+}
+
+void
+mmc_of_parse_clk_phase(struct mmc_host *host, struct mmc_clk_phase_map *map)
+{
+	struct device *dev = host->parent;
+
+	mmc_of_parse_timing_phase(dev, "clk-phase-legacy",
+				  &map->phase[MMC_TIMING_LEGACY]);
+	mmc_of_parse_timing_phase(dev, "clk-phase-mmc-hs",
+				  &map->phase[MMC_TIMING_MMC_HS]);
+	mmc_of_parse_timing_phase(dev, "clk-phase-sd-hs",
+				  &map->phase[MMC_TIMING_SD_HS]);
+	mmc_of_parse_timing_phase(dev, "clk-phase-uhs-sdr12",
+				  &map->phase[MMC_TIMING_UHS_SDR12]);
+	mmc_of_parse_timing_phase(dev, "clk-phase-uhs-sdr25",
+				  &map->phase[MMC_TIMING_UHS_SDR25]);
+	mmc_of_parse_timing_phase(dev, "clk-phase-uhs-sdr50",
+				  &map->phase[MMC_TIMING_UHS_SDR50]);
+	mmc_of_parse_timing_phase(dev, "clk-phase-uhs-sdr104",
+				  &map->phase[MMC_TIMING_UHS_SDR104]);
+	mmc_of_parse_timing_phase(dev, "clk-phase-uhs-ddr50",
+				  &map->phase[MMC_TIMING_UHS_DDR50]);
+	mmc_of_parse_timing_phase(dev, "clk-phase-mmc-ddr52",
+				  &map->phase[MMC_TIMING_MMC_DDR52]);
+	mmc_of_parse_timing_phase(dev, "clk-phase-mmc-hs200",
+				  &map->phase[MMC_TIMING_MMC_HS200]);
+	mmc_of_parse_timing_phase(dev, "clk-phase-mmc-hs400",
+				  &map->phase[MMC_TIMING_MMC_HS400]);
+}
+EXPORT_SYMBOL(mmc_of_parse_clk_phase);
+
 /**
  *	mmc_of_parse() - parse host's device-tree node
  *	@host: host whose node should be parsed.
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 01bba36545c5..8cf686d98a68 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -79,6 +79,17 @@ struct mmc_ios {
 	bool enhanced_strobe;			/* hs400es selection */
 };
 
+struct mmc_clk_phase {
+	bool valid;
+	u16 in_deg;
+	u16 out_deg;
+};
+
+#define MMC_NUM_CLK_PHASES (MMC_TIMING_MMC_HS400 + 1)
+struct mmc_clk_phase_map {
+	struct mmc_clk_phase phase[MMC_NUM_CLK_PHASES];
+};
+
 struct mmc_host;
 
 struct mmc_host_ops {
@@ -490,6 +501,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *);
 int mmc_add_host(struct mmc_host *);
 void mmc_remove_host(struct mmc_host *);
 void mmc_free_host(struct mmc_host *);
+void mmc_of_parse_clk_phase(struct mmc_host *host,
+			    struct mmc_clk_phase_map *map);
 int mmc_of_parse(struct mmc_host *host);
 int mmc_of_parse_voltage(struct device_node *np, u32 *mask);
 
-- 
cgit v1.2.3


From 5851d3b042b694839d2241fbb3200ce958135cdf Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 21 Jan 2021 00:21:54 -0800
Subject: block/keyslot-manager: introduce devm_blk_ksm_init()

Add a resource-managed variant of blk_ksm_init() so that drivers don't
have to worry about calling blk_ksm_destroy().

Note that the implementation uses a custom devres action to call
blk_ksm_destroy() rather than switching the two allocations to be
directly devres-managed, e.g. with devm_kmalloc().  This is because we
need to keep zeroing the memory containing the keyslots when it is
freed, and also because we want to continue using kvmalloc() (and there
is no devm_kvmalloc()).

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Satya Tangirala <satyat@google.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20210121082155.111333-2-ebiggers@kernel.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 Documentation/block/inline-encryption.rst | 12 ++++++------
 block/keyslot-manager.c                   | 29 +++++++++++++++++++++++++++++
 include/linux/keyslot-manager.h           |  3 +++
 3 files changed, 38 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst
index e75151e467d3..7f9b40d6b416 100644
--- a/Documentation/block/inline-encryption.rst
+++ b/Documentation/block/inline-encryption.rst
@@ -182,8 +182,9 @@ API presented to device drivers
 
 A :c:type:``struct blk_keyslot_manager`` should be set up by device drivers in
 the ``request_queue`` of the device. The device driver needs to call
-``blk_ksm_init`` on the ``blk_keyslot_manager``, which specifying the number of
-keyslots supported by the hardware.
+``blk_ksm_init`` (or its resource-managed variant ``devm_blk_ksm_init``) on the
+``blk_keyslot_manager``, while specifying the number of keyslots supported by
+the hardware.
 
 The device driver also needs to tell the KSM how to actually manipulate the
 IE hardware in the device to do things like programming the crypto key into
@@ -202,10 +203,9 @@ needs each and every of its keyslots to be reprogrammed with the key it
 "should have" at the point in time when the function is called. This is useful
 e.g. if a device loses all its keys on runtime power down/up.
 
-``blk_ksm_destroy`` should be called to free up all resources used by a keyslot
-manager upon ``blk_ksm_init``, once the ``blk_keyslot_manager`` is no longer
-needed.
-
+If the driver used ``blk_ksm_init`` instead of ``devm_blk_ksm_init``, then
+``blk_ksm_destroy`` should be called to free up all resources used by a
+``blk_keyslot_manager`` once it is no longer needed.
 
 Layered Devices
 ===============
diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c
index 86f8195d8039..324bf4244f5f 100644
--- a/block/keyslot-manager.c
+++ b/block/keyslot-manager.c
@@ -29,6 +29,7 @@
 #define pr_fmt(fmt) "blk-crypto: " fmt
 
 #include <linux/keyslot-manager.h>
+#include <linux/device.h>
 #include <linux/atomic.h>
 #include <linux/mutex.h>
 #include <linux/pm_runtime.h>
@@ -127,6 +128,34 @@ err_destroy_ksm:
 }
 EXPORT_SYMBOL_GPL(blk_ksm_init);
 
+static void blk_ksm_destroy_callback(void *ksm)
+{
+	blk_ksm_destroy(ksm);
+}
+
+/**
+ * devm_blk_ksm_init() - Resource-managed blk_ksm_init()
+ * @dev: The device which owns the blk_keyslot_manager.
+ * @ksm: The blk_keyslot_manager to initialize.
+ * @num_slots: The number of key slots to manage.
+ *
+ * Like blk_ksm_init(), but causes blk_ksm_destroy() to be called automatically
+ * on driver detach.
+ *
+ * Return: 0 on success, or else a negative error code.
+ */
+int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
+		      unsigned int num_slots)
+{
+	int err = blk_ksm_init(ksm, num_slots);
+
+	if (err)
+		return err;
+
+	return devm_add_action_or_reset(dev, blk_ksm_destroy_callback, ksm);
+}
+EXPORT_SYMBOL_GPL(devm_blk_ksm_init);
+
 static inline struct hlist_head *
 blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm,
 			    const struct blk_crypto_key *key)
diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h
index 18f3f5346843..443ad817c6c5 100644
--- a/include/linux/keyslot-manager.h
+++ b/include/linux/keyslot-manager.h
@@ -85,6 +85,9 @@ struct blk_keyslot_manager {
 
 int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots);
 
+int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm,
+		      unsigned int num_slots);
+
 blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
 				      const struct blk_crypto_key *key,
 				      struct blk_ksm_keyslot **slot_ptr);
-- 
cgit v1.2.3


From 93f1c150cb0d043e1e8985db7824b4e2e1ac653f Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 25 Jan 2021 16:14:48 -0800
Subject: mmc: core: Add basic support for inline encryption

In preparation for adding CQHCI crypto engine (inline encryption)
support, add the code required to make mmc_core and mmc_block aware of
inline encryption.  Specifically:

- Add a capability flag MMC_CAP2_CRYPTO to struct mmc_host.  Drivers
  will set this if the host and driver support inline encryption.

- Embed a blk_keyslot_manager in struct mmc_host.  Drivers will
  initialize this (as a device-managed resource) if the host and driver
  support inline encryption.  mmc_block registers this keyslot manager
  with the request_queue of any MMC card attached to the host.

- Make mmc_block copy the crypto keyslot and crypto data unit number
  from struct request to struct mmc_request, so that drivers will have
  access to them.

- If the MMC host is reset, reprogram all the keyslots to ensure that
  the software state stays in sync with the hardware state.

Co-developed-by: Satya Tangirala <satyat@google.com>
Signed-off-by: Satya Tangirala <satyat@google.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Reviewed-by: Satya Tangirala <satyat@google.com>
Reviewed-and-tested-by: Peng Zhou <peng.zhou@mediatek.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20210126001456.382989-2-ebiggers@kernel.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/Kconfig  |  8 ++++++++
 drivers/mmc/core/Makefile |  1 +
 drivers/mmc/core/block.c  |  3 +++
 drivers/mmc/core/core.c   |  3 +++
 drivers/mmc/core/crypto.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/mmc/core/crypto.h | 40 +++++++++++++++++++++++++++++++++++++++
 drivers/mmc/core/host.c   |  1 +
 drivers/mmc/core/queue.c  |  3 +++
 include/linux/mmc/core.h  |  6 ++++++
 include/linux/mmc/host.h  | 11 +++++++++++
 10 files changed, 124 insertions(+)
 create mode 100644 drivers/mmc/core/crypto.c
 create mode 100644 drivers/mmc/core/crypto.h

(limited to 'include/linux')

diff --git a/drivers/mmc/core/Kconfig b/drivers/mmc/core/Kconfig
index c12fe13e4b14..ae8b69aee619 100644
--- a/drivers/mmc/core/Kconfig
+++ b/drivers/mmc/core/Kconfig
@@ -81,3 +81,11 @@ config MMC_TEST
 	  This driver is only of interest to those developing or
 	  testing a host driver. Most people should say N here.
 
+config MMC_CRYPTO
+	bool "MMC Crypto Engine Support"
+	depends on BLK_INLINE_ENCRYPTION
+	help
+	  Enable Crypto Engine Support in MMC.
+	  Enabling this makes it possible for the kernel to use the crypto
+	  capabilities of the MMC device (if present) to perform crypto
+	  operations on data being transferred to/from the device.
diff --git a/drivers/mmc/core/Makefile b/drivers/mmc/core/Makefile
index 95ffe008ebdf..6a907736cd7a 100644
--- a/drivers/mmc/core/Makefile
+++ b/drivers/mmc/core/Makefile
@@ -18,3 +18,4 @@ obj-$(CONFIG_MMC_BLOCK)		+= mmc_block.o
 mmc_block-objs			:= block.o queue.o
 obj-$(CONFIG_MMC_TEST)		+= mmc_test.o
 obj-$(CONFIG_SDIO_UART)		+= sdio_uart.o
+mmc_core-$(CONFIG_MMC_CRYPTO)	+= crypto.o
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 42e27a298218..b877f62df366 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -51,6 +51,7 @@
 #include "block.h"
 #include "core.h"
 #include "card.h"
+#include "crypto.h"
 #include "host.h"
 #include "bus.h"
 #include "mmc_ops.h"
@@ -1247,6 +1248,8 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq,
 
 	memset(brq, 0, sizeof(struct mmc_blk_request));
 
+	mmc_crypto_prepare_req(mqrq);
+
 	brq->mrq.data = &brq->data;
 	brq->mrq.tag = req->tag;
 
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 49d9117550af..1136b859ddd8 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -37,6 +37,7 @@
 
 #include "core.h"
 #include "card.h"
+#include "crypto.h"
 #include "bus.h"
 #include "host.h"
 #include "sdio_bus.h"
@@ -992,6 +993,8 @@ void mmc_set_initial_state(struct mmc_host *host)
 		host->ops->hs400_enhanced_strobe(host, &host->ios);
 
 	mmc_set_ios(host);
+
+	mmc_crypto_set_initial_state(host);
 }
 
 /**
diff --git a/drivers/mmc/core/crypto.c b/drivers/mmc/core/crypto.c
new file mode 100644
index 000000000000..419a368f8402
--- /dev/null
+++ b/drivers/mmc/core/crypto.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MMC crypto engine (inline encryption) support
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <linux/blk-crypto.h>
+#include <linux/mmc/host.h>
+
+#include "core.h"
+#include "crypto.h"
+#include "queue.h"
+
+void mmc_crypto_set_initial_state(struct mmc_host *host)
+{
+	/* Reset might clear all keys, so reprogram all the keys. */
+	if (host->caps2 & MMC_CAP2_CRYPTO)
+		blk_ksm_reprogram_all_keys(&host->ksm);
+}
+
+void mmc_crypto_setup_queue(struct request_queue *q, struct mmc_host *host)
+{
+	if (host->caps2 & MMC_CAP2_CRYPTO)
+		blk_ksm_register(&host->ksm, q);
+}
+EXPORT_SYMBOL_GPL(mmc_crypto_setup_queue);
+
+void mmc_crypto_prepare_req(struct mmc_queue_req *mqrq)
+{
+	struct request *req = mmc_queue_req_to_req(mqrq);
+	struct mmc_request *mrq = &mqrq->brq.mrq;
+
+	if (!req->crypt_keyslot)
+		return;
+
+	mrq->crypto_enabled = true;
+	mrq->crypto_key_slot = blk_ksm_get_slot_idx(req->crypt_keyslot);
+
+	/*
+	 * For now we assume that all MMC drivers set max_dun_bytes_supported=4,
+	 * which is the limit for CQHCI crypto.  So all DUNs should be 32-bit.
+	 */
+	WARN_ON_ONCE(req->crypt_ctx->bc_dun[0] > U32_MAX);
+
+	mrq->data_unit_num = req->crypt_ctx->bc_dun[0];
+}
+EXPORT_SYMBOL_GPL(mmc_crypto_prepare_req);
diff --git a/drivers/mmc/core/crypto.h b/drivers/mmc/core/crypto.h
new file mode 100644
index 000000000000..fbe9a520bf90
--- /dev/null
+++ b/drivers/mmc/core/crypto.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * MMC crypto engine (inline encryption) support
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#ifndef _MMC_CORE_CRYPTO_H
+#define _MMC_CORE_CRYPTO_H
+
+struct mmc_host;
+struct mmc_queue_req;
+struct request_queue;
+
+#ifdef CONFIG_MMC_CRYPTO
+
+void mmc_crypto_set_initial_state(struct mmc_host *host);
+
+void mmc_crypto_setup_queue(struct request_queue *q, struct mmc_host *host);
+
+void mmc_crypto_prepare_req(struct mmc_queue_req *mqrq);
+
+#else /* CONFIG_MMC_CRYPTO */
+
+static inline void mmc_crypto_set_initial_state(struct mmc_host *host)
+{
+}
+
+static inline void mmc_crypto_setup_queue(struct request_queue *q,
+					  struct mmc_host *host)
+{
+}
+
+static inline void mmc_crypto_prepare_req(struct mmc_queue_req *mqrq)
+{
+}
+
+#endif /* !CONFIG_MMC_CRYPTO */
+
+#endif /* _MMC_CORE_CRYPTO_H */
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 74e853bb6948..9b89a91b6b47 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -25,6 +25,7 @@
 #include <linux/mmc/slot-gpio.h>
 
 #include "core.h"
+#include "crypto.h"
 #include "host.h"
 #include "slot-gpio.h"
 #include "pwrseq.h"
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 002426e3cf76..33e7e65b6dde 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -19,6 +19,7 @@
 #include "block.h"
 #include "core.h"
 #include "card.h"
+#include "crypto.h"
 #include "host.h"
 
 #define MMC_DMA_MAP_MERGE_SEGMENTS	512
@@ -407,6 +408,8 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
 	mutex_init(&mq->complete_lock);
 
 	init_waitqueue_head(&mq->wait);
+
+	mmc_crypto_setup_queue(mq->queue, host);
 }
 
 static inline bool mmc_merge_capable(struct mmc_host *host)
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 29aa50711626..ab19245e9945 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -162,6 +162,12 @@ struct mmc_request {
 	bool			cap_cmd_during_tfr;
 
 	int			tag;
+
+#ifdef CONFIG_MMC_CRYPTO
+	bool			crypto_enabled;
+	int			crypto_key_slot;
+	u32			data_unit_num;
+#endif
 };
 
 struct mmc_card;
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 8cf686d98a68..927ba7566617 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -15,6 +15,7 @@
 #include <linux/mmc/card.h>
 #include <linux/mmc/pm.h>
 #include <linux/dma-direction.h>
+#include <linux/keyslot-manager.h>
 
 struct mmc_ios {
 	unsigned int	clock;			/* clock rate */
@@ -395,6 +396,11 @@ struct mmc_host {
 #define MMC_CAP2_CQE_DCMD	(1 << 24)	/* CQE can issue a direct command */
 #define MMC_CAP2_AVOID_3_3V	(1 << 25)	/* Host must negotiate down from 3.3V */
 #define MMC_CAP2_MERGE_CAPABLE	(1 << 26)	/* Host can merge a segment over the segment size */
+#ifdef CONFIG_MMC_CRYPTO
+#define MMC_CAP2_CRYPTO		(1 << 27)	/* Host supports inline encryption */
+#else
+#define MMC_CAP2_CRYPTO		0
+#endif
 
 	int			fixed_drv_type;	/* fixed driver type for non-removable media */
 
@@ -489,6 +495,11 @@ struct mmc_host {
 	bool			cqe_enabled;
 	bool			cqe_on;
 
+	/* Inline encryption support */
+#ifdef CONFIG_MMC_CRYPTO
+	struct blk_keyslot_manager ksm;
+#endif
+
 	/* Host Software Queue support */
 	bool			hsq_enabled;
 
-- 
cgit v1.2.3


From 40596d2f2b6075f6c33180b2f55c814ff4885475 Mon Sep 17 00:00:00 2001
From: Yong Wu <yong.wu@mediatek.com>
Date: Mon, 11 Jan 2021 19:18:51 +0800
Subject: iommu/io-pgtable-arm-v7s: Extend PA34 for MediaTek

MediaTek extend the bit5 in lvl1 and lvl2 descriptor as PA34.

Signed-off-by: Yong Wu <yong.wu@mediatek.com>
Acked-by: Will Deacon <will@kernel.org>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Link: https://lore.kernel.org/r/20210111111914.22211-11-yong.wu@mediatek.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/io-pgtable-arm-v7s.c | 9 +++++++--
 drivers/iommu/mtk_iommu.c          | 2 +-
 include/linux/io-pgtable.h         | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index 03f35065acff..b1b2ff069040 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -112,9 +112,10 @@
 #define ARM_V7S_TEX_MASK		0x7
 #define ARM_V7S_ATTR_TEX(val)		(((val) & ARM_V7S_TEX_MASK) << ARM_V7S_TEX_SHIFT)
 
-/* MediaTek extend the two bits for PA 32bit/33bit */
+/* MediaTek extend the bits below for PA 32bit/33bit/34bit */
 #define ARM_V7S_ATTR_MTK_PA_BIT32	BIT(9)
 #define ARM_V7S_ATTR_MTK_PA_BIT33	BIT(4)
+#define ARM_V7S_ATTR_MTK_PA_BIT34	BIT(5)
 
 /* *well, except for TEX on level 2 large pages, of course :( */
 #define ARM_V7S_CONT_PAGE_TEX_SHIFT	6
@@ -194,6 +195,8 @@ static arm_v7s_iopte paddr_to_iopte(phys_addr_t paddr, int lvl,
 		pte |= ARM_V7S_ATTR_MTK_PA_BIT32;
 	if (paddr & BIT_ULL(33))
 		pte |= ARM_V7S_ATTR_MTK_PA_BIT33;
+	if (paddr & BIT_ULL(34))
+		pte |= ARM_V7S_ATTR_MTK_PA_BIT34;
 	return pte;
 }
 
@@ -218,6 +221,8 @@ static phys_addr_t iopte_to_paddr(arm_v7s_iopte pte, int lvl,
 		paddr |= BIT_ULL(32);
 	if (pte & ARM_V7S_ATTR_MTK_PA_BIT33)
 		paddr |= BIT_ULL(33);
+	if (pte & ARM_V7S_ATTR_MTK_PA_BIT34)
+		paddr |= BIT_ULL(34);
 	return paddr;
 }
 
@@ -748,7 +753,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 	if (cfg->ias > ARM_V7S_ADDR_BITS)
 		return NULL;
 
-	if (cfg->oas > (arm_v7s_is_mtk_enabled(cfg) ? 34 : ARM_V7S_ADDR_BITS))
+	if (cfg->oas > (arm_v7s_is_mtk_enabled(cfg) ? 35 : ARM_V7S_ADDR_BITS))
 		return NULL;
 
 	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index f594971dbeb2..485f3b6d1a21 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -300,7 +300,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom)
 			IO_PGTABLE_QUIRK_ARM_MTK_EXT,
 		.pgsize_bitmap = mtk_iommu_ops.pgsize_bitmap,
 		.ias = 32,
-		.oas = 34,
+		.oas = 35,
 		.iommu_dev = data->dev,
 	};
 
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 06753323a15e..d69234a4a65e 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -69,8 +69,8 @@ struct io_pgtable_cfg {
 	 *	format, and/or requires some format-specific default value.
 	 *
 	 * IO_PGTABLE_QUIRK_ARM_MTK_EXT: (ARM v7s format) MediaTek IOMMUs extend
-	 *	to support up to 34 bits PA where the bit32 and bit33 are
-	 *	encoded in the bit9 and bit4 of the PTE respectively.
+	 *	to support up to 35 bits PA where the bit32, bit33 and bit34 are
+	 *	encoded in the bit9, bit4 and bit5 of the PTE respectively.
 	 *
 	 * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs
 	 *	on unmap, for DMA domains using the flush queue mechanism for
-- 
cgit v1.2.3


From f5030e252687be6e999bd52feb1f79d515b2f684 Mon Sep 17 00:00:00 2001
From: Benson Leung <bleung@chromium.org>
Date: Thu, 28 Jan 2021 22:14:02 -0800
Subject: usb: typec: Provide PD Specification Revision for cable and partner

The USB Power Delivery specification Section 6.2.1.1.5 outlines
revision backward compatibility requirements starting from Revision 3.0.

The Port, the Cable Plug, and the Port Partner may support either
revision 2 or revision 3 independently, and communication between ports,
partners, and cables of different revisions are allowed under rules
that the parties agree to communicate between each other using the
lowest common operating revision.

This may mean that Port-to-Partner operating revision comms may be
different than Port-to-CablePlug operating revision comms. For example,
it is possible for a R3.0 port to communicate with a R3.0 partner
using R3.0 messages, while the R3.0 port (in the same session) must
communicate with the R2.0 cable using R2.0 messages only.

Introduce individual revision number properties for cable
and port partner so that the port can track them independently.

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Benson Leung <bleung@chromium.org>
Link: https://lore.kernel.org/r/20210129061406.2680146-3-bleung@chromium.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-typec | 13 +++++++++++++
 drivers/usb/typec/class.c                   | 30 +++++++++++++++++++++++++----
 include/linux/usb/typec.h                   | 10 ++++++++++
 3 files changed, 49 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-typec b/Documentation/ABI/testing/sysfs-class-typec
index b61480535fdc..40122d915ae1 100644
--- a/Documentation/ABI/testing/sysfs-class-typec
+++ b/Documentation/ABI/testing/sysfs-class-typec
@@ -112,6 +112,19 @@ Description:
 		- "3.0": USB Power Delivery Release 3.0
 		- "3.1": USB Power Delivery Release 3.1
 
+What:		/sys/class/typec/<port>-{partner|cable}/usb_power_delivery_revision
+Date:		January 2021
+Contact:	Benson Leung <bleung@chromium.org>
+Description:
+		Revision number of the supported USB Power Delivery
+		specification of the port partner or cable, or 0.0 when USB
+		Power Delivery is not supported.
+
+		Example values:
+		- "2.0": USB Power Delivery Release 2.0
+		- "3.0": USB Power Delivery Release 3.0
+		- "3.1": USB Power Delivery Release 3.1
+
 What:		/sys/class/typec/<port>/usb_typec_revision
 Date:		April 2017
 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index 4f60ee7ba76a..b5241f4756c2 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -27,6 +27,7 @@ struct typec_cable {
 	enum typec_plug_type		type;
 	struct usb_pd_identity		*identity;
 	unsigned int			active:1;
+	u16				pd_revision; /* 0300H = "3.0" */
 };
 
 struct typec_partner {
@@ -36,6 +37,7 @@ struct typec_partner {
 	enum typec_accessory		accessory;
 	struct ida			mode_ids;
 	int				num_altmodes;
+	u16				pd_revision; /* 0300H = "3.0" */
 };
 
 struct typec_port {
@@ -264,6 +266,11 @@ type_show(struct device *dev, struct device_attribute *attr, char *buf)
 }
 static DEVICE_ATTR_RO(type);
 
+static ssize_t usb_power_delivery_revision_show(struct device *dev,
+						struct device_attribute *attr,
+						char *buf);
+static DEVICE_ATTR_RO(usb_power_delivery_revision);
+
 /* ------------------------------------------------------------------------- */
 /* Alternate Modes */
 
@@ -680,6 +687,7 @@ static struct attribute *typec_partner_attrs[] = {
 	&dev_attr_supports_usb_power_delivery.attr,
 	&dev_attr_number_of_alternate_modes.attr,
 	&dev_attr_type.attr,
+	&dev_attr_usb_power_delivery_revision.attr,
 	NULL
 };
 
@@ -815,6 +823,7 @@ struct typec_partner *typec_register_partner(struct typec_port *port,
 	partner->usb_pd = desc->usb_pd;
 	partner->accessory = desc->accessory;
 	partner->num_altmodes = -1;
+	partner->pd_revision = desc->pd_revision;
 
 	if (desc->identity) {
 		/*
@@ -1028,6 +1037,7 @@ static DEVICE_ATTR_RO(plug_type);
 static struct attribute *typec_cable_attrs[] = {
 	&dev_attr_type.attr,
 	&dev_attr_plug_type.attr,
+	&dev_attr_usb_power_delivery_revision.attr,
 	NULL
 };
 ATTRIBUTE_GROUPS(typec_cable);
@@ -1130,6 +1140,7 @@ struct typec_cable *typec_register_cable(struct typec_port *port,
 
 	cable->type = desc->type;
 	cable->active = desc->active;
+	cable->pd_revision = desc->pd_revision;
 
 	if (desc->identity) {
 		/*
@@ -1499,12 +1510,23 @@ static ssize_t usb_power_delivery_revision_show(struct device *dev,
 						struct device_attribute *attr,
 						char *buf)
 {
-	struct typec_port *p = to_typec_port(dev);
-	u16 rev = p->cap->pd_revision;
+	u16 rev = 0;
 
-	return sprintf(buf, "%d.%d\n", (rev >> 8) & 0xff, (rev >> 4) & 0xf);
+	if (is_typec_partner(dev)) {
+		struct typec_partner *partner = to_typec_partner(dev);
+
+		rev = partner->pd_revision;
+	} else if (is_typec_cable(dev)) {
+		struct typec_cable *cable = to_typec_cable(dev);
+
+		rev = cable->pd_revision;
+	} else if (is_typec_port(dev)) {
+		struct typec_port *p = to_typec_port(dev);
+
+		rev = p->cap->pd_revision;
+	}
+	return sysfs_emit(buf, "%d.%d\n", (rev >> 8) & 0xff, (rev >> 4) & 0xf);
 }
-static DEVICE_ATTR_RO(usb_power_delivery_revision);
 
 static ssize_t orientation_show(struct device *dev,
 				   struct device_attribute *attr,
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index 54475323f83b..42c6b7c07a99 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -164,6 +164,7 @@ struct typec_plug_desc {
  * @type: The plug type from USB PD Cable VDO
  * @active: Is the cable active or passive
  * @identity: Result of Discover Identity command
+ * @pd_revision: USB Power Delivery Specification revision if supported
  *
  * Represents USB Type-C Cable attached to USB Type-C port.
  */
@@ -171,6 +172,8 @@ struct typec_cable_desc {
 	enum typec_plug_type	type;
 	unsigned int		active:1;
 	struct usb_pd_identity	*identity;
+	u16			pd_revision; /* 0300H = "3.0" */
+
 };
 
 /*
@@ -178,15 +181,22 @@ struct typec_cable_desc {
  * @usb_pd: USB Power Delivery support
  * @accessory: Audio, Debug or none.
  * @identity: Discover Identity command data
+ * @pd_revision: USB Power Delivery Specification Revision if supported
  *
  * Details about a partner that is attached to USB Type-C port. If @identity
  * member exists when partner is registered, a directory named "identity" is
  * created to sysfs for the partner device.
+ *
+ * @pd_revision is based on the setting of the "Specification Revision" field
+ * in the message header on the initial "Source Capabilities" message received
+ * from the partner, or a "Request" message received from the partner, depending
+ * on whether our port is a Sink or a Source.
  */
 struct typec_partner_desc {
 	unsigned int		usb_pd:1;
 	enum typec_accessory	accessory;
 	struct usb_pd_identity	*identity;
+	u16			pd_revision; /* 0300H = "3.0" */
 };
 
 /**
-- 
cgit v1.2.3


From 29b01295a829fba7399ee84afff4e64660e49f04 Mon Sep 17 00:00:00 2001
From: Benson Leung <bleung@chromium.org>
Date: Thu, 28 Jan 2021 22:14:03 -0800
Subject: usb: typec: Add typec_partner_set_pd_revision

The partner's PD revision may be resolved later than the port partner
registration since the port partner creation may take place once
Type-C detects the port has changed state, but before PD communication is
completed.

Add a setter so that the partner's PD revision can be attached to it once
it becomes available.

If the revision is set to a valid version (not 0), the setter will also
refresh the partner's usb_pd flag and notify on "supports_usb_power_delivery"
sysfs property as well.

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Benson Leung <bleung@chromium.org>
Link: https://lore.kernel.org/r/20210129061406.2680146-4-bleung@chromium.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/class.c | 30 ++++++++++++++++++++++++++++++
 include/linux/usb/typec.h |  1 +
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index b5241f4756c2..b6ceab3dc16b 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -748,6 +748,36 @@ int typec_partner_set_identity(struct typec_partner *partner)
 }
 EXPORT_SYMBOL_GPL(typec_partner_set_identity);
 
+/**
+ * typec_partner_set_pd_revision - Set the PD revision supported by the partner
+ * @partner: The partner to be updated.
+ * @pd_revision:  USB Power Delivery Specification Revision supported by partner
+ *
+ * This routine is used to report that the PD revision of the port partner has
+ * become available.
+ *
+ * Returns 0 on success or negative error number on failure.
+ */
+int typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision)
+{
+	int ret;
+
+	if (partner->pd_revision == pd_revision)
+		return 0;
+
+	partner->pd_revision = pd_revision;
+	sysfs_notify(&partner->dev.kobj, NULL, "usb_power_delivery_revision");
+	if (pd_revision != 0 && !partner->usb_pd) {
+		partner->usb_pd = 1;
+		sysfs_notify(&partner->dev.kobj, NULL,
+			     "supports_usb_power_delivery");
+	}
+	kobject_uevent(&partner->dev.kobj, KOBJ_CHANGE);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(typec_partner_set_pd_revision);
+
 /**
  * typec_partner_set_num_altmodes - Set the number of available partner altmodes
  * @partner: The partner to be updated.
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index 42c6b7c07a99..4946eca742d5 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -126,6 +126,7 @@ struct typec_altmode_desc {
 	enum typec_port_data	roles;
 };
 
+int typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision);
 int typec_partner_set_num_altmodes(struct typec_partner *partner, int num_altmodes);
 struct typec_altmode
 *typec_partner_register_altmode(struct typec_partner *partner,
-- 
cgit v1.2.3


From 2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Thu, 28 Jan 2021 14:40:07 -0800
Subject: perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT

Current PERF_SAMPLE_WEIGHT sample type is very useful to expresses the
cost of an action represented by the sample. This allows the profiler
to scale the samples to be more informative to the programmer. It could
also help to locate a hotspot, e.g., when profiling by memory latencies,
the expensive load appear higher up in the histograms. But current
PERF_SAMPLE_WEIGHT sample type is solely determined by one factor. This
could be a problem, if users want two or more factors to contribute to
the weight. For example, Golden Cove core PMU can provide both the
instruction latency and the cache Latency information as factors for the
memory profiling.

For current X86 platforms, although meminfo::latency is defined as a
u64, only the lower 32 bits include the valid data in practice (No
memory access could last than 4G cycles). The higher 32 bits can be used
to store new factors.

Add a new sample type, PERF_SAMPLE_WEIGHT_STRUCT, to indicate the new
sample weight structure. It shares the same space as the
PERF_SAMPLE_WEIGHT sample type.

Users can apply either the PERF_SAMPLE_WEIGHT sample type or the
PERF_SAMPLE_WEIGHT_STRUCT sample type to retrieve the sample weight, but
they cannot apply both sample types simultaneously.

Currently, only X86 and PowerPC use the PERF_SAMPLE_WEIGHT sample type.
- For PowerPC, there is nothing changed for the PERF_SAMPLE_WEIGHT
  sample type. There is no effect for the new PERF_SAMPLE_WEIGHT_STRUCT
  sample type. PowerPC can re-struct the weight field similarly later.
- For X86, the same value will be dumped for the PERF_SAMPLE_WEIGHT
  sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type for now.
  The following patches will apply the new factors for the
  PERF_SAMPLE_WEIGHT_STRUCT sample type.

The field in the union perf_sample_weight should be shared among
different architectures. A generic name is required, but it's hard to
abstract a name that applies to all architectures. For example, on X86,
the fields are to store all kinds of latency. While on PowerPC, it
stores MMCRA[TECX/TECM], which should not be latency. So a general name
prefix 'var$NUM' is used here.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1611873611-156687-2-git-send-email-kan.liang@linux.intel.com
---
 arch/powerpc/perf/core-book3s.c |  2 +-
 arch/x86/events/intel/ds.c      | 17 +++++++++--------
 include/linux/perf_event.h      |  4 ++--
 include/uapi/linux/perf_event.h | 42 +++++++++++++++++++++++++++++++++++++++--
 kernel/events/core.c            | 11 +++++++----
 5 files changed, 59 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 28206b1fe172..869d999a836e 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2195,7 +2195,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 
 		if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
 						ppmu->get_mem_weight)
-			ppmu->get_mem_weight(&data.weight);
+			ppmu->get_mem_weight(&data.weight.full);
 
 		if (perf_event_overflow(event, &data, regs))
 			power_pmu_stop(event, 0);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 67dbc91bccfe..2f54b1fbb895 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -960,7 +960,8 @@ static void adaptive_pebs_record_size_update(void)
 }
 
 #define PERF_PEBS_MEMINFO_TYPE	(PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC |   \
-				PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \
+				PERF_SAMPLE_PHYS_ADDR |			     \
+				PERF_SAMPLE_WEIGHT_TYPE |		     \
 				PERF_SAMPLE_TRANSACTION |		     \
 				PERF_SAMPLE_DATA_PAGE_SIZE)
 
@@ -987,7 +988,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
 	gprs = (sample_type & PERF_SAMPLE_REGS_INTR) &&
 	       (attr->sample_regs_intr & PEBS_GP_REGS);
 
-	tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) &&
+	tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
 		     ((attr->config & INTEL_ARCH_EVENT_MASK) ==
 		      x86_pmu.rtm_abort_event);
 
@@ -1369,8 +1370,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
 	/*
 	 * Use latency for weight (only avail with PEBS-LL)
 	 */
-	if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
-		data->weight = pebs->lat;
+	if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE))
+		data->weight.full = pebs->lat;
 
 	/*
 	 * data.data_src encodes the data source
@@ -1462,8 +1463,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
 
 	if (x86_pmu.intel_cap.pebs_format >= 2) {
 		/* Only set the TSX weight when no memory weight. */
-		if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
-			data->weight = intel_get_tsx_weight(pebs->tsx_tuning);
+		if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll)
+			data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning);
 
 		if (sample_type & PERF_SAMPLE_TRANSACTION)
 			data->txn = intel_get_tsx_transaction(pebs->tsx_tuning,
@@ -1577,8 +1578,8 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 	}
 
 	if (format_size & PEBS_DATACFG_MEMINFO) {
-		if (sample_type & PERF_SAMPLE_WEIGHT)
-			data->weight = meminfo->latency ?:
+		if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+			data->weight.full = meminfo->latency ?:
 				intel_get_tsx_weight(meminfo->tsx_tuning);
 
 		if (sample_type & PERF_SAMPLE_DATA_SRC)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9a38f579bc76..fab42cfbd350 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -998,7 +998,7 @@ struct perf_sample_data {
 	struct perf_raw_record		*raw;
 	struct perf_branch_stack	*br_stack;
 	u64				period;
-	u64				weight;
+	union perf_sample_weight	weight;
 	u64				txn;
 	union  perf_mem_data_src	data_src;
 
@@ -1047,7 +1047,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
 	data->raw  = NULL;
 	data->br_stack = NULL;
 	data->period = period;
-	data->weight = 0;
+	data->weight.full = 0;
 	data->data_src.val = PERF_MEM_NA;
 	data->txn = 0;
 }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b15e3447cd9f..b2cc246ec119 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -145,12 +145,14 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_CGROUP			= 1U << 21,
 	PERF_SAMPLE_DATA_PAGE_SIZE		= 1U << 22,
 	PERF_SAMPLE_CODE_PAGE_SIZE		= 1U << 23,
+	PERF_SAMPLE_WEIGHT_STRUCT		= 1U << 24,
 
-	PERF_SAMPLE_MAX = 1U << 24,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 25,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
 
+#define PERF_SAMPLE_WEIGHT_TYPE	(PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT)
 /*
  * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set
  *
@@ -890,7 +892,24 @@ enum perf_event_type {
 	 * 	  char			data[size];
 	 * 	  u64			dyn_size; } && PERF_SAMPLE_STACK_USER
 	 *
-	 *	{ u64			weight;   } && PERF_SAMPLE_WEIGHT
+	 *	{ union perf_sample_weight
+	 *	 {
+	 *		u64		full; && PERF_SAMPLE_WEIGHT
+	 *	#if defined(__LITTLE_ENDIAN_BITFIELD)
+	 *		struct {
+	 *			u32	var1_dw;
+	 *			u16	var2_w;
+	 *			u16	var3_w;
+	 *		} && PERF_SAMPLE_WEIGHT_STRUCT
+	 *	#elif defined(__BIG_ENDIAN_BITFIELD)
+	 *		struct {
+	 *			u16	var3_w;
+	 *			u16	var2_w;
+	 *			u32	var1_dw;
+	 *		} && PERF_SAMPLE_WEIGHT_STRUCT
+	 *	#endif
+	 *	 }
+	 *	}
 	 *	{ u64			data_src; } && PERF_SAMPLE_DATA_SRC
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 *	{ u64			abi; # enum perf_sample_regs_abi
@@ -1248,4 +1267,23 @@ struct perf_branch_entry {
 		reserved:40;
 };
 
+union perf_sample_weight {
+	__u64		full;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	struct {
+		__u32	var1_dw;
+		__u16	var2_w;
+		__u16	var3_w;
+	};
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	struct {
+		__u16	var3_w;
+		__u16	var2_w;
+		__u32	var1_dw;
+	};
+#else
+#error "Unknown endianness"
+#endif
+};
+
 #endif /* _UAPI_LINUX_PERF_EVENT_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 55d18791a72d..5206097d4d3d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1879,8 +1879,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		size += sizeof(data->period);
 
-	if (sample_type & PERF_SAMPLE_WEIGHT)
-		size += sizeof(data->weight);
+	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+		size += sizeof(data->weight.full);
 
 	if (sample_type & PERF_SAMPLE_READ)
 		size += event->read_size;
@@ -6907,8 +6907,8 @@ void perf_output_sample(struct perf_output_handle *handle,
 					  data->regs_user.regs);
 	}
 
-	if (sample_type & PERF_SAMPLE_WEIGHT)
-		perf_output_put(handle, data->weight);
+	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+		perf_output_put(handle, data->weight.full);
 
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		perf_output_put(handle, data->data_src.val);
@@ -11564,6 +11564,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	if (attr->sample_type & PERF_SAMPLE_CGROUP)
 		return -EINVAL;
 #endif
+	if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
+	    (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
+		return -EINVAL;
 
 out:
 	return ret;
-- 
cgit v1.2.3


From 1af7e7f8c12f521c111bd7cf0d138be7e15b51a5 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Thu, 28 Jan 2021 09:55:01 -0500
Subject: NFS: Refactor nfs_readpage() and nfs_readpage_async() to use
 nfs_readdesc

Both nfs_readpage() and nfs_readpages() use similar code.
This patch should be no functional change, and refactors
nfs_readpage_async() to use nfs_readdesc to enable future
merging of nfs_readpage_async() and nfs_readpage_async_filler().

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/read.c          | 62 ++++++++++++++++++++++++--------------------------
 include/linux/nfs_fs.h |  3 +--
 2 files changed, 31 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 464077daf62f..8c05e56dab65 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -114,18 +114,23 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
 	nfs_release_request(req);
 }
 
-int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
+struct nfs_readdesc {
+	struct nfs_pageio_descriptor pgio;
+	struct nfs_open_context *ctx;
+};
+
+int nfs_readpage_async(void *data, struct inode *inode,
 		       struct page *page)
 {
+	struct nfs_readdesc *desc = data;
 	struct nfs_page	*new;
 	unsigned int len;
-	struct nfs_pageio_descriptor pgio;
 	struct nfs_pgio_mirror *pgm;
 
 	len = nfs_page_length(page);
 	if (len == 0)
 		return nfs_return_empty_page(page);
-	new = nfs_create_request(ctx, page, 0, len);
+	new = nfs_create_request(desc->ctx, page, 0, len);
 	if (IS_ERR(new)) {
 		unlock_page(page);
 		return PTR_ERR(new);
@@ -133,21 +138,21 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	if (len < PAGE_SIZE)
 		zero_user_segment(page, len, PAGE_SIZE);
 
-	nfs_pageio_init_read(&pgio, inode, false,
+	nfs_pageio_init_read(&desc->pgio, inode, false,
 			     &nfs_async_read_completion_ops);
-	if (!nfs_pageio_add_request(&pgio, new)) {
+	if (!nfs_pageio_add_request(&desc->pgio, new)) {
 		nfs_list_remove_request(new);
-		nfs_readpage_release(new, pgio.pg_error);
+		nfs_readpage_release(new, desc->pgio.pg_error);
 	}
-	nfs_pageio_complete(&pgio);
+	nfs_pageio_complete(&desc->pgio);
 
 	/* It doesn't make sense to do mirrored reads! */
-	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+	WARN_ON_ONCE(desc->pgio.pg_mirror_count != 1);
 
-	pgm = &pgio.pg_mirrors[0];
+	pgm = &desc->pgio.pg_mirrors[0];
 	NFS_I(inode)->read_io += pgm->pg_bytes_written;
 
-	return pgio.pg_error < 0 ? pgio.pg_error : 0;
+	return desc->pgio.pg_error < 0 ? desc->pgio.pg_error : 0;
 }
 
 static void nfs_page_group_set_uptodate(struct nfs_page *req)
@@ -312,7 +317,7 @@ static void nfs_readpage_result(struct rpc_task *task,
  */
 int nfs_readpage(struct file *file, struct page *page)
 {
-	struct nfs_open_context *ctx;
+	struct nfs_readdesc desc;
 	struct inode *inode = page_file_mapping(page)->host;
 	int ret;
 
@@ -339,39 +344,34 @@ int nfs_readpage(struct file *file, struct page *page)
 
 	if (file == NULL) {
 		ret = -EBADF;
-		ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
-		if (ctx == NULL)
+		desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
+		if (desc.ctx == NULL)
 			goto out_unlock;
 	} else
-		ctx = get_nfs_open_context(nfs_file_open_context(file));
+		desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
 
 	if (!IS_SYNC(inode)) {
-		ret = nfs_readpage_from_fscache(ctx, inode, page);
+		ret = nfs_readpage_from_fscache(desc.ctx, inode, page);
 		if (ret == 0)
 			goto out;
 	}
 
-	xchg(&ctx->error, 0);
-	ret = nfs_readpage_async(ctx, inode, page);
+	xchg(&desc.ctx->error, 0);
+	ret = nfs_readpage_async(&desc, inode, page);
 	if (!ret) {
 		ret = wait_on_page_locked_killable(page);
 		if (!PageUptodate(page) && !ret)
-			ret = xchg(&ctx->error, 0);
+			ret = xchg(&desc.ctx->error, 0);
 	}
 	nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 out:
-	put_nfs_open_context(ctx);
+	put_nfs_open_context(desc.ctx);
 	return ret;
 out_unlock:
 	unlock_page(page);
 	return ret;
 }
 
-struct nfs_readdesc {
-	struct nfs_pageio_descriptor *pgio;
-	struct nfs_open_context *ctx;
-};
-
 static int
 readpage_async_filler(void *data, struct page *page)
 {
@@ -390,9 +390,9 @@ readpage_async_filler(void *data, struct page *page)
 
 	if (len < PAGE_SIZE)
 		zero_user_segment(page, len, PAGE_SIZE);
-	if (!nfs_pageio_add_request(desc->pgio, new)) {
+	if (!nfs_pageio_add_request(&desc->pgio, new)) {
 		nfs_list_remove_request(new);
-		error = desc->pgio->pg_error;
+		error = desc->pgio.pg_error;
 		nfs_readpage_release(new, error);
 		goto out;
 	}
@@ -407,7 +407,6 @@ out:
 int nfs_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	struct nfs_pageio_descriptor pgio;
 	struct nfs_pgio_mirror *pgm;
 	struct nfs_readdesc desc;
 	struct inode *inode = mapping->host;
@@ -440,17 +439,16 @@ int nfs_readpages(struct file *file, struct address_space *mapping,
 	if (ret == 0)
 		goto read_complete; /* all pages were read */
 
-	desc.pgio = &pgio;
-	nfs_pageio_init_read(&pgio, inode, false,
+	nfs_pageio_init_read(&desc.pgio, inode, false,
 			     &nfs_async_read_completion_ops);
 
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
-	nfs_pageio_complete(&pgio);
+	nfs_pageio_complete(&desc.pgio);
 
 	/* It doesn't make sense to do mirrored reads! */
-	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+	WARN_ON_ONCE(desc.pgio.pg_mirror_count != 1);
 
-	pgm = &pgio.pg_mirrors[0];
+	pgm = &desc.pgio.pg_mirrors[0];
 	NFS_I(inode)->read_io += pgm->pg_bytes_written;
 	npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >>
 		 PAGE_SHIFT;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 681ed98e4ba8..cb0248a34518 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -570,8 +570,7 @@ nfs_have_writebacks(struct inode *inode)
 extern int  nfs_readpage(struct file *, struct page *);
 extern int  nfs_readpages(struct file *, struct address_space *,
 		struct list_head *, unsigned);
-extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
-			       struct page *);
+extern int  nfs_readpage_async(void *, struct inode *, struct page *);
 
 /*
  * inline functions
-- 
cgit v1.2.3


From 1e83b173b2663b7d357309584678cf24787f0713 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Thu, 28 Jan 2021 09:55:03 -0500
Subject: NFS: Add nfs_pageio_complete_read() and remove nfs_readpage_async()

Add nfs_pageio_complete_read() and call this from both nfs_readpage()
and nfs_readpages(), since the submission and accounting is the same
for both functions.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/fscache.c       |   4 --
 fs/nfs/read.c          | 137 ++++++++++++++++++++++---------------------------
 include/linux/nfs_fs.h |   1 -
 3 files changed, 61 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index a60df88efc40..c4c021c6ebbd 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -390,10 +390,6 @@ static void nfs_readpage_from_fscache_complete(struct page *page,
 	if (!error) {
 		SetPageUptodate(page);
 		unlock_page(page);
-	} else {
-		error = nfs_readpage_async(context, page->mapping->host, page);
-		if (error)
-			unlock_page(page);
 	}
 }
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 0ed79e6bc486..d2b6dce1f99f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -74,6 +74,24 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
 
+static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio,
+				     struct inode *inode)
+{
+	struct nfs_pgio_mirror *pgm;
+	unsigned long npages;
+
+	nfs_pageio_complete(pgio);
+
+	/* It doesn't make sense to do mirrored reads! */
+	WARN_ON_ONCE(pgio->pg_mirror_count != 1);
+
+	pgm = &pgio->pg_mirrors[0];
+	NFS_I(inode)->read_io += pgm->pg_bytes_written;
+	npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+}
+
+
 void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
 {
 	struct nfs_pgio_mirror *mirror;
@@ -119,36 +137,6 @@ struct nfs_readdesc {
 	struct nfs_open_context *ctx;
 };
 
-static int readpage_async_filler(void *data, struct page *page);
-
-int nfs_readpage_async(void *data, struct inode *inode,
-		       struct page *page)
-{
-	struct nfs_readdesc *desc = data;
-	struct nfs_pgio_mirror *pgm;
-	int error;
-
-	nfs_pageio_init_read(&desc->pgio, inode, false,
-			     &nfs_async_read_completion_ops);
-
-	error = readpage_async_filler(desc, page);
-	if (error)
-		goto out;
-
-	nfs_pageio_complete(&desc->pgio);
-
-	/* It doesn't make sense to do mirrored reads! */
-	WARN_ON_ONCE(desc->pgio.pg_mirror_count != 1);
-
-	pgm = &desc->pgio.pg_mirrors[0];
-	NFS_I(inode)->read_io += pgm->pg_bytes_written;
-
-	return desc->pgio.pg_error < 0 ? desc->pgio.pg_error : 0;
-
-out:
-	return error;
-}
-
 static void nfs_page_group_set_uptodate(struct nfs_page *req)
 {
 	if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
@@ -170,8 +158,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
 
 		if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
 			/* note: regions of the page not covered by a
-			 * request are zeroed in nfs_readpage_async /
-			 * readpage_async_filler */
+			 * request are zeroed in readpage_async_filler */
 			if (bytes > hdr->good_bytes) {
 				/* nothing in this request was good, so zero
 				 * the full extent of the request */
@@ -303,6 +290,38 @@ static void nfs_readpage_result(struct rpc_task *task,
 		nfs_readpage_retry(task, hdr);
 }
 
+static int
+readpage_async_filler(void *data, struct page *page)
+{
+	struct nfs_readdesc *desc = data;
+	struct nfs_page *new;
+	unsigned int len;
+	int error;
+
+	len = nfs_page_length(page);
+	if (len == 0)
+		return nfs_return_empty_page(page);
+
+	new = nfs_create_request(desc->ctx, page, 0, len);
+	if (IS_ERR(new))
+		goto out_error;
+
+	if (len < PAGE_SIZE)
+		zero_user_segment(page, len, PAGE_SIZE);
+	if (!nfs_pageio_add_request(&desc->pgio, new)) {
+		nfs_list_remove_request(new);
+		error = desc->pgio.pg_error;
+		nfs_readpage_release(new, error);
+		goto out;
+	}
+	return 0;
+out_error:
+	error = PTR_ERR(new);
+	unlock_page(page);
+out:
+	return error;
+}
+
 /*
  * Read a page over NFS.
  * We read the page synchronously in the following case:
@@ -351,13 +370,20 @@ int nfs_readpage(struct file *file, struct page *page)
 	}
 
 	xchg(&desc.ctx->error, 0);
-	ret = nfs_readpage_async(&desc, inode, page);
+	nfs_pageio_init_read(&desc.pgio, inode, false,
+			     &nfs_async_read_completion_ops);
+
+	ret = readpage_async_filler(&desc, page);
+
+	if (!ret)
+		nfs_pageio_complete_read(&desc.pgio, inode);
+
+	ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0;
 	if (!ret) {
 		ret = wait_on_page_locked_killable(page);
 		if (!PageUptodate(page) && !ret)
 			ret = xchg(&desc.ctx->error, 0);
 	}
-	nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 out:
 	put_nfs_open_context(desc.ctx);
 	return ret;
@@ -366,45 +392,11 @@ out_unlock:
 	return ret;
 }
 
-static int
-readpage_async_filler(void *data, struct page *page)
-{
-	struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
-	struct nfs_page *new;
-	unsigned int len;
-	int error;
-
-	len = nfs_page_length(page);
-	if (len == 0)
-		return nfs_return_empty_page(page);
-
-	new = nfs_create_request(desc->ctx, page, 0, len);
-	if (IS_ERR(new))
-		goto out_error;
-
-	if (len < PAGE_SIZE)
-		zero_user_segment(page, len, PAGE_SIZE);
-	if (!nfs_pageio_add_request(&desc->pgio, new)) {
-		nfs_list_remove_request(new);
-		error = desc->pgio.pg_error;
-		nfs_readpage_release(new, error);
-		goto out;
-	}
-	return 0;
-out_error:
-	error = PTR_ERR(new);
-	unlock_page(page);
-out:
-	return error;
-}
-
 int nfs_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	struct nfs_pgio_mirror *pgm;
 	struct nfs_readdesc desc;
 	struct inode *inode = mapping->host;
-	unsigned long npages;
 	int ret;
 
 	dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",
@@ -437,16 +429,9 @@ int nfs_readpages(struct file *file, struct address_space *mapping,
 			     &nfs_async_read_completion_ops);
 
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
-	nfs_pageio_complete(&desc.pgio);
 
-	/* It doesn't make sense to do mirrored reads! */
-	WARN_ON_ONCE(desc.pgio.pg_mirror_count != 1);
+	nfs_pageio_complete_read(&desc.pgio, inode);
 
-	pgm = &desc.pgio.pg_mirrors[0];
-	NFS_I(inode)->read_io += pgm->pg_bytes_written;
-	npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >>
-		 PAGE_SHIFT;
-	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
 read_complete:
 	put_nfs_open_context(desc.ctx);
 out:
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index cb0248a34518..3cfcf219e96b 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -570,7 +570,6 @@ nfs_have_writebacks(struct inode *inode)
 extern int  nfs_readpage(struct file *, struct page *);
 extern int  nfs_readpages(struct file *, struct address_space *,
 		struct list_head *, unsigned);
-extern int  nfs_readpage_async(void *, struct inode *, struct page *);
 
 /*
  * inline functions
-- 
cgit v1.2.3


From ec5e32940cc9d2a8a321cb7d756fb6ae45702d03 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 29 Jan 2021 08:54:10 -0800
Subject: vfio: iommu driver notify callback

Define a vfio_iommu_driver_ops notify callback, for sending events to
the driver.  Drivers are not required to provide the callback, and
may ignore any events.  The handling of events is driver specific.

Define the CONTAINER_CLOSE event, called when the container's file
descriptor is closed.  This event signifies that no further state changes
will occur via container ioctl's.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio.c  | 5 +++++
 include/linux/vfio.h | 7 +++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 4ad8a35667a7..38779e6fd80c 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1220,6 +1220,11 @@ static int vfio_fops_open(struct inode *inode, struct file *filep)
 static int vfio_fops_release(struct inode *inode, struct file *filep)
 {
 	struct vfio_container *container = filep->private_data;
+	struct vfio_iommu_driver *driver = container->iommu_driver;
+
+	if (driver && driver->ops->notify)
+		driver->ops->notify(container->iommu_data,
+				    VFIO_IOMMU_CONTAINER_CLOSE);
 
 	filep->private_data = NULL;
 
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index f45940b38a02..b7e18bde5aa8 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -57,6 +57,11 @@ extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
 extern void vfio_device_put(struct vfio_device *device);
 extern void *vfio_device_data(struct vfio_device *device);
 
+/* events for the backend driver notify callback */
+enum vfio_iommu_notify_type {
+	VFIO_IOMMU_CONTAINER_CLOSE = 0,
+};
+
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
  */
@@ -92,6 +97,8 @@ struct vfio_iommu_driver_ops {
 				  void *data, size_t count, bool write);
 	struct iommu_domain *(*group_iommu_domain)(void *iommu_data,
 						   struct iommu_group *group);
+	void		(*notify)(void *iommu_data,
+				  enum vfio_iommu_notify_type event);
 };
 
 extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
-- 
cgit v1.2.3


From 9ff0c6db0605e9b88360048c8d0a6a9ff647eb71 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@nvidia.com>
Date: Fri, 22 Jan 2021 21:24:57 +0200
Subject: platform/x86: mlxcpld: Update module license

Update license to SPDX-License.

Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Acked-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/platform_data/x86/mlxcpld.h | 34 +++----------------------------
 1 file changed, 3 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/mlxcpld.h b/include/linux/platform_data/x86/mlxcpld.h
index b08dcb183fca..e6c18bf017dd 100644
--- a/include/linux/platform_data/x86/mlxcpld.h
+++ b/include/linux/platform_data/x86/mlxcpld.h
@@ -1,36 +1,8 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
 /*
- * mlxcpld.h - Mellanox I2C multiplexer support in CPLD
+ * Mellanox I2C multiplexer support in CPLD
  *
- * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2016 Michael Shych <michaels@mellanox.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the names of the copyright holders nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL") version 2 as published by the Free
- * Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (C) 2016-2020 Mellanox Technologies
  */
 
 #ifndef _LINUX_I2C_MLXCPLD_H
-- 
cgit v1.2.3


From 98d29c410475f30b627502d845794352e9be4046 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@nvidia.com>
Date: Fri, 22 Jan 2021 21:24:58 +0200
Subject: i2c: mux: mlxcpld: Move header file out of x86 realm

Move out header file from include/linux/platform_data/x86/ to
include/linux/platform_data/, since it does not depend on x86
architecture.

Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Reviewed-by: Michael Shych <michaelsh@nvidia.com>
Acked-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/muxes/i2c-mux-mlxcpld.c       |  2 +-
 include/linux/platform_data/mlxcpld.h     | 24 ++++++++++++++++++++++++
 include/linux/platform_data/x86/mlxcpld.h | 24 ------------------------
 3 files changed, 25 insertions(+), 25 deletions(-)
 create mode 100644 include/linux/platform_data/mlxcpld.h
 delete mode 100644 include/linux/platform_data/x86/mlxcpld.h

(limited to 'include/linux')

diff --git a/drivers/i2c/muxes/i2c-mux-mlxcpld.c b/drivers/i2c/muxes/i2c-mux-mlxcpld.c
index 53bce81cf5c9..3d894cfb19df 100644
--- a/drivers/i2c/muxes/i2c-mux-mlxcpld.c
+++ b/drivers/i2c/muxes/i2c-mux-mlxcpld.c
@@ -11,7 +11,7 @@
 #include <linux/io.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/platform_data/x86/mlxcpld.h>
+#include <linux/platform_data/mlxcpld.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 
diff --git a/include/linux/platform_data/mlxcpld.h b/include/linux/platform_data/mlxcpld.h
new file mode 100644
index 000000000000..e6c18bf017dd
--- /dev/null
+++ b/include/linux/platform_data/mlxcpld.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/*
+ * Mellanox I2C multiplexer support in CPLD
+ *
+ * Copyright (C) 2016-2020 Mellanox Technologies
+ */
+
+#ifndef _LINUX_I2C_MLXCPLD_H
+#define _LINUX_I2C_MLXCPLD_H
+
+/* Platform data for the CPLD I2C multiplexers */
+
+/* mlxcpld_mux_plat_data - per mux data, used with i2c_register_board_info
+ * @adap_ids - adapter array
+ * @num_adaps - number of adapters
+ * @sel_reg_addr - mux select register offset in CPLD space
+ */
+struct mlxcpld_mux_plat_data {
+	int *adap_ids;
+	int num_adaps;
+	int sel_reg_addr;
+};
+
+#endif /* _LINUX_I2C_MLXCPLD_H */
diff --git a/include/linux/platform_data/x86/mlxcpld.h b/include/linux/platform_data/x86/mlxcpld.h
deleted file mode 100644
index e6c18bf017dd..000000000000
--- a/include/linux/platform_data/x86/mlxcpld.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
-/*
- * Mellanox I2C multiplexer support in CPLD
- *
- * Copyright (C) 2016-2020 Mellanox Technologies
- */
-
-#ifndef _LINUX_I2C_MLXCPLD_H
-#define _LINUX_I2C_MLXCPLD_H
-
-/* Platform data for the CPLD I2C multiplexers */
-
-/* mlxcpld_mux_plat_data - per mux data, used with i2c_register_board_info
- * @adap_ids - adapter array
- * @num_adaps - number of adapters
- * @sel_reg_addr - mux select register offset in CPLD space
- */
-struct mlxcpld_mux_plat_data {
-	int *adap_ids;
-	int num_adaps;
-	int sel_reg_addr;
-};
-
-#endif /* _LINUX_I2C_MLXCPLD_H */
-- 
cgit v1.2.3


From dd9267034c0e364b49261c3e0070b863286d1242 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Thu, 28 Jan 2021 10:10:48 +0100
Subject: i3c: Make remove callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver core ignores the return value of struct bus_type::remove()
because there is only little that can be done. To simplify the quest to
make this function return void, let struct i3c_driver::remove() return
void, too. This makes it obvious that returning an error code is
a bad idea and future driver authors cannot get that wrong.

Up to now there are no drivers with a remove callback, so there is no
need to adapt drivers.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20210128091048.17006-2-u.kleine-koenig@pengutronix.de
---
 drivers/i3c/master.c       | 10 +++-------
 include/linux/i3c/device.h |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 57a4f699eb8d..f8e9b7305c13 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -326,17 +326,13 @@ static int i3c_device_remove(struct device *dev)
 {
 	struct i3c_device *i3cdev = dev_to_i3cdev(dev);
 	struct i3c_driver *driver = drv_to_i3cdrv(dev->driver);
-	int ret = 0;
 
-	if (driver->remove) {
-		ret = driver->remove(i3cdev);
-		if (ret)
-			return ret;
-	}
+	if (driver->remove)
+		driver->remove(i3cdev);
 
 	i3c_device_free_ibi(i3cdev);
 
-	return ret;
+	return 0;
 }
 
 struct bus_type i3c_bus_type = {
diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h
index de102e4418ab..8242e13e7b0b 100644
--- a/include/linux/i3c/device.h
+++ b/include/linux/i3c/device.h
@@ -176,7 +176,7 @@ struct i3c_device;
 struct i3c_driver {
 	struct device_driver driver;
 	int (*probe)(struct i3c_device *dev);
-	int (*remove)(struct i3c_device *dev);
+	void (*remove)(struct i3c_device *dev);
 	const struct i3c_device_id *id_table;
 };
 
-- 
cgit v1.2.3


From 8dd5cada393f6f4e825833a6ff05b1f51f36a791 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Mon, 18 Jan 2021 03:55:18 +0300
Subject: opp: Add dev_pm_opp_find_level_ceil()

Add a ceil version of the dev_pm_opp_find_level(). It's handy to have if
levels don't start from 0 in OPP table and zero usually means a minimal
level.

Tested-by: Peter Geis <pgwipeout@gmail.com>
Tested-by: Nicolas Chauvet <kwizart@gmail.com>
Tested-by: Matt Merhar <mattmerhar@protonmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h |  8 ++++++++
 2 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index dc7a298f3611..b29f31146770 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -449,6 +449,55 @@ struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_find_level_exact);
 
+/**
+ * dev_pm_opp_find_level_ceil() - search for an rounded up level
+ * @dev:		device for which we do this operation
+ * @level:		level to search for
+ *
+ * Return: Searches for rounded up match in the opp table and returns pointer
+ * to the  matching opp if found, else returns ERR_PTR in case of error and
+ * should be handled using IS_ERR. Error return values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * The callers are required to call dev_pm_opp_put() for the returned OPP after
+ * use.
+ */
+struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
+					      unsigned int *level)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		int r = PTR_ERR(opp_table);
+
+		dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r);
+		return ERR_PTR(r);
+	}
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available && temp_opp->level >= *level) {
+			opp = temp_opp;
+			*level = opp->level;
+
+			/* Increment the reference count of OPP */
+			dev_pm_opp_get(opp);
+			break;
+		}
+	}
+
+	mutex_unlock(&opp_table->lock);
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_level_ceil);
+
 static noinline struct dev_pm_opp *_find_freq_ceil(struct opp_table *opp_table,
 						   unsigned long *freq)
 {
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 1435c054016a..2b3030cb2ed2 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -111,6 +111,8 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
 					      bool available);
 struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev,
 					       unsigned int level);
+struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
+					      unsigned int *level);
 
 struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
 					      unsigned long *freq);
@@ -226,6 +228,12 @@ static inline struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev,
 	return ERR_PTR(-ENOTSUPP);
 }
 
+static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
+					unsigned int *level)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
 					unsigned long *freq)
 {
-- 
cgit v1.2.3


From 597ff5431fd41afa888809f7936508a15c977cde Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Mon, 18 Jan 2021 03:55:19 +0300
Subject: opp: Add dev_pm_opp_get_required_pstate()

Add dev_pm_opp_get_required_pstate() which allows OPP users to retrieve
required performance state of a given OPP.

Tested-by: Peter Geis <pgwipeout@gmail.com>
Tested-by: Nicolas Chauvet <kwizart@gmail.com>
Tested-by: Matt Merhar <mattmerhar@protonmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 22 ++++++++++++++++++++++
 include/linux/pm_opp.h | 10 ++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index b29f31146770..64a95aaa90eb 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -145,6 +145,28 @@ unsigned int dev_pm_opp_get_level(struct dev_pm_opp *opp)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_level);
 
+/**
+ * dev_pm_opp_get_required_pstate() - Gets the required performance state
+ *                                    corresponding to an available opp
+ * @opp:	opp for which performance state has to be returned for
+ * @index:	index of the required opp
+ *
+ * Return: performance state read from device tree corresponding to the
+ * required opp, else return 0.
+ */
+unsigned int dev_pm_opp_get_required_pstate(struct dev_pm_opp *opp,
+					    unsigned int index)
+{
+	if (IS_ERR_OR_NULL(opp) || !opp->available ||
+	    index >= opp->opp_table->required_opp_count) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return 0;
+	}
+
+	return opp->required_opps[index]->pstate;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_required_pstate);
+
 /**
  * dev_pm_opp_is_turbo() - Returns if opp is turbo OPP or not
  * @opp: opp for which turbo mode is being verified
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 2b3030cb2ed2..8f926815bad9 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -98,6 +98,9 @@ unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp);
 
 unsigned int dev_pm_opp_get_level(struct dev_pm_opp *opp);
 
+unsigned int dev_pm_opp_get_required_pstate(struct dev_pm_opp *opp,
+					    unsigned int index);
+
 bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp);
 
 int dev_pm_opp_get_opp_count(struct device *dev);
@@ -186,6 +189,13 @@ static inline unsigned int dev_pm_opp_get_level(struct dev_pm_opp *opp)
 	return 0;
 }
 
+static inline
+unsigned int dev_pm_opp_get_required_pstate(struct dev_pm_opp *opp,
+					    unsigned int index)
+{
+	return 0;
+}
+
 static inline bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp)
 {
 	return false;
-- 
cgit v1.2.3


From ce8073d83f63a2cdcfc1b86d769456726faad51d Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Thu, 21 Jan 2021 01:26:47 +0300
Subject: opp: Add dev_pm_opp_sync_regulators()

Extend OPP API with dev_pm_opp_sync_regulators() function, which syncs
voltage state of regulators.

Tested-by: Peter Geis <pgwipeout@gmail.com>
Tested-by: Nicolas Chauvet <kwizart@gmail.com>
Tested-by: Matt Merhar <mattmerhar@protonmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
[ Viresh: Added unlikely() ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h |  6 ++++++
 2 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 64a95aaa90eb..bf7cdab0ba64 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2584,3 +2584,44 @@ void dev_pm_opp_remove_table(struct device *dev)
 	dev_pm_opp_put_opp_table(opp_table);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_remove_table);
+
+/**
+ * dev_pm_opp_sync_regulators() - Sync state of voltage regulators
+ * @dev:	device for which we do this operation
+ *
+ * Sync voltage state of the OPP table regulators.
+ *
+ * Return: 0 on success or a negative error value.
+ */
+int dev_pm_opp_sync_regulators(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct regulator *reg;
+	int i, ret = 0;
+
+	/* Device may not have OPP table */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return 0;
+
+	/* Regulator may not be required for the device */
+	if (unlikely(!opp_table->regulators))
+		goto put_table;
+
+	/* Nothing to sync if voltage wasn't changed */
+	if (!opp_table->enabled)
+		goto put_table;
+
+	for (i = 0; i < opp_table->regulator_count; i++) {
+		reg = opp_table->regulators[i];
+		ret = regulator_sync_voltage(reg);
+		if (ret)
+			break;
+	}
+put_table:
+	/* Drop reference taken by _find_opp_table() */
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_sync_regulators);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 8f926815bad9..979b208bc4a8 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -161,6 +161,7 @@ int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cp
 int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
 void dev_pm_opp_remove_table(struct device *dev);
 void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask);
+int dev_pm_opp_sync_regulators(struct device *dev);
 #else
 static inline struct opp_table *dev_pm_opp_get_opp_table(struct device *dev)
 {
@@ -384,6 +385,11 @@ static inline void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask
 {
 }
 
+static inline int dev_pm_opp_sync_regulators(struct device *dev)
+{
+	return -ENOTSUPP;
+}
+
 #endif		/* CONFIG_PM_OPP */
 
 #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
-- 
cgit v1.2.3


From 559fef0dfd91145b59b7c61061504f344ecf9ad8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 27 Jan 2021 14:23:45 +0530
Subject: opp: Add dev_pm_opp_of_add_table_noclk()

A few drivers have device's clk but they don't want the OPP core to
handle that. Add a new helper for them, dev_pm_opp_of_add_table_noclk().

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/opp/of.c       | 18 ++++++++++++++++++
 include/linux/pm_opp.h |  6 ++++++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index d4b51b2e384f..a905497c75f8 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1030,6 +1030,24 @@ int dev_pm_opp_of_add_table_indexed(struct device *dev, int index)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_indexed);
 
+/**
+ * dev_pm_opp_of_add_table_noclk() - Initialize indexed opp table from device
+ *		tree without getting clk for device.
+ * @dev:	device pointer used to lookup OPP table.
+ * @index:	Index number.
+ *
+ * Register the initial OPP table with the OPP library for given device only
+ * using the "operating-points-v2" property. Do not try to get the clk for the
+ * device.
+ *
+ * Return: Refer to dev_pm_opp_of_add_table() for return values.
+ */
+int dev_pm_opp_of_add_table_noclk(struct device *dev, int index)
+{
+	return _of_add_table_indexed(dev, index, false);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_noclk);
+
 /* CPU device specific helpers */
 
 /**
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 979b208bc4a8..158158620dde 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -395,6 +395,7 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev)
 #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
 int dev_pm_opp_of_add_table(struct device *dev);
 int dev_pm_opp_of_add_table_indexed(struct device *dev, int index);
+int dev_pm_opp_of_add_table_noclk(struct device *dev, int index);
 void dev_pm_opp_of_remove_table(struct device *dev);
 int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask);
 void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask);
@@ -419,6 +420,11 @@ static inline int dev_pm_opp_of_add_table_indexed(struct device *dev, int index)
 	return -ENOTSUPP;
 }
 
+static inline int dev_pm_opp_of_add_table_noclk(struct device *dev, int index)
+{
+	return -ENOTSUPP;
+}
+
 static inline void dev_pm_opp_of_remove_table(struct device *dev)
 {
 }
-- 
cgit v1.2.3


From a3c47af6942dc8e07a4328913d0263a965786895 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Mon, 18 Jan 2021 03:55:20 +0300
Subject: opp: Add devm_pm_opp_register_set_opp_helper

Add resource-managed version of dev_pm_opp_register_set_opp_helper().

Tested-by: Peter Geis <pgwipeout@gmail.com>
Tested-by: Nicolas Chauvet <kwizart@gmail.com>
Tested-by: Matt Merhar <mattmerhar@protonmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
[ Viresh: Manually apply the patch and relocate the routines ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 34 ++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h |  8 ++++++++
 2 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 52f4a64926e6..09069a564896 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2106,6 +2106,40 @@ void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_unregister_set_opp_helper);
 
+static void devm_pm_opp_unregister_set_opp_helper(void *data)
+{
+	dev_pm_opp_unregister_set_opp_helper(data);
+}
+
+/**
+ * devm_pm_opp_register_set_opp_helper() - Register custom set OPP helper
+ * @dev: Device for which the helper is getting registered.
+ * @set_opp: Custom set OPP helper.
+ *
+ * This is a resource-managed version of dev_pm_opp_register_set_opp_helper().
+ *
+ * Return: pointer to 'struct opp_table' on success and errorno otherwise.
+ */
+struct opp_table *
+devm_pm_opp_register_set_opp_helper(struct device *dev,
+				    int (*set_opp)(struct dev_pm_set_opp_data *data))
+{
+	struct opp_table *opp_table;
+	int err;
+
+	opp_table = dev_pm_opp_register_set_opp_helper(dev, set_opp);
+	if (IS_ERR(opp_table))
+		return opp_table;
+
+	err = devm_add_action_or_reset(dev, devm_pm_opp_unregister_set_opp_helper,
+				       opp_table);
+	if (err)
+		return ERR_PTR(err);
+
+	return opp_table;
+}
+EXPORT_SYMBOL_GPL(devm_pm_opp_register_set_opp_helper);
+
 static void _opp_detach_genpd(struct opp_table *opp_table)
 {
 	int index;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 158158620dde..473daf34160d 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -152,6 +152,7 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char * name);
 void dev_pm_opp_put_clkname(struct opp_table *opp_table);
 struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
+struct opp_table *devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
 void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
@@ -324,6 +325,13 @@ static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device
 
 static inline void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) {}
 
+static inline struct opp_table *
+devm_pm_opp_register_set_opp_helper(struct device *dev,
+				    int (*set_opp)(struct dev_pm_set_opp_data *data))
+{
+	return ERR_PTR(-ENOTSUPP);
+}
+
 static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 {
 	return ERR_PTR(-ENOTSUPP);
-- 
cgit v1.2.3


From b4b9e223eccaeec6e05d927c292d4425fd18f243 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Mon, 18 Jan 2021 03:55:21 +0300
Subject: opp: Add devm_pm_opp_attach_genpd

Add resource-managed version of dev_pm_opp_attach_genpd().

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
[ Viresh: Manually apply the patch and relocate the routines ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h |  7 +++++++
 2 files changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 09069a564896..ce0ec5bde22a 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2275,6 +2275,42 @@ void dev_pm_opp_detach_genpd(struct opp_table *opp_table)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_detach_genpd);
 
+static void devm_pm_opp_detach_genpd(void *data)
+{
+	dev_pm_opp_detach_genpd(data);
+}
+
+/**
+ * devm_pm_opp_attach_genpd - Attach genpd(s) for the device and save virtual
+ *			      device pointer
+ * @dev: Consumer device for which the genpd is getting attached.
+ * @names: Null terminated array of pointers containing names of genpd to attach.
+ * @virt_devs: Pointer to return the array of virtual devices.
+ *
+ * This is a resource-managed version of dev_pm_opp_attach_genpd().
+ *
+ * Return: pointer to 'struct opp_table' on success and errorno otherwise.
+ */
+struct opp_table *
+devm_pm_opp_attach_genpd(struct device *dev, const char **names,
+			 struct device ***virt_devs)
+{
+	struct opp_table *opp_table;
+	int err;
+
+	opp_table = dev_pm_opp_attach_genpd(dev, names, virt_devs);
+	if (IS_ERR(opp_table))
+		return opp_table;
+
+	err = devm_add_action_or_reset(dev, devm_pm_opp_detach_genpd,
+				       opp_table);
+	if (err)
+		return ERR_PTR(err);
+
+	return opp_table;
+}
+EXPORT_SYMBOL_GPL(devm_pm_opp_attach_genpd);
+
 /**
  * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for src_table.
  * @src_table: OPP table which has dst_table as one of its required OPP table.
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 473daf34160d..a2c871799603 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -155,6 +155,7 @@ void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
 struct opp_table *devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
 void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
+struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
 int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp);
@@ -360,6 +361,12 @@ static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, cons
 
 static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {}
 
+static inline struct opp_table *devm_pm_opp_attach_genpd(struct device *dev,
+				const char **names, struct device ***virt_devs)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
+
 static inline int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate)
 {
 	return -ENOTSUPP;
-- 
cgit v1.2.3


From abbe348340c7df9e08fd7c24491c1be31ab65370 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 21 Jan 2021 12:15:36 +0530
Subject: opp: Implement dev_pm_opp_set_opp()

The new helper dev_pm_opp_set_opp() can be used for configuring the
devices for a particular OPP and can be used by different type of
devices, even the ones which don't change frequency (like power
domains).

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/opp/core.c     | 28 ++++++++++++++++++++++++++++
 include/linux/pm_opp.h |  6 ++++++
 2 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 2b5584ef0350..fac84d5a1d45 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1136,6 +1136,34 @@ put_opp_table:
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
 
+/**
+ * dev_pm_opp_set_opp() - Configure device for OPP
+ * @dev: device for which we do this operation
+ * @opp: OPP to set to
+ *
+ * This configures the device based on the properties of the OPP passed to this
+ * routine.
+ *
+ * Return: 0 on success, a negative error number otherwise.
+ */
+int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		dev_err(dev, "%s: device opp doesn't exist\n", __func__);
+		return PTR_ERR(opp_table);
+	}
+
+	ret = _set_opp(dev, opp_table, opp, opp ? opp->rate : 0);
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_opp);
+
 /* OPP-dev Helpers */
 static void _remove_opp_dev(struct opp_device *opp_dev,
 			    struct opp_table *opp_table)
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index a2c871799603..7b1d47ab3fb3 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -158,6 +158,7 @@ void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
 struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
+int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp);
 int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp);
 int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask);
 int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
@@ -377,6 +378,11 @@ static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_f
 	return -ENOTSUPP;
 }
 
+static inline int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp)
+{
+	return -ENOTSUPP;
+}
+
 static inline int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 240ae50e23061cd1fe1937daab195c17226ffd2e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 21 Jan 2021 15:27:55 +0530
Subject: opp: Remove dev_pm_opp_set_bw()

All the users have migrated to dev_pm_opp_set_opp() now, get rid of the
duplicate API, dev_pm_opp_set_bw(), which only performs a part of the new API.

While at it, remove the unnecessary parameter to _set_opp_bw().

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/opp/core.c     | 41 +++++------------------------------------
 include/linux/pm_opp.h |  6 ------
 2 files changed, 5 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index fac84d5a1d45..6958a5cd2fd8 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -798,7 +798,7 @@ restore_voltage:
 }
 
 static int _set_opp_bw(const struct opp_table *opp_table,
-		       struct dev_pm_opp *opp, struct device *dev, bool remove)
+		       struct dev_pm_opp *opp, struct device *dev)
 {
 	u32 avg, peak;
 	int i, ret;
@@ -807,7 +807,7 @@ static int _set_opp_bw(const struct opp_table *opp_table,
 		return 0;
 
 	for (i = 0; i < opp_table->path_count; i++) {
-		if (remove) {
+		if (!opp) {
 			avg = 0;
 			peak = 0;
 		} else {
@@ -817,7 +817,7 @@ static int _set_opp_bw(const struct opp_table *opp_table,
 		ret = icc_set_bw(opp_table->paths[i], avg, peak);
 		if (ret) {
 			dev_err(dev, "Failed to %s bandwidth[%d]: %d\n",
-				remove ? "remove" : "set", i, ret);
+				opp ? "set" : "remove", i, ret);
 			return ret;
 		}
 	}
@@ -917,37 +917,6 @@ static int _set_required_opps(struct device *dev,
 	return ret;
 }
 
-/**
- * dev_pm_opp_set_bw() - sets bandwidth levels corresponding to an opp
- * @dev:	device for which we do this operation
- * @opp:	opp based on which the bandwidth levels are to be configured
- *
- * This configures the bandwidth to the levels specified by the OPP. However
- * if the OPP specified is NULL the bandwidth levels are cleared out.
- *
- * Return: 0 on success or a negative error value.
- */
-int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		dev_err(dev, "%s: device opp table doesn't exist\n", __func__);
-		return PTR_ERR(opp_table);
-	}
-
-	if (opp)
-		ret = _set_opp_bw(opp_table, opp, dev, false);
-	else
-		ret = _set_opp_bw(opp_table, NULL, dev, true);
-
-	dev_pm_opp_put_opp_table(opp_table);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_bw);
-
 static void _find_current_opp(struct device *dev, struct opp_table *opp_table)
 {
 	struct dev_pm_opp *opp = ERR_PTR(-ENODEV);
@@ -988,7 +957,7 @@ static int _disable_opp_table(struct device *dev, struct opp_table *opp_table)
 	if (!_get_opp_count(opp_table))
 		return 0;
 
-	ret = _set_opp_bw(opp_table, NULL, dev, true);
+	ret = _set_opp_bw(opp_table, NULL, dev);
 	if (ret)
 		return ret;
 
@@ -1056,7 +1025,7 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 	}
 
 	if (!ret) {
-		ret = _set_opp_bw(opp_table, opp, dev, false);
+		ret = _set_opp_bw(opp_table, opp, dev);
 		if (!ret) {
 			opp_table->enabled = true;
 			dev_pm_opp_put(old_opp);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 7b1d47ab3fb3..25e47ab937b9 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -159,7 +159,6 @@ struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **name
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
 int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp);
-int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp);
 int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask);
 int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
 void dev_pm_opp_remove_table(struct device *dev);
@@ -383,11 +382,6 @@ static inline int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp)
 	return -ENOTSUPP;
 }
 
-static inline int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask)
 {
 	return -ENOTSUPP;
-- 
cgit v1.2.3


From f3988bc5d58b768c5cf0dadf5f0e49f7176432df Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 1 Feb 2021 10:35:07 +0530
Subject: opp: Fix "foo * bar" should be "foo *bar"

Fix checkpatch warning:

ERROR: "foo * bar" should be "foo *bar".

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 25e47ab937b9..c6c7d73eb015 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -148,7 +148,7 @@ struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
 struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
 void dev_pm_opp_put_regulators(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char * name);
+struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
 void dev_pm_opp_put_clkname(struct opp_table *opp_table);
 struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
@@ -347,7 +347,7 @@ static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, co
 
 static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {}
 
-static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char * name)
+static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
 {
 	return ERR_PTR(-ENOTSUPP);
 }
-- 
cgit v1.2.3


From 1d614920318b914f86c1fec2adec06ad2f7c3f55 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 1 Feb 2021 10:48:54 +0530
Subject: opp: Replace ENOTSUPP with EOPNOTSUPP

Checkpatch gives following warning for new patches, and the new patches
normally follow the existing standards for such stuff. Lets fix it
properly.

WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/pm_opp.h | 64 +++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index c6c7d73eb015..ab1d15ce559d 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -167,12 +167,12 @@ int dev_pm_opp_sync_regulators(struct device *dev);
 #else
 static inline struct opp_table *dev_pm_opp_get_opp_table(struct device *dev)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device *dev, int index)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline void dev_pm_opp_put_opp_table(struct opp_table *opp_table) {}
@@ -232,37 +232,37 @@ static inline unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev)
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
 					unsigned long freq, bool available)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev,
 					unsigned int level)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev,
 					unsigned int *level)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
 					unsigned long *freq)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev,
 					unsigned long u_volt)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
 					unsigned long *freq)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline void dev_pm_opp_put(struct dev_pm_opp *opp) {}
@@ -270,7 +270,7 @@ static inline void dev_pm_opp_put(struct dev_pm_opp *opp) {}
 static inline int dev_pm_opp_add(struct device *dev, unsigned long freq,
 					unsigned long u_volt)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline void dev_pm_opp_remove(struct device *dev, unsigned long freq)
@@ -301,19 +301,19 @@ static inline int dev_pm_opp_disable(struct device *dev, unsigned long freq)
 
 static inline int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
 							    const u32 *versions,
 							    unsigned int count)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline void dev_pm_opp_put_supported_hw(struct opp_table *opp_table) {}
@@ -321,7 +321,7 @@ static inline void dev_pm_opp_put_supported_hw(struct opp_table *opp_table) {}
 static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
 			int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) {}
@@ -330,33 +330,33 @@ static inline struct opp_table *
 devm_pm_opp_register_set_opp_helper(struct device *dev,
 				    int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
 
 static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {}
 
 static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline void dev_pm_opp_put_clkname(struct opp_table *opp_table) {}
 
 static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {}
@@ -364,27 +364,27 @@ static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {}
 static inline struct opp_table *devm_pm_opp_attach_genpd(struct device *dev,
 				const char **names, struct device ***virt_devs)
 {
-	return ERR_PTR(-ENOTSUPP);
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
@@ -402,7 +402,7 @@ static inline void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask
 
 static inline int dev_pm_opp_sync_regulators(struct device *dev)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 #endif		/* CONFIG_PM_OPP */
@@ -427,17 +427,17 @@ static inline void dev_pm_opp_of_unregister_em(struct device *dev)
 #else
 static inline int dev_pm_opp_of_add_table(struct device *dev)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int dev_pm_opp_of_add_table_indexed(struct device *dev, int index)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int dev_pm_opp_of_add_table_noclk(struct device *dev, int index)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline void dev_pm_opp_of_remove_table(struct device *dev)
@@ -446,7 +446,7 @@ static inline void dev_pm_opp_of_remove_table(struct device *dev)
 
 static inline int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask)
@@ -455,7 +455,7 @@ static inline void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpum
 
 static inline int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev)
@@ -471,7 +471,7 @@ static inline struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp)
 static inline int dev_pm_opp_of_register_em(struct device *dev,
 					    struct cpumask *cpus)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline void dev_pm_opp_of_unregister_em(struct device *dev)
@@ -480,12 +480,12 @@ static inline void dev_pm_opp_of_unregister_em(struct device *dev)
 
 static inline int of_get_required_opp_performance_state(struct device_node *np, int index)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 
 static inline int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_table *opp_table)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 #endif
 
-- 
cgit v1.2.3


From 3254899e0b52f10b9a3e7db4d10f081f60705ba9 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <mgurtovoy@nvidia.com>
Date: Thu, 21 Jan 2021 09:09:47 +0000
Subject: nvme: update enumerations for status codes

All the updates are mentioned in the ratified NVMe 1.4 spec.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index bfed36e342cc..458719544253 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1473,20 +1473,29 @@ enum {
 	NVME_SC_SGL_INVALID_DATA	= 0xf,
 	NVME_SC_SGL_INVALID_METADATA	= 0x10,
 	NVME_SC_SGL_INVALID_TYPE	= 0x11,
-
+	NVME_SC_CMB_INVALID_USE		= 0x12,
+	NVME_SC_PRP_INVALID_OFFSET	= 0x13,
+	NVME_SC_ATOMIC_WU_EXCEEDED	= 0x14,
+	NVME_SC_OP_DENIED		= 0x15,
 	NVME_SC_SGL_INVALID_OFFSET	= 0x16,
-	NVME_SC_SGL_INVALID_SUBTYPE	= 0x17,
-
+	NVME_SC_RESERVED		= 0x17,
+	NVME_SC_HOST_ID_INCONSIST	= 0x18,
+	NVME_SC_KA_TIMEOUT_EXPIRED	= 0x19,
+	NVME_SC_KA_TIMEOUT_INVALID	= 0x1A,
+	NVME_SC_ABORTED_PREEMPT_ABORT	= 0x1B,
 	NVME_SC_SANITIZE_FAILED		= 0x1C,
 	NVME_SC_SANITIZE_IN_PROGRESS	= 0x1D,
-
+	NVME_SC_SGL_INVALID_GRANULARITY	= 0x1E,
+	NVME_SC_CMD_NOT_SUP_CMB_QUEUE	= 0x1F,
 	NVME_SC_NS_WRITE_PROTECTED	= 0x20,
 	NVME_SC_CMD_INTERRUPTED		= 0x21,
+	NVME_SC_TRANSIENT_TR_ERR	= 0x22,
 
 	NVME_SC_LBA_RANGE		= 0x80,
 	NVME_SC_CAP_EXCEEDED		= 0x81,
 	NVME_SC_NS_NOT_READY		= 0x82,
 	NVME_SC_RESERVATION_CONFLICT	= 0x83,
+	NVME_SC_FORMAT_IN_PROGRESS	= 0x84,
 
 	/*
 	 * Command Specific Status:
@@ -1519,8 +1528,15 @@ enum {
 	NVME_SC_NS_NOT_ATTACHED		= 0x11a,
 	NVME_SC_THIN_PROV_NOT_SUPP	= 0x11b,
 	NVME_SC_CTRL_LIST_INVALID	= 0x11c,
+	NVME_SC_SELT_TEST_IN_PROGRESS	= 0x11d,
 	NVME_SC_BP_WRITE_PROHIBITED	= 0x11e,
+	NVME_SC_CTRL_ID_INVALID		= 0x11f,
+	NVME_SC_SEC_CTRL_STATE_INVALID	= 0x120,
+	NVME_SC_CTRL_RES_NUM_INVALID	= 0x121,
+	NVME_SC_RES_ID_INVALID		= 0x122,
 	NVME_SC_PMR_SAN_PROHIBITED	= 0x123,
+	NVME_SC_ANA_GROUP_ID_INVALID	= 0x124,
+	NVME_SC_ANA_ATTACH_FAILED	= 0x125,
 
 	/*
 	 * I/O Command Set Specific - NVM commands:
-- 
cgit v1.2.3


From 4a407d5ebc7ac1ea8c6e2692bd79320459dc60f6 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Wed, 27 Jan 2021 02:50:00 +0900
Subject: nvme: add tracing of zns commands

When support for the NVMe ZNS commands was merged, tracing of these has
been omitted.

Add nvme_cmd_zone_mgmt_send, nvme_cmd_zone_mgmt_recv as well as
nvme_cmd_zone_append to the nvme driver's tracing facility.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/trace.c | 34 ++++++++++++++++++++++++++++++++++
 include/linux/nvme.h      |  6 +++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index e0400de713b5..6543015b6121 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -148,6 +148,35 @@ static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
 	return ret;
 }
 
+static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u64 slba = get_unaligned_le64(cdw10);
+	u8 zsa = cdw10[12];
+	u8 all = cdw10[13];
+
+	trace_seq_printf(p, "slba=%llu, zsa=%u, all=%u", slba, zsa, all);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
+static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u64 slba = get_unaligned_le64(cdw10);
+	u32 numd = get_unaligned_le32(cdw10 + 8);
+	u8 zra = cdw10[12];
+	u8 zrasf = cdw10[13];
+	u8 pr = cdw10[14];
+
+	trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u, pr=%u",
+			 slba, numd, zra, zrasf, pr);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
 static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -190,9 +219,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
 	case nvme_cmd_read:
 	case nvme_cmd_write:
 	case nvme_cmd_write_zeroes:
+	case nvme_cmd_zone_append:
 		return nvme_trace_read_write(p, cdw10);
 	case nvme_cmd_dsm:
 		return nvme_trace_dsm(p, cdw10);
+	case nvme_cmd_zone_mgmt_send:
+		return nvme_trace_zone_mgmt_send(p, cdw10);
+	case nvme_cmd_zone_mgmt_recv:
+		return nvme_trace_zone_mgmt_recv(p, cdw10);
 	default:
 		return nvme_trace_common(p, cdw10);
 	}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 458719544253..b08787cd0881 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -697,7 +697,11 @@ enum nvme_opcode {
 		nvme_opcode_name(nvme_cmd_resv_register),	\
 		nvme_opcode_name(nvme_cmd_resv_report),		\
 		nvme_opcode_name(nvme_cmd_resv_acquire),	\
-		nvme_opcode_name(nvme_cmd_resv_release))
+		nvme_opcode_name(nvme_cmd_resv_release),	\
+		nvme_opcode_name(nvme_cmd_zone_mgmt_send),	\
+		nvme_opcode_name(nvme_cmd_zone_mgmt_recv),	\
+		nvme_opcode_name(nvme_cmd_zone_append))
+
 
 
 /*
-- 
cgit v1.2.3


From 3fd269e74f2feec973f45ee11d822faeda4fe284 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 26 Jan 2021 17:58:34 +0100
Subject: amba: Make the remove callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All amba drivers return 0 in their remove callback. Together with the
driver core ignoring the return value anyhow, it doesn't make sense to
return a value here.

Change the remove prototype to return void, which makes it explicit that
returning an error value doesn't work as expected. This simplifies changing
the core remove callback to return void, too.

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Krzysztof Kozlowski <krzk@kernel.org> # for drivers/memory
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Suzuki K Poulose <suzuki.poulose@arm.com> # for hwtracing/coresight
Acked-By: Vinod Koul <vkoul@kernel.org> # for dmaengine
Acked-by: Guenter Roeck <linux@roeck-us.net> # for watchdog
Acked-by: Wolfram Sang <wsa@kernel.org> # for I2C
Acked-by: Takashi Iwai <tiwai@suse.de> # for sound
Acked-by: Vladimir Zapolskiy <vz@mleia.com> # for memory/pl172
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20210126165835.687514-5-u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/amba/bus.c                                 | 5 ++---
 drivers/char/hw_random/nomadik-rng.c               | 3 +--
 drivers/dma/pl330.c                                | 3 +--
 drivers/gpu/drm/pl111/pl111_drv.c                  | 4 +---
 drivers/hwtracing/coresight/coresight-catu.c       | 3 +--
 drivers/hwtracing/coresight/coresight-cpu-debug.c  | 4 +---
 drivers/hwtracing/coresight/coresight-cti-core.c   | 4 +---
 drivers/hwtracing/coresight/coresight-etb10.c      | 4 +---
 drivers/hwtracing/coresight/coresight-etm3x-core.c | 4 +---
 drivers/hwtracing/coresight/coresight-etm4x-core.c | 4 +---
 drivers/hwtracing/coresight/coresight-funnel.c     | 4 ++--
 drivers/hwtracing/coresight/coresight-replicator.c | 4 ++--
 drivers/hwtracing/coresight/coresight-stm.c        | 4 +---
 drivers/hwtracing/coresight/coresight-tmc-core.c   | 4 +---
 drivers/hwtracing/coresight/coresight-tpiu.c       | 4 +---
 drivers/i2c/busses/i2c-nomadik.c                   | 4 +---
 drivers/input/serio/ambakmi.c                      | 3 +--
 drivers/memory/pl172.c                             | 4 +---
 drivers/memory/pl353-smc.c                         | 4 +---
 drivers/mmc/host/mmci.c                            | 4 +---
 drivers/rtc/rtc-pl030.c                            | 4 +---
 drivers/rtc/rtc-pl031.c                            | 4 +---
 drivers/spi/spi-pl022.c                            | 5 ++---
 drivers/tty/serial/amba-pl010.c                    | 4 +---
 drivers/tty/serial/amba-pl011.c                    | 3 +--
 drivers/vfio/platform/vfio_amba.c                  | 3 +--
 drivers/video/fbdev/amba-clcd.c                    | 4 +---
 drivers/watchdog/sp805_wdt.c                       | 4 +---
 include/linux/amba/bus.h                           | 2 +-
 sound/arm/aaci.c                                   | 4 +---
 30 files changed, 34 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index 8c4a42df47c6..48b5d4b4e889 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -300,11 +300,10 @@ static int amba_remove(struct device *dev)
 {
 	struct amba_device *pcdev = to_amba_device(dev);
 	struct amba_driver *drv = to_amba_driver(dev->driver);
-	int ret = 0;
 
 	pm_runtime_get_sync(dev);
 	if (drv->remove)
-		ret = drv->remove(pcdev);
+		drv->remove(pcdev);
 	pm_runtime_put_noidle(dev);
 
 	/* Undo the runtime PM settings in amba_probe() */
@@ -315,7 +314,7 @@ static int amba_remove(struct device *dev)
 	amba_put_disable_pclk(pcdev);
 	dev_pm_domain_detach(dev, true);
 
-	return ret;
+	return 0;
 }
 
 static void amba_shutdown(struct device *dev)
diff --git a/drivers/char/hw_random/nomadik-rng.c b/drivers/char/hw_random/nomadik-rng.c
index b0ded41eb865..67947a19aa22 100644
--- a/drivers/char/hw_random/nomadik-rng.c
+++ b/drivers/char/hw_random/nomadik-rng.c
@@ -69,11 +69,10 @@ out_clk:
 	return ret;
 }
 
-static int nmk_rng_remove(struct amba_device *dev)
+static void nmk_rng_remove(struct amba_device *dev)
 {
 	amba_release_regions(dev);
 	clk_disable(rng_clk);
-	return 0;
 }
 
 static const struct amba_id nmk_rng_ids[] = {
diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
index bc0f66af0f11..fd8d2bc3be9f 100644
--- a/drivers/dma/pl330.c
+++ b/drivers/dma/pl330.c
@@ -3195,7 +3195,7 @@ probe_err2:
 	return ret;
 }
 
-static int pl330_remove(struct amba_device *adev)
+static void pl330_remove(struct amba_device *adev)
 {
 	struct pl330_dmac *pl330 = amba_get_drvdata(adev);
 	struct dma_pl330_chan *pch, *_p;
@@ -3235,7 +3235,6 @@ static int pl330_remove(struct amba_device *adev)
 
 	if (pl330->rstc)
 		reset_control_assert(pl330->rstc);
-	return 0;
 }
 
 static const struct amba_id pl330_ids[] = {
diff --git a/drivers/gpu/drm/pl111/pl111_drv.c b/drivers/gpu/drm/pl111/pl111_drv.c
index 40e6708fbbe2..1fb5eacefd2d 100644
--- a/drivers/gpu/drm/pl111/pl111_drv.c
+++ b/drivers/gpu/drm/pl111/pl111_drv.c
@@ -320,7 +320,7 @@ dev_put:
 	return ret;
 }
 
-static int pl111_amba_remove(struct amba_device *amba_dev)
+static void pl111_amba_remove(struct amba_device *amba_dev)
 {
 	struct device *dev = &amba_dev->dev;
 	struct drm_device *drm = amba_get_drvdata(amba_dev);
@@ -331,8 +331,6 @@ static int pl111_amba_remove(struct amba_device *amba_dev)
 		drm_panel_bridge_remove(priv->bridge);
 	drm_dev_put(drm);
 	of_reserved_mem_device_release(dev);
-
-	return 0;
 }
 
 /*
diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index a61313f320bd..8e19e8cdcce5 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -567,12 +567,11 @@ out:
 	return ret;
 }
 
-static int catu_remove(struct amba_device *adev)
+static void catu_remove(struct amba_device *adev)
 {
 	struct catu_drvdata *drvdata = dev_get_drvdata(&adev->dev);
 
 	coresight_unregister(drvdata->csdev);
-	return 0;
 }
 
 static struct amba_id catu_ids[] = {
diff --git a/drivers/hwtracing/coresight/coresight-cpu-debug.c b/drivers/hwtracing/coresight/coresight-cpu-debug.c
index e1d232411d8d..2dcf13de751f 100644
--- a/drivers/hwtracing/coresight/coresight-cpu-debug.c
+++ b/drivers/hwtracing/coresight/coresight-cpu-debug.c
@@ -627,7 +627,7 @@ err:
 	return ret;
 }
 
-static int debug_remove(struct amba_device *adev)
+static void debug_remove(struct amba_device *adev)
 {
 	struct device *dev = &adev->dev;
 	struct debug_drvdata *drvdata = amba_get_drvdata(adev);
@@ -642,8 +642,6 @@ static int debug_remove(struct amba_device *adev)
 
 	if (!--debug_count)
 		debug_func_exit();
-
-	return 0;
 }
 
 static const struct amba_cs_uci_id uci_id_debug[] = {
diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c
index 61dbc1afd8da..30e48809ba00 100644
--- a/drivers/hwtracing/coresight/coresight-cti-core.c
+++ b/drivers/hwtracing/coresight/coresight-cti-core.c
@@ -836,7 +836,7 @@ static void cti_device_release(struct device *dev)
 	if (drvdata->csdev_release)
 		drvdata->csdev_release(dev);
 }
-static int cti_remove(struct amba_device *adev)
+static void cti_remove(struct amba_device *adev)
 {
 	struct cti_drvdata *drvdata = dev_get_drvdata(&adev->dev);
 
@@ -845,8 +845,6 @@ static int cti_remove(struct amba_device *adev)
 	mutex_unlock(&ect_mutex);
 
 	coresight_unregister(drvdata->csdev);
-
-	return 0;
 }
 
 static int cti_probe(struct amba_device *adev, const struct amba_id *id)
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index 0cf6f0b947b6..51c801c05e5c 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -803,7 +803,7 @@ err_misc_register:
 	return ret;
 }
 
-static int etb_remove(struct amba_device *adev)
+static void etb_remove(struct amba_device *adev)
 {
 	struct etb_drvdata *drvdata = dev_get_drvdata(&adev->dev);
 
@@ -814,8 +814,6 @@ static int etb_remove(struct amba_device *adev)
 	 */
 	misc_deregister(&drvdata->miscdev);
 	coresight_unregister(drvdata->csdev);
-
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/hwtracing/coresight/coresight-etm3x-core.c b/drivers/hwtracing/coresight/coresight-etm3x-core.c
index 5bf5a5a4ce6d..683a69e88efd 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x-core.c
@@ -909,7 +909,7 @@ static void clear_etmdrvdata(void *info)
 	etmdrvdata[cpu] = NULL;
 }
 
-static int etm_remove(struct amba_device *adev)
+static void etm_remove(struct amba_device *adev)
 {
 	struct etm_drvdata *drvdata = dev_get_drvdata(&adev->dev);
 
@@ -932,8 +932,6 @@ static int etm_remove(struct amba_device *adev)
 	cpus_read_unlock();
 
 	coresight_unregister(drvdata->csdev);
-
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index b20b6ff17cf6..82787cba537d 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -1680,7 +1680,7 @@ static void clear_etmdrvdata(void *info)
 	etmdrvdata[cpu] = NULL;
 }
 
-static int etm4_remove(struct amba_device *adev)
+static void etm4_remove(struct amba_device *adev)
 {
 	struct etmv4_drvdata *drvdata = dev_get_drvdata(&adev->dev);
 
@@ -1703,8 +1703,6 @@ static int etm4_remove(struct amba_device *adev)
 	cpus_read_unlock();
 
 	coresight_unregister(drvdata->csdev);
-
-	return 0;
 }
 
 static const struct amba_id etm4_ids[] = {
diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c
index 071c723227db..01f8f9285168 100644
--- a/drivers/hwtracing/coresight/coresight-funnel.c
+++ b/drivers/hwtracing/coresight/coresight-funnel.c
@@ -370,9 +370,9 @@ static int dynamic_funnel_probe(struct amba_device *adev,
 	return funnel_probe(&adev->dev, &adev->res);
 }
 
-static int dynamic_funnel_remove(struct amba_device *adev)
+static void dynamic_funnel_remove(struct amba_device *adev)
 {
-	return funnel_remove(&adev->dev);
+	funnel_remove(&adev->dev);
 }
 
 static const struct amba_id dynamic_funnel_ids[] = {
diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c
index 7e2a2b7f503f..34fc2f6f3ea9 100644
--- a/drivers/hwtracing/coresight/coresight-replicator.c
+++ b/drivers/hwtracing/coresight/coresight-replicator.c
@@ -388,9 +388,9 @@ static int dynamic_replicator_probe(struct amba_device *adev,
 	return replicator_probe(&adev->dev, &adev->res);
 }
 
-static int dynamic_replicator_remove(struct amba_device *adev)
+static void dynamic_replicator_remove(struct amba_device *adev)
 {
-	return replicator_remove(&adev->dev);
+	replicator_remove(&adev->dev);
 }
 
 static const struct amba_id dynamic_replicator_ids[] = {
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c
index 99791773f682..423df0d30d9c 100644
--- a/drivers/hwtracing/coresight/coresight-stm.c
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -951,15 +951,13 @@ stm_unregister:
 	return ret;
 }
 
-static int stm_remove(struct amba_device *adev)
+static void stm_remove(struct amba_device *adev)
 {
 	struct stm_drvdata *drvdata = dev_get_drvdata(&adev->dev);
 
 	coresight_unregister(drvdata->csdev);
 
 	stm_unregister_device(&drvdata->stm);
-
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c
index 8169dff5a9f6..e29b3914fc0f 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-core.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-core.c
@@ -559,7 +559,7 @@ out:
 	spin_unlock_irqrestore(&drvdata->spinlock, flags);
 }
 
-static int tmc_remove(struct amba_device *adev)
+static void tmc_remove(struct amba_device *adev)
 {
 	struct tmc_drvdata *drvdata = dev_get_drvdata(&adev->dev);
 
@@ -570,8 +570,6 @@ static int tmc_remove(struct amba_device *adev)
 	 */
 	misc_deregister(&drvdata->miscdev);
 	coresight_unregister(drvdata->csdev);
-
-	return 0;
 }
 
 static const struct amba_id tmc_ids[] = {
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c
index d5dfee9ee556..f77c4b0ea4aa 100644
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -173,13 +173,11 @@ static int tpiu_probe(struct amba_device *adev, const struct amba_id *id)
 	return PTR_ERR(drvdata->csdev);
 }
 
-static int tpiu_remove(struct amba_device *adev)
+static void tpiu_remove(struct amba_device *adev)
 {
 	struct tpiu_drvdata *drvdata = dev_get_drvdata(&adev->dev);
 
 	coresight_unregister(drvdata->csdev);
-
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c
index d4b1b0865f67..a3363b20f168 100644
--- a/drivers/i2c/busses/i2c-nomadik.c
+++ b/drivers/i2c/busses/i2c-nomadik.c
@@ -1055,7 +1055,7 @@ static int nmk_i2c_probe(struct amba_device *adev, const struct amba_id *id)
 	return ret;
 }
 
-static int nmk_i2c_remove(struct amba_device *adev)
+static void nmk_i2c_remove(struct amba_device *adev)
 {
 	struct resource *res = &adev->res;
 	struct nmk_i2c_dev *dev = amba_get_drvdata(adev);
@@ -1068,8 +1068,6 @@ static int nmk_i2c_remove(struct amba_device *adev)
 	i2c_clr_bit(dev->virtbase + I2C_CR, I2C_CR_PE);
 	clk_disable_unprepare(dev->clk);
 	release_mem_region(res->start, resource_size(res));
-
-	return 0;
 }
 
 static struct i2c_vendor_data vendor_stn8815 = {
diff --git a/drivers/input/serio/ambakmi.c b/drivers/input/serio/ambakmi.c
index ecdeca147ed7..4408245b61d2 100644
--- a/drivers/input/serio/ambakmi.c
+++ b/drivers/input/serio/ambakmi.c
@@ -159,7 +159,7 @@ static int amba_kmi_probe(struct amba_device *dev,
 	return ret;
 }
 
-static int amba_kmi_remove(struct amba_device *dev)
+static void amba_kmi_remove(struct amba_device *dev)
 {
 	struct amba_kmi_port *kmi = amba_get_drvdata(dev);
 
@@ -168,7 +168,6 @@ static int amba_kmi_remove(struct amba_device *dev)
 	iounmap(kmi->base);
 	kfree(kmi);
 	amba_release_regions(dev);
-	return 0;
 }
 
 static int __maybe_unused amba_kmi_resume(struct device *dev)
diff --git a/drivers/memory/pl172.c b/drivers/memory/pl172.c
index 575fadbffa30..9eb8cc7de494 100644
--- a/drivers/memory/pl172.c
+++ b/drivers/memory/pl172.c
@@ -273,14 +273,12 @@ err_clk_enable:
 	return ret;
 }
 
-static int pl172_remove(struct amba_device *adev)
+static void pl172_remove(struct amba_device *adev)
 {
 	struct pl172_data *pl172 = amba_get_drvdata(adev);
 
 	clk_disable_unprepare(pl172->clk);
 	amba_release_regions(adev);
-
-	return 0;
 }
 
 static const struct amba_id pl172_ids[] = {
diff --git a/drivers/memory/pl353-smc.c b/drivers/memory/pl353-smc.c
index 73bd3023202f..3b5b1045edd9 100644
--- a/drivers/memory/pl353-smc.c
+++ b/drivers/memory/pl353-smc.c
@@ -426,14 +426,12 @@ out_clk_dis_aper:
 	return err;
 }
 
-static int pl353_smc_remove(struct amba_device *adev)
+static void pl353_smc_remove(struct amba_device *adev)
 {
 	struct pl353_smc_data *pl353_smc = amba_get_drvdata(adev);
 
 	clk_disable_unprepare(pl353_smc->memclk);
 	clk_disable_unprepare(pl353_smc->aclk);
-
-	return 0;
 }
 
 static const struct amba_id pl353_ids[] = {
diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index b5a41a7ce165..32f52d070bbd 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -2195,7 +2195,7 @@ static int mmci_probe(struct amba_device *dev,
 	return ret;
 }
 
-static int mmci_remove(struct amba_device *dev)
+static void mmci_remove(struct amba_device *dev)
 {
 	struct mmc_host *mmc = amba_get_drvdata(dev);
 
@@ -2223,8 +2223,6 @@ static int mmci_remove(struct amba_device *dev)
 		clk_disable_unprepare(host->clk);
 		mmc_free_host(mmc);
 	}
-
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/rtc/rtc-pl030.c b/drivers/rtc/rtc-pl030.c
index 5a880516f3e8..39038c0754ee 100644
--- a/drivers/rtc/rtc-pl030.c
+++ b/drivers/rtc/rtc-pl030.c
@@ -137,7 +137,7 @@ static int pl030_probe(struct amba_device *dev, const struct amba_id *id)
 	return ret;
 }
 
-static int pl030_remove(struct amba_device *dev)
+static void pl030_remove(struct amba_device *dev)
 {
 	struct pl030_rtc *rtc = amba_get_drvdata(dev);
 
@@ -146,8 +146,6 @@ static int pl030_remove(struct amba_device *dev)
 	free_irq(dev->irq[0], rtc);
 	iounmap(rtc->base);
 	amba_release_regions(dev);
-
-	return 0;
 }
 
 static struct amba_id pl030_ids[] = {
diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c
index 224bbf096262..620c8dc33647 100644
--- a/drivers/rtc/rtc-pl031.c
+++ b/drivers/rtc/rtc-pl031.c
@@ -280,7 +280,7 @@ static int pl031_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
 	return 0;
 }
 
-static int pl031_remove(struct amba_device *adev)
+static void pl031_remove(struct amba_device *adev)
 {
 	struct pl031_local *ldata = dev_get_drvdata(&adev->dev);
 
@@ -289,8 +289,6 @@ static int pl031_remove(struct amba_device *adev)
 	if (adev->irq[0])
 		free_irq(adev->irq[0], ldata);
 	amba_release_regions(adev);
-
-	return 0;
 }
 
 static int pl031_probe(struct amba_device *adev, const struct amba_id *id)
diff --git a/drivers/spi/spi-pl022.c b/drivers/spi/spi-pl022.c
index d1776fea287e..fd74ddfbb686 100644
--- a/drivers/spi/spi-pl022.c
+++ b/drivers/spi/spi-pl022.c
@@ -2314,13 +2314,13 @@ static int pl022_probe(struct amba_device *adev, const struct amba_id *id)
 	return status;
 }
 
-static int
+static void
 pl022_remove(struct amba_device *adev)
 {
 	struct pl022 *pl022 = amba_get_drvdata(adev);
 
 	if (!pl022)
-		return 0;
+		return;
 
 	/*
 	 * undo pm_runtime_put() in probe.  I assume that we're not
@@ -2335,7 +2335,6 @@ pl022_remove(struct amba_device *adev)
 	clk_disable_unprepare(pl022->clk);
 	amba_release_regions(adev);
 	tasklet_disable(&pl022->pump_transfers);
-	return 0;
 }
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/drivers/tty/serial/amba-pl010.c b/drivers/tty/serial/amba-pl010.c
index 3284f34e9dfe..3f96edfe569c 100644
--- a/drivers/tty/serial/amba-pl010.c
+++ b/drivers/tty/serial/amba-pl010.c
@@ -754,7 +754,7 @@ static int pl010_probe(struct amba_device *dev, const struct amba_id *id)
 	return ret;
 }
 
-static int pl010_remove(struct amba_device *dev)
+static void pl010_remove(struct amba_device *dev)
 {
 	struct uart_amba_port *uap = amba_get_drvdata(dev);
 	int i;
@@ -770,8 +770,6 @@ static int pl010_remove(struct amba_device *dev)
 
 	if (!busy)
 		uart_unregister_driver(&amba_reg);
-
-	return 0;
 }
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index c255476cce28..4ead0c9048a8 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -2679,13 +2679,12 @@ static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
 	return pl011_register_port(uap);
 }
 
-static int pl011_remove(struct amba_device *dev)
+static void pl011_remove(struct amba_device *dev)
 {
 	struct uart_amba_port *uap = amba_get_drvdata(dev);
 
 	uart_remove_one_port(&amba_reg, &uap->port);
 	pl011_unregister_port(uap);
-	return 0;
 }
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c
index 7b3ebf1558e1..3626c2150101 100644
--- a/drivers/vfio/platform/vfio_amba.c
+++ b/drivers/vfio/platform/vfio_amba.c
@@ -71,14 +71,13 @@ static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id)
 	return ret;
 }
 
-static int vfio_amba_remove(struct amba_device *adev)
+static void vfio_amba_remove(struct amba_device *adev)
 {
 	struct vfio_platform_device *vdev =
 		vfio_platform_remove_common(&adev->dev);
 
 	kfree(vdev->name);
 	kfree(vdev);
-	return 0;
 }
 
 static const struct amba_id pl330_ids[] = {
diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c
index b7682de412d8..33595cc4778e 100644
--- a/drivers/video/fbdev/amba-clcd.c
+++ b/drivers/video/fbdev/amba-clcd.c
@@ -925,7 +925,7 @@ static int clcdfb_probe(struct amba_device *dev, const struct amba_id *id)
 	return ret;
 }
 
-static int clcdfb_remove(struct amba_device *dev)
+static void clcdfb_remove(struct amba_device *dev)
 {
 	struct clcd_fb *fb = amba_get_drvdata(dev);
 
@@ -942,8 +942,6 @@ static int clcdfb_remove(struct amba_device *dev)
 	kfree(fb);
 
 	amba_release_regions(dev);
-
-	return 0;
 }
 
 static const struct amba_id clcdfb_id_table[] = {
diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c
index 958dc32a708f..58a00e1ab23b 100644
--- a/drivers/watchdog/sp805_wdt.c
+++ b/drivers/watchdog/sp805_wdt.c
@@ -305,14 +305,12 @@ err:
 	return ret;
 }
 
-static int sp805_wdt_remove(struct amba_device *adev)
+static void sp805_wdt_remove(struct amba_device *adev)
 {
 	struct sp805_wdt *wdt = amba_get_drvdata(adev);
 
 	watchdog_unregister_device(&wdt->wdd);
 	watchdog_set_drvdata(&wdt->wdd, NULL);
-
-	return 0;
 }
 
 static int __maybe_unused sp805_wdt_suspend(struct device *dev)
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index 0bbfd647f5c6..6cc93ab5b809 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -76,7 +76,7 @@ struct amba_device {
 struct amba_driver {
 	struct device_driver	drv;
 	int			(*probe)(struct amba_device *, const struct amba_id *);
-	int			(*remove)(struct amba_device *);
+	void			(*remove)(struct amba_device *);
 	void			(*shutdown)(struct amba_device *);
 	const struct amba_id	*id_table;
 };
diff --git a/sound/arm/aaci.c b/sound/arm/aaci.c
index a0996c47e58f..b326a5f5f0d5 100644
--- a/sound/arm/aaci.c
+++ b/sound/arm/aaci.c
@@ -1055,7 +1055,7 @@ static int aaci_probe(struct amba_device *dev,
 	return ret;
 }
 
-static int aaci_remove(struct amba_device *dev)
+static void aaci_remove(struct amba_device *dev)
 {
 	struct snd_card *card = amba_get_drvdata(dev);
 
@@ -1066,8 +1066,6 @@ static int aaci_remove(struct amba_device *dev)
 		snd_card_free(card);
 		amba_release_regions(dev);
 	}
-
-	return 0;
 }
 
 static struct amba_id aaci_ids[] = {
-- 
cgit v1.2.3


From c7020068bf2397b60bb62a1f71ca0fe626c1f7e7 Mon Sep 17 00:00:00 2001
From: Elvira Khabirova <e.khabirova@omprussia.ru>
Date: Sun, 20 Sep 2020 04:58:57 +0300
Subject: tee: fix some comment typos in header files

struct tee_param: revc -> recv.
TEE_IOC_SUPPL_SEND: typo introduced by copy-pasting, replace invalid
description with description from the according argument struct.

Signed-off-by: Elvira Khabirova <e.khabirova@omprussia.ru>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 include/linux/tee_drv.h  | 2 +-
 include/uapi/linux/tee.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
index cdd049a724b1..54269e47ac9a 100644
--- a/include/linux/tee_drv.h
+++ b/include/linux/tee_drv.h
@@ -88,7 +88,7 @@ struct tee_param {
  * @close_session:	close a session
  * @invoke_func:	invoke a trusted function
  * @cancel_req:		request cancel of an ongoing invoke or open
- * @supp_revc:		called for supplicant to get a command
+ * @supp_recv:		called for supplicant to get a command
  * @supp_send:		called for supplicant to send a response
  * @shm_register:	register shared memory buffer in TEE
  * @shm_unregister:	unregister shared memory buffer in TEE
diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
index d67cadf221fc..25a6c534beb1 100644
--- a/include/uapi/linux/tee.h
+++ b/include/uapi/linux/tee.h
@@ -355,7 +355,7 @@ struct tee_iocl_supp_send_arg {
 };
 
 /**
- * TEE_IOC_SUPPL_SEND - Receive a request for a supplicant function
+ * TEE_IOC_SUPPL_SEND - Send a response to a received request
  *
  * Takes a struct tee_ioctl_buf_data which contains a struct
  * tee_iocl_supp_send_arg followed by any array of struct tee_param
-- 
cgit v1.2.3


From c8b186a8d54d7e12d28e9f9686cb00ff18fc2ab2 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 2 Feb 2021 18:23:26 +1100
Subject: tracepoint: Fix race between tracing and removing tracepoint

When executing a tracepoint, the tracepoint's func is dereferenced twice -
in __DO_TRACE() (where the returned pointer is checked) and later on in
__traceiter_##_name where the returned pointer is dereferenced without
checking which leads to races against tracepoint_removal_sync() and
crashes.

This adds a check before referencing the pointer in tracepoint_ptr_deref.

Link: https://lkml.kernel.org/r/20210202072326.120557-1-aik@ozlabs.ru

Cc: stable@vger.kernel.org
Fixes: d25e37d89dd2f ("tracepoint: Optimize using static_call()")
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 0f21617f1a66..966ed8980327 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -307,11 +307,13 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 									\
 		it_func_ptr =						\
 			rcu_dereference_raw((&__tracepoint_##_name)->funcs); \
-		do {							\
-			it_func = (it_func_ptr)->func;			\
-			__data = (it_func_ptr)->data;			\
-			((void(*)(void *, proto))(it_func))(__data, args); \
-		} while ((++it_func_ptr)->func);			\
+		if (it_func_ptr) {					\
+			do {						\
+				it_func = (it_func_ptr)->func;		\
+				__data = (it_func_ptr)->data;		\
+				((void(*)(void *, proto))(it_func))(__data, args); \
+			} while ((++it_func_ptr)->func);		\
+		}							\
 		return 0;						\
 	}								\
 	DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);
-- 
cgit v1.2.3


From 4c9fb5d9140802db4db9f66c23887f43174e113c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Tue, 2 Feb 2021 15:54:19 +0100
Subject: iommu: Check dev->iommu in dev_iommu_priv_get() before dereferencing
 it

The dev_iommu_priv_get() needs a similar check to
dev_iommu_fwspec_get() to make sure no NULL-ptr is dereferenced.

Fixes: 05a0542b456e1 ("iommu/amd: Store dev_data as device iommu private data")
Cc: stable@vger.kernel.org	# v5.8+
Link: https://lore.kernel.org/r/20210202145419.29143-1-joro@8bytes.org
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=211241
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b3f0e2018c62..efa96263b81b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -616,7 +616,10 @@ static inline void dev_iommu_fwspec_set(struct device *dev,
 
 static inline void *dev_iommu_priv_get(struct device *dev)
 {
-	return dev->iommu->priv;
+	if (dev->iommu)
+		return dev->iommu->priv;
+	else
+		return NULL;
 }
 
 static inline void dev_iommu_priv_set(struct device *dev, void *priv)
-- 
cgit v1.2.3


From a69bdb283f79949b67632878ef1822badae9299f Mon Sep 17 00:00:00 2001
From: Badhri Jagan Sridharan <badhri@google.com>
Date: Mon, 1 Feb 2021 16:30:59 -0800
Subject: usb: typec: tcpm: Add Callback to Usb Communication capable partner

The USB Communications Capable bit indicates if port
partner is capable of communication over the USB data lines
(e.g. D+/- or SS Tx/Rx). Notify the status of the bit to low
level drivers to perform chip specific operation.
For instance, low level driver enables USB switches on D+/D-
lines to set up data path when the bit is set.

Refactored from patch initially authored by
Kyle Tso <kyletso@google.com>

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Badhri Jagan Sridharan <badhri@google.com>
Link: https://lore.kernel.org/r/20210202003101.221145-1-badhri@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpm.c | 13 +++++++++++++
 include/linux/usb/tcpm.h      |  5 +++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 7747c7a154a4..8558ab006885 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -3430,6 +3430,14 @@ static void tcpm_unregister_altmodes(struct tcpm_port *port)
 	memset(modep, 0, sizeof(*modep));
 }
 
+static void tcpm_set_partner_usb_comm_capable(struct tcpm_port *port, bool capable)
+{
+	tcpm_log(port, "Setting usb_comm capable %s", capable ? "true" : "false");
+
+	if (port->tcpc->set_partner_usb_comm_capable)
+		port->tcpc->set_partner_usb_comm_capable(port->tcpc, capable);
+}
+
 static void tcpm_reset_port(struct tcpm_port *port)
 {
 	int ret;
@@ -3446,6 +3454,7 @@ static void tcpm_reset_port(struct tcpm_port *port)
 	port->attached = false;
 	port->pd_capable = false;
 	port->pps_data.supported = false;
+	tcpm_set_partner_usb_comm_capable(port, false);
 
 	/*
 	 * First Rx ID should be 0; set this to a sentinel of -1 so that
@@ -3786,6 +3795,8 @@ static void run_state_machine(struct tcpm_port *port)
 			}
 		} else {
 			tcpm_pd_send_control(port, PD_CTRL_ACCEPT);
+			tcpm_set_partner_usb_comm_capable(port,
+							  !!(port->sink_request & RDO_USB_COMM));
 			tcpm_set_state(port, SRC_TRANSITION_SUPPLY,
 				       PD_T_SRC_TRANSITION);
 		}
@@ -4005,6 +4016,8 @@ static void run_state_machine(struct tcpm_port *port)
 		break;
 	case SNK_NEGOTIATE_CAPABILITIES:
 		port->pd_capable = true;
+		tcpm_set_partner_usb_comm_capable(port,
+						  !!(port->source_caps[0] & PDO_FIXED_USB_COMM));
 		port->hard_reset_count = 0;
 		ret = tcpm_pd_send_request(port);
 		if (ret < 0) {
diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h
index 3af99f85e8b9..42fcfbe10590 100644
--- a/include/linux/usb/tcpm.h
+++ b/include/linux/usb/tcpm.h
@@ -108,6 +108,10 @@ enum tcpm_transmit_type {
  *		is supported by TCPC, set this callback for TCPM to query
  *		whether vbus is at VSAFE0V when needed.
  *		Returns true when vbus is at VSAFE0V, false otherwise.
+ * @set_partner_usb_comm_capable:
+ *              Optional; The USB Communications Capable bit indicates if port
+ *              partner is capable of communication over the USB data lines
+ *              (e.g. D+/- or SS Tx/Rx). Called to notify the status of the bit.
  */
 struct tcpc_dev {
 	struct fwnode_handle *fwnode;
@@ -139,6 +143,7 @@ struct tcpc_dev {
 	int (*set_auto_vbus_discharge_threshold)(struct tcpc_dev *dev, enum typec_pwr_opmode mode,
 						 bool pps_active, u32 requested_vbus_voltage);
 	bool (*is_vbus_vsafe0v)(struct tcpc_dev *dev);
+	void (*set_partner_usb_comm_capable)(struct tcpc_dev *dev, bool enable);
 };
 
 struct tcpm_port;
-- 
cgit v1.2.3


From 32e9b48d110ef5fae850036eafaf7895a25b37e3 Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso@google.com>
Date: Tue, 2 Feb 2021 17:55:12 +0800
Subject: usb: typec: Return void in typec_partner_set_pd_revision

typec_partner_set_pd_revision doesn't need any return value.

Fixes: 29b01295a829 ("usb: typec: Add typec_partner_set_pd_revision")
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Reviewed-by: Benson Leung <bleung@chromium.org>
Signed-off-by: Kyle Tso <kyletso@google.com>
Link: https://lore.kernel.org/r/20210202095512.761214-1-kyletso@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/class.c | 10 ++--------
 include/linux/usb/typec.h |  2 +-
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index b6ceab3dc16b..a7d1bc83c2d4 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -755,15 +755,11 @@ EXPORT_SYMBOL_GPL(typec_partner_set_identity);
  *
  * This routine is used to report that the PD revision of the port partner has
  * become available.
- *
- * Returns 0 on success or negative error number on failure.
  */
-int typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision)
+void typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision)
 {
-	int ret;
-
 	if (partner->pd_revision == pd_revision)
-		return 0;
+		return;
 
 	partner->pd_revision = pd_revision;
 	sysfs_notify(&partner->dev.kobj, NULL, "usb_power_delivery_revision");
@@ -773,8 +769,6 @@ int typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision
 			     "supports_usb_power_delivery");
 	}
 	kobject_uevent(&partner->dev.kobj, KOBJ_CHANGE);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(typec_partner_set_pd_revision);
 
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index 4946eca742d5..a94df82ab62f 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -126,7 +126,7 @@ struct typec_altmode_desc {
 	enum typec_port_data	roles;
 };
 
-int typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision);
+void typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision);
 int typec_partner_set_num_altmodes(struct typec_partner *partner, int num_altmodes);
 struct typec_altmode
 *typec_partner_register_altmode(struct typec_partner *partner,
-- 
cgit v1.2.3


From bd0c9706430240e3b7e9323361bb25066540d2a8 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Mon, 21 Dec 2020 08:27:15 -0800
Subject: tracing: Add printf attribute to log function

Attributing the function allows the compiler to more thoroughly
check the use of the function with -Wformat and similar flags.

Link: https://lkml.kernel.org/r/20201221162715.3757291-1-trix@redhat.com

Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/trace.h b/include/linux/trace.h
index 886a4ffd9d45..be1e130ed87c 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -34,8 +34,9 @@ int unregister_ftrace_export(struct trace_export *export);
 struct trace_array;
 
 void trace_printk_init_buffers(void);
+__printf(3, 4)
 int trace_array_printk(struct trace_array *tr, unsigned long ip,
-		const char *fmt, ...);
+		       const char *fmt, ...);
 int trace_array_init_printk(struct trace_array *tr);
 void trace_array_put(struct trace_array *tr);
 struct trace_array *trace_array_get_by_name(const char *name);
-- 
cgit v1.2.3


From 36590c50b2d0729952511129916beeea30d31d81 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Jan 2021 20:45:08 +0100
Subject: tracing: Merge irqflags + preempt counter.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The state of the interrupts (irqflags) and the preemption counter are
both passed down to tracing_generic_entry_update(). Only one bit of
irqflags is actually required: The on/off state. The complete 32bit
of the preemption counter isn't needed. Just whether of the upper bits
(softirq, hardirq and NMI) are set and the preemption depth is needed.

The irqflags and the preemption counter could be evaluated early and the
information stored in an integer `trace_ctx'.
tracing_generic_entry_update() would use the upper bits as the
TRACE_FLAG_* and the lower 8bit as the disabled-preemption depth
(considering that one must be substracted from the counter in one
special cases).

The actual preemption value is not used except for the tracing record.
The `irqflags' variable is mostly used only for the tracing record. An
exception here is for instance wakeup_tracer_call() or
probe_wakeup_sched_switch() which explicilty disable interrupts and use
that `irqflags' to save (and restore) the IRQ state and to record the
state.

Struct trace_event_buffer has also the `pc' and flags' members which can
be replaced with `trace_ctx' since their actual value is not used
outside of trace recording.

This will reduce tracing_generic_entry_update() to simply assign values
to struct trace_entry. The evaluation of the TRACE_FLAG_* bits is moved
to _tracing_gen_ctx_flags() which replaces preempt_count() and
local_save_flags() invocations.

As an example, ftrace_syscall_enter() may invoke:
- trace_buffer_lock_reserve() -> … -> tracing_generic_entry_update()
- event_trigger_unlock_commit()
  -> ftrace_trace_stack() -> … -> tracing_generic_entry_update()
  -> ftrace_trace_userstack() -> … -> tracing_generic_entry_update()

In this case the TRACE_FLAG_* bits were evaluated three times. By using
the `trace_ctx' they are evaluated once and assigned three times.

A build with all tracers enabled on x86-64 with and without the patch:

    text     data      bss      dec      hex    filename
21970669 17084168  7639260 46694097  2c87ed1 vmlinux.old
21970293 17084168  7639260 46693721  2c87d59 vmlinux.new

text shrank by 379 bytes, data remained constant.

Link: https://lkml.kernel.org/r/20210125194511.3924915-2-bigeasy@linutronix.de

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h         |  25 +++--
 kernel/trace/blktrace.c              |  17 +--
 kernel/trace/trace.c                 | 208 +++++++++++++++++++----------------
 kernel/trace/trace.h                 |  38 +++----
 kernel/trace/trace_branch.c          |   6 +-
 kernel/trace/trace_event_perf.c      |   5 +-
 kernel/trace/trace_events.c          |  18 +--
 kernel/trace/trace_events_inject.c   |   6 +-
 kernel/trace/trace_functions.c       |  28 ++---
 kernel/trace/trace_functions_graph.c |  32 +++---
 kernel/trace/trace_hwlat.c           |   7 +-
 kernel/trace/trace_irqsoff.c         |  86 ++++++---------
 kernel/trace/trace_kprobe.c          |  10 +-
 kernel/trace/trace_mmiotrace.c       |  14 ++-
 kernel/trace/trace_sched_wakeup.c    |  71 ++++++------
 kernel/trace/trace_syscalls.c        |  20 ++--
 kernel/trace/trace_uprobe.c          |   4 +-
 17 files changed, 287 insertions(+), 308 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index d321fe5ad1a1..091250b0895a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -148,17 +148,29 @@ enum print_line_t {
 
 enum print_line_t trace_handle_return(struct trace_seq *s);
 
-void tracing_generic_entry_update(struct trace_entry *entry,
-				  unsigned short type,
-				  unsigned long flags,
-				  int pc);
+static inline void tracing_generic_entry_update(struct trace_entry *entry,
+						unsigned short type,
+						unsigned int trace_ctx)
+{
+	struct task_struct *tsk = current;
+
+	entry->preempt_count		= trace_ctx & 0xff;
+	entry->pid			= (tsk) ? tsk->pid : 0;
+	entry->type			= type;
+	entry->flags =			trace_ctx >> 16;
+}
+
+unsigned int tracing_gen_ctx_flags(unsigned long irqflags);
+unsigned int tracing_gen_ctx(void);
+unsigned int tracing_gen_ctx_dec(void);
+
 struct trace_event_file;
 
 struct ring_buffer_event *
 trace_event_buffer_lock_reserve(struct trace_buffer **current_buffer,
 				struct trace_event_file *trace_file,
 				int type, unsigned long len,
-				unsigned long flags, int pc);
+				unsigned int trace_ctx);
 
 #define TRACE_RECORD_CMDLINE	BIT(0)
 #define TRACE_RECORD_TGID	BIT(1)
@@ -232,8 +244,7 @@ struct trace_event_buffer {
 	struct ring_buffer_event	*event;
 	struct trace_event_file		*trace_file;
 	void				*entry;
-	unsigned long			flags;
-	int				pc;
+	unsigned int			trace_ctx;
 	struct pt_regs			*regs;
 };
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb0fe4c66b84..c54eae2ab208 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,17 +72,17 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 	struct blk_io_trace *t;
 	struct ring_buffer_event *event = NULL;
 	struct trace_buffer *buffer = NULL;
-	int pc = 0;
+	unsigned int trace_ctx = 0;
 	int cpu = smp_processor_id();
 	bool blk_tracer = blk_tracer_enabled;
 	ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
 
 	if (blk_tracer) {
 		buffer = blk_tr->array_buffer.buffer;
-		pc = preempt_count();
+		trace_ctx = tracing_gen_ctx_flags(0);
 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
 						  sizeof(*t) + len + cgid_len,
-						  0, pc);
+						  trace_ctx);
 		if (!event)
 			return;
 		t = ring_buffer_event_data(event);
@@ -107,7 +107,7 @@ record_it:
 		memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
 
 		if (blk_tracer)
-			trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+			trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
 	}
 }
 
@@ -222,8 +222,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	struct blk_io_trace *t;
 	unsigned long flags = 0;
 	unsigned long *sequence;
+	unsigned int trace_ctx = 0;
 	pid_t pid;
-	int cpu, pc = 0;
+	int cpu;
 	bool blk_tracer = blk_tracer_enabled;
 	ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
 
@@ -252,10 +253,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		tracing_record_cmdline(current);
 
 		buffer = blk_tr->array_buffer.buffer;
-		pc = preempt_count();
+		trace_ctx = tracing_gen_ctx_flags(0);
 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
 						  sizeof(*t) + pdu_len + cgid_len,
-						  0, pc);
+						  trace_ctx);
 		if (!event)
 			return;
 		t = ring_buffer_event_data(event);
@@ -301,7 +302,7 @@ record_it:
 			memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
 
 		if (blk_tracer) {
-			trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
+			trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
 			return;
 		}
 	}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9e4f4043a3df..0b3cce6ecf52 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -176,7 +176,7 @@ static union trace_eval_map_item *trace_eval_maps;
 int tracing_set_tracer(struct trace_array *tr, const char *buf);
 static void ftrace_trace_userstack(struct trace_array *tr,
 				   struct trace_buffer *buffer,
-				   unsigned long flags, int pc);
+				   unsigned int trace_ctx);
 
 #define MAX_TRACER_SIZE		100
 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -906,23 +906,23 @@ static inline void trace_access_lock_init(void)
 
 #ifdef CONFIG_STACKTRACE
 static void __ftrace_trace_stack(struct trace_buffer *buffer,
-				 unsigned long flags,
-				 int skip, int pc, struct pt_regs *regs);
+				 unsigned int trace_ctx,
+				 int skip, struct pt_regs *regs);
 static inline void ftrace_trace_stack(struct trace_array *tr,
 				      struct trace_buffer *buffer,
-				      unsigned long flags,
-				      int skip, int pc, struct pt_regs *regs);
+				      unsigned int trace_ctx,
+				      int skip, struct pt_regs *regs);
 
 #else
 static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
-					unsigned long flags,
-					int skip, int pc, struct pt_regs *regs)
+					unsigned int trace_ctx,
+					int skip, struct pt_regs *regs)
 {
 }
 static inline void ftrace_trace_stack(struct trace_array *tr,
 				      struct trace_buffer *buffer,
-				      unsigned long flags,
-				      int skip, int pc, struct pt_regs *regs)
+				      unsigned long trace_ctx,
+				      int skip, struct pt_regs *regs)
 {
 }
 
@@ -930,24 +930,24 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
 
 static __always_inline void
 trace_event_setup(struct ring_buffer_event *event,
-		  int type, unsigned long flags, int pc)
+		  int type, unsigned int trace_ctx)
 {
 	struct trace_entry *ent = ring_buffer_event_data(event);
 
-	tracing_generic_entry_update(ent, type, flags, pc);
+	tracing_generic_entry_update(ent, type, trace_ctx);
 }
 
 static __always_inline struct ring_buffer_event *
 __trace_buffer_lock_reserve(struct trace_buffer *buffer,
 			  int type,
 			  unsigned long len,
-			  unsigned long flags, int pc)
+			  unsigned int trace_ctx)
 {
 	struct ring_buffer_event *event;
 
 	event = ring_buffer_lock_reserve(buffer, len);
 	if (event != NULL)
-		trace_event_setup(event, type, flags, pc);
+		trace_event_setup(event, type, trace_ctx);
 
 	return event;
 }
@@ -1008,25 +1008,22 @@ int __trace_puts(unsigned long ip, const char *str, int size)
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
 	struct print_entry *entry;
-	unsigned long irq_flags;
+	unsigned int trace_ctx;
 	int alloc;
-	int pc;
 
 	if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
 		return 0;
 
-	pc = preempt_count();
-
 	if (unlikely(tracing_selftest_running || tracing_disabled))
 		return 0;
 
 	alloc = sizeof(*entry) + size + 2; /* possible \n added */
 
-	local_save_flags(irq_flags);
+	trace_ctx = tracing_gen_ctx();
 	buffer = global_trace.array_buffer.buffer;
 	ring_buffer_nest_start(buffer);
-	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, 
-					    irq_flags, pc);
+	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+					    trace_ctx);
 	if (!event) {
 		size = 0;
 		goto out;
@@ -1045,7 +1042,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
 		entry->buf[size] = '\0';
 
 	__buffer_unlock_commit(buffer, event);
-	ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+	ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
  out:
 	ring_buffer_nest_end(buffer);
 	return size;
@@ -1062,25 +1059,22 @@ int __trace_bputs(unsigned long ip, const char *str)
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
 	struct bputs_entry *entry;
-	unsigned long irq_flags;
+	unsigned int trace_ctx;
 	int size = sizeof(struct bputs_entry);
 	int ret = 0;
-	int pc;
 
 	if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
 		return 0;
 
-	pc = preempt_count();
-
 	if (unlikely(tracing_selftest_running || tracing_disabled))
 		return 0;
 
-	local_save_flags(irq_flags);
+	trace_ctx = tracing_gen_ctx();
 	buffer = global_trace.array_buffer.buffer;
 
 	ring_buffer_nest_start(buffer);
 	event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
-					    irq_flags, pc);
+					    trace_ctx);
 	if (!event)
 		goto out;
 
@@ -1089,7 +1083,7 @@ int __trace_bputs(unsigned long ip, const char *str)
 	entry->str			= str;
 
 	__buffer_unlock_commit(buffer, event);
-	ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
+	ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
 
 	ret = 1;
  out:
@@ -2585,36 +2579,69 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
 }
 EXPORT_SYMBOL_GPL(trace_handle_return);
 
-void
-tracing_generic_entry_update(struct trace_entry *entry, unsigned short type,
-			     unsigned long flags, int pc)
+unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
 {
-	struct task_struct *tsk = current;
+	unsigned int trace_flags = 0;
+	unsigned int pc;
+
+	pc = preempt_count();
 
-	entry->preempt_count		= pc & 0xff;
-	entry->pid			= (tsk) ? tsk->pid : 0;
-	entry->type			= type;
-	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
-		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+	if (irqs_disabled_flags(irqflags))
+		trace_flags |= TRACE_FLAG_IRQS_OFF;
 #else
-		TRACE_FLAG_IRQS_NOSUPPORT |
+	trace_flags |= TRACE_FLAG_IRQS_NOSUPPORT;
 #endif
-		((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
-		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
-		((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
-		(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
-		(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
+
+	if (pc & NMI_MASK)
+		trace_flags |= TRACE_FLAG_NMI;
+	if (pc & HARDIRQ_MASK)
+		trace_flags |= TRACE_FLAG_HARDIRQ;
+
+	if (pc & SOFTIRQ_OFFSET)
+		trace_flags |= TRACE_FLAG_SOFTIRQ;
+
+	if (tif_need_resched())
+		trace_flags |= TRACE_FLAG_NEED_RESCHED;
+	if (test_preempt_need_resched())
+		trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
+	return (trace_flags << 16) | (pc & 0xff);
+}
+
+unsigned int tracing_gen_ctx(void)
+{
+	unsigned long irqflags;
+
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
+	local_save_flags(irqflags);
+#else
+	irqflags = 0;
+#endif
+	return tracing_gen_ctx_flags(irqflags);
+}
+
+unsigned int tracing_gen_ctx_dec(void)
+{
+	unsigned int trace_ctx;
+
+	trace_ctx = tracing_gen_ctx();
+
+	/*
+	 * Subtract one from the preeption counter if preemption is enabled,
+	 * see trace_event_buffer_reserve()for details.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPTION))
+		trace_ctx--;
+	return trace_ctx;
 }
-EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
 
 struct ring_buffer_event *
 trace_buffer_lock_reserve(struct trace_buffer *buffer,
 			  int type,
 			  unsigned long len,
-			  unsigned long flags, int pc)
+			  unsigned int trace_ctx)
 {
-	return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
+	return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx);
 }
 
 DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -2734,7 +2761,7 @@ struct ring_buffer_event *
 trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
 			  struct trace_event_file *trace_file,
 			  int type, unsigned long len,
-			  unsigned long flags, int pc)
+			  unsigned int trace_ctx)
 {
 	struct ring_buffer_event *entry;
 	int val;
@@ -2747,15 +2774,15 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
 		/* Try to use the per cpu buffer first */
 		val = this_cpu_inc_return(trace_buffered_event_cnt);
 		if (val == 1) {
-			trace_event_setup(entry, type, flags, pc);
+			trace_event_setup(entry, type, trace_ctx);
 			entry->array[0] = len;
 			return entry;
 		}
 		this_cpu_dec(trace_buffered_event_cnt);
 	}
 
-	entry = __trace_buffer_lock_reserve(*current_rb,
-					    type, len, flags, pc);
+	entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+					    trace_ctx);
 	/*
 	 * If tracing is off, but we have triggers enabled
 	 * we still need to look at the event data. Use the temp_buffer
@@ -2764,8 +2791,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
 	 */
 	if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
 		*current_rb = temp_buffer;
-		entry = __trace_buffer_lock_reserve(*current_rb,
-						    type, len, flags, pc);
+		entry = __trace_buffer_lock_reserve(*current_rb, type, len,
+						    trace_ctx);
 	}
 	return entry;
 }
@@ -2851,7 +2878,7 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
 		ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);
 	event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
 				    fbuffer->event, fbuffer->entry,
-				    fbuffer->flags, fbuffer->pc, fbuffer->regs);
+				    fbuffer->trace_ctx, fbuffer->regs);
 }
 EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
 
@@ -2867,7 +2894,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct trace_buffer *buffer,
 				     struct ring_buffer_event *event,
-				     unsigned long flags, int pc,
+				     unsigned int trace_ctx,
 				     struct pt_regs *regs)
 {
 	__buffer_unlock_commit(buffer, event);
@@ -2878,8 +2905,8 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 	 * and mmiotrace, but that's ok if they lose a function or
 	 * two. They are not that meaningful.
 	 */
-	ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
-	ftrace_trace_userstack(tr, buffer, flags, pc);
+	ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs);
+	ftrace_trace_userstack(tr, buffer, trace_ctx);
 }
 
 /*
@@ -2893,9 +2920,8 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
 }
 
 void
-trace_function(struct trace_array *tr,
-	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
-	       int pc)
+trace_function(struct trace_array *tr, unsigned long ip, unsigned long
+	       parent_ip, unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_function;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -2903,7 +2929,7 @@ trace_function(struct trace_array *tr,
 	struct ftrace_entry *entry;
 
 	event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
-					    flags, pc);
+					    trace_ctx);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -2937,8 +2963,8 @@ static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
 static DEFINE_PER_CPU(int, ftrace_stack_reserve);
 
 static void __ftrace_trace_stack(struct trace_buffer *buffer,
-				 unsigned long flags,
-				 int skip, int pc, struct pt_regs *regs)
+				 unsigned int trace_ctx,
+				 int skip, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &event_kernel_stack;
 	struct ring_buffer_event *event;
@@ -2985,7 +3011,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
 
 	size = nr_entries * sizeof(unsigned long);
 	event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
-					    sizeof(*entry) + size, flags, pc);
+					    sizeof(*entry) + size, trace_ctx);
 	if (!event)
 		goto out;
 	entry = ring_buffer_event_data(event);
@@ -3006,22 +3032,22 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
 
 static inline void ftrace_trace_stack(struct trace_array *tr,
 				      struct trace_buffer *buffer,
-				      unsigned long flags,
-				      int skip, int pc, struct pt_regs *regs)
+				      unsigned int trace_ctx,
+				      int skip, struct pt_regs *regs)
 {
 	if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
 		return;
 
-	__ftrace_trace_stack(buffer, flags, skip, pc, regs);
+	__ftrace_trace_stack(buffer, trace_ctx, skip, regs);
 }
 
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
-		   int pc)
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+		   int skip)
 {
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 
 	if (rcu_is_watching()) {
-		__ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+		__ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
 		return;
 	}
 
@@ -3035,7 +3061,7 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
 		return;
 
 	rcu_irq_enter_irqson();
-	__ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+	__ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
 	rcu_irq_exit_irqson();
 }
 
@@ -3045,19 +3071,15 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
  */
 void trace_dump_stack(int skip)
 {
-	unsigned long flags;
-
 	if (tracing_disabled || tracing_selftest_running)
 		return;
 
-	local_save_flags(flags);
-
 #ifndef CONFIG_UNWINDER_ORC
 	/* Skip 1 to skip this function. */
 	skip++;
 #endif
 	__ftrace_trace_stack(global_trace.array_buffer.buffer,
-			     flags, skip, preempt_count(), NULL);
+			     tracing_gen_ctx(), skip, NULL);
 }
 EXPORT_SYMBOL_GPL(trace_dump_stack);
 
@@ -3066,7 +3088,7 @@ static DEFINE_PER_CPU(int, user_stack_count);
 
 static void
 ftrace_trace_userstack(struct trace_array *tr,
-		       struct trace_buffer *buffer, unsigned long flags, int pc)
+		       struct trace_buffer *buffer, unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_user_stack;
 	struct ring_buffer_event *event;
@@ -3093,7 +3115,7 @@ ftrace_trace_userstack(struct trace_array *tr,
 	__this_cpu_inc(user_stack_count);
 
 	event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
-					    sizeof(*entry), flags, pc);
+					    sizeof(*entry), trace_ctx);
 	if (!event)
 		goto out_drop_count;
 	entry	= ring_buffer_event_data(event);
@@ -3113,7 +3135,7 @@ ftrace_trace_userstack(struct trace_array *tr,
 #else /* CONFIG_USER_STACKTRACE_SUPPORT */
 static void ftrace_trace_userstack(struct trace_array *tr,
 				   struct trace_buffer *buffer,
-				   unsigned long flags, int pc)
+				   unsigned int trace_ctx)
 {
 }
 #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
@@ -3243,9 +3265,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	struct trace_buffer *buffer;
 	struct trace_array *tr = &global_trace;
 	struct bprint_entry *entry;
-	unsigned long flags;
+	unsigned int trace_ctx;
 	char *tbuffer;
-	int len = 0, size, pc;
+	int len = 0, size;
 
 	if (unlikely(tracing_selftest_running || tracing_disabled))
 		return 0;
@@ -3253,7 +3275,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	/* Don't pollute graph traces with trace_vprintk internals */
 	pause_graph_tracing();
 
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 	preempt_disable_notrace();
 
 	tbuffer = get_trace_buf();
@@ -3267,12 +3289,11 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
 		goto out_put;
 
-	local_save_flags(flags);
 	size = sizeof(*entry) + sizeof(u32) * len;
 	buffer = tr->array_buffer.buffer;
 	ring_buffer_nest_start(buffer);
 	event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
-					    flags, pc);
+					    trace_ctx);
 	if (!event)
 		goto out;
 	entry = ring_buffer_event_data(event);
@@ -3282,7 +3303,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	memcpy(entry->buf, tbuffer, sizeof(u32) * len);
 	if (!call_filter_check_discard(call, entry, buffer, event)) {
 		__buffer_unlock_commit(buffer, event);
-		ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
+		ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
 	}
 
 out:
@@ -3305,9 +3326,9 @@ __trace_array_vprintk(struct trace_buffer *buffer,
 {
 	struct trace_event_call *call = &event_print;
 	struct ring_buffer_event *event;
-	int len = 0, size, pc;
+	int len = 0, size;
 	struct print_entry *entry;
-	unsigned long flags;
+	unsigned int trace_ctx;
 	char *tbuffer;
 
 	if (tracing_disabled || tracing_selftest_running)
@@ -3316,7 +3337,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
 	/* Don't pollute graph traces with trace_vprintk internals */
 	pause_graph_tracing();
 
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 	preempt_disable_notrace();
 
 
@@ -3328,11 +3349,10 @@ __trace_array_vprintk(struct trace_buffer *buffer,
 
 	len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
 
-	local_save_flags(flags);
 	size = sizeof(*entry) + len + 1;
 	ring_buffer_nest_start(buffer);
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
-					    flags, pc);
+					    trace_ctx);
 	if (!event)
 		goto out;
 	entry = ring_buffer_event_data(event);
@@ -3341,7 +3361,7 @@ __trace_array_vprintk(struct trace_buffer *buffer,
 	memcpy(&entry->buf, tbuffer, len + 1);
 	if (!call_filter_check_discard(call, entry, buffer, event)) {
 		__buffer_unlock_commit(buffer, event);
-		ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
+		ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
 	}
 
 out:
@@ -6654,7 +6674,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	enum event_trigger_type tt = ETT_NONE;
 	struct trace_buffer *buffer;
 	struct print_entry *entry;
-	unsigned long irq_flags;
 	ssize_t written;
 	int size;
 	int len;
@@ -6674,7 +6693,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 
 	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
 
-	local_save_flags(irq_flags);
 	size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
 
 	/* If less than "<faulted>", then make sure we can still add that */
@@ -6683,7 +6701,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
-					    irq_flags, preempt_count());
+					    tracing_gen_ctx());
 	if (unlikely(!event))
 		/* Ring buffer disabled, return as if not open for write */
 		return -EBADF;
@@ -6735,7 +6753,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
 	struct raw_data_entry *entry;
-	unsigned long irq_flags;
 	ssize_t written;
 	int size;
 	int len;
@@ -6757,14 +6774,13 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 
 	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
 
-	local_save_flags(irq_flags);
 	size = sizeof(*entry) + cnt;
 	if (cnt < FAULT_SIZE_ID)
 		size += FAULT_SIZE_ID - cnt;
 
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
-					    irq_flags, preempt_count());
+					    tracing_gen_ctx());
 	if (!event)
 		/* Ring buffer disabled, return as if not open for write */
 		return -EBADF;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e448d2da0b99..8daf3a0758b1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -589,8 +589,7 @@ struct ring_buffer_event *
 trace_buffer_lock_reserve(struct trace_buffer *buffer,
 			  int type,
 			  unsigned long len,
-			  unsigned long flags,
-			  int pc);
+			  unsigned int trace_ctx);
 
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
@@ -615,11 +614,11 @@ unsigned long trace_total_entries(struct trace_array *tr);
 void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
-		    unsigned long flags, int pc);
+		    unsigned int trace_ctx);
 void trace_graph_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
-		    unsigned long flags, int pc);
+		    unsigned int trace_ctx);
 void trace_latency_header(struct seq_file *m);
 void trace_default_header(struct seq_file *m);
 void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
@@ -687,11 +686,10 @@ static inline void latency_fsnotify(struct trace_array *tr) { }
 #endif
 
 #ifdef CONFIG_STACKTRACE
-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
-		   int pc);
+void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip);
 #else
-static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
-				 int skip, int pc)
+static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
+				 int skip)
 {
 }
 #endif /* CONFIG_STACKTRACE */
@@ -831,10 +829,10 @@ extern void graph_trace_open(struct trace_iterator *iter);
 extern void graph_trace_close(struct trace_iterator *iter);
 extern int __trace_graph_entry(struct trace_array *tr,
 			       struct ftrace_graph_ent *trace,
-			       unsigned long flags, int pc);
+			       unsigned int trace_ctx);
 extern void __trace_graph_return(struct trace_array *tr,
 				 struct ftrace_graph_ret *trace,
-				 unsigned long flags, int pc);
+				 unsigned int trace_ctx);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern struct ftrace_hash __rcu *ftrace_graph_hash;
@@ -1297,15 +1295,15 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct trace_buffer *buffer,
 				     struct ring_buffer_event *event,
-				     unsigned long flags, int pc,
+				     unsigned int trcace_ctx,
 				     struct pt_regs *regs);
 
 static inline void trace_buffer_unlock_commit(struct trace_array *tr,
 					      struct trace_buffer *buffer,
 					      struct ring_buffer_event *event,
-					      unsigned long flags, int pc)
+					      unsigned int trace_ctx)
 {
-	trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
+	trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL);
 }
 
 DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
@@ -1366,8 +1364,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
  * @buffer: The ring buffer that the event is being written to
  * @event: The event meta data in the ring buffer
  * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
+ * @trace_ctx: The tracing context flags.
  *
  * This is a helper function to handle triggers that require data
  * from the event itself. It also tests the event against filters and
@@ -1377,12 +1374,12 @@ static inline void
 event_trigger_unlock_commit(struct trace_event_file *file,
 			    struct trace_buffer *buffer,
 			    struct ring_buffer_event *event,
-			    void *entry, unsigned long irq_flags, int pc)
+			    void *entry, unsigned int trace_ctx)
 {
 	enum event_trigger_type tt = ETT_NONE;
 
 	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
-		trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
+		trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx);
 
 	if (tt)
 		event_triggers_post_call(file, tt);
@@ -1394,8 +1391,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
  * @buffer: The ring buffer that the event is being written to
  * @event: The event meta data in the ring buffer
  * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
+ * @trace_ctx: The tracing context flags.
  *
  * This is a helper function to handle triggers that require data
  * from the event itself. It also tests the event against filters and
@@ -1408,14 +1404,14 @@ static inline void
 event_trigger_unlock_commit_regs(struct trace_event_file *file,
 				 struct trace_buffer *buffer,
 				 struct ring_buffer_event *event,
-				 void *entry, unsigned long irq_flags, int pc,
+				 void *entry, unsigned int trace_ctx,
 				 struct pt_regs *regs)
 {
 	enum event_trigger_type tt = ETT_NONE;
 
 	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
 		trace_buffer_unlock_commit_regs(file->tr, buffer, event,
-						irq_flags, pc, regs);
+						trace_ctx, regs);
 
 	if (tt)
 		event_triggers_post_call(file, tt);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index eff099123aa2..e47fdb4c92fb 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 	struct ring_buffer_event *event;
 	struct trace_branch *entry;
 	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 	const char *p;
 
 	if (current->trace_recursion & TRACE_BRANCH_BIT)
@@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 	if (atomic_read(&data->disabled))
 		goto out;
 
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx_flags(flags);
 	buffer = tr->array_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event)
 		goto out;
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a71181655958..288ad2c274fb 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -421,11 +421,8 @@ NOKPROBE_SYMBOL(perf_trace_buf_alloc);
 void perf_trace_buf_update(void *record, u16 type)
 {
 	struct trace_entry *entry = record;
-	int pc = preempt_count();
-	unsigned long flags;
 
-	local_save_flags(flags);
-	tracing_generic_entry_update(entry, type, flags, pc);
+	tracing_generic_entry_update(entry, type, tracing_gen_ctx());
 }
 NOKPROBE_SYMBOL(perf_trace_buf_update);
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e9d28eeccb7e..20ccce3e4ffb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -258,22 +258,19 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
 	    trace_event_ignore_this_pid(trace_file))
 		return NULL;
 
-	local_save_flags(fbuffer->flags);
-	fbuffer->pc = preempt_count();
 	/*
 	 * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
 	 * preemption (adding one to the preempt_count). Since we are
 	 * interested in the preempt_count at the time the tracepoint was
 	 * hit, we need to subtract one to offset the increment.
 	 */
-	if (IS_ENABLED(CONFIG_PREEMPTION))
-		fbuffer->pc--;
+	fbuffer->trace_ctx = tracing_gen_ctx_dec();
 	fbuffer->trace_file = trace_file;
 
 	fbuffer->event =
 		trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
 						event_call->event.type, len,
-						fbuffer->flags, fbuffer->pc);
+						fbuffer->trace_ctx);
 	if (!fbuffer->event)
 		return NULL;
 
@@ -3678,12 +3675,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
 	struct trace_buffer *buffer;
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
-	unsigned long flags;
+	unsigned int trace_ctx;
 	long disabled;
 	int cpu;
-	int pc;
 
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 	preempt_disable_notrace();
 	cpu = raw_smp_processor_id();
 	disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
@@ -3691,11 +3687,9 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
 	if (disabled != 1)
 		goto out;
 
-	local_save_flags(flags);
-
 	event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
 						TRACE_FN, sizeof(*entry),
-						flags, pc);
+						trace_ctx);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
@@ -3703,7 +3697,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
 	entry->parent_ip		= parent_ip;
 
 	event_trigger_unlock_commit(&event_trace_file, buffer, event,
-				    entry, flags, pc);
+				    entry, trace_ctx);
  out:
 	atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
 	preempt_enable_notrace();
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 22bcf7c51d1e..c188045c5f97 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -192,7 +192,6 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size)
 static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
 {
 	struct ftrace_event_field *field;
-	unsigned long irq_flags;
 	void *entry = NULL;
 	int entry_size;
 	u64 val = 0;
@@ -203,9 +202,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry)
 	if (!entry)
 		return -ENOMEM;
 
-	local_save_flags(irq_flags);
-	tracing_generic_entry_update(entry, call->event.type, irq_flags,
-				     preempt_count());
+	tracing_generic_entry_update(entry, call->event.type,
+				     tracing_gen_ctx());
 
 	while ((len = parse_field(str, call, &field, &val)) > 0) {
 		if (is_function_field(field))
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index f67aec5bb771..f93723ca66bc 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -131,10 +131,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 {
 	struct trace_array *tr = op->private;
 	struct trace_array_cpu *data;
-	unsigned long flags;
+	unsigned int trace_ctx;
 	int bit;
 	int cpu;
-	int pc;
 
 	if (unlikely(!tr->function_enabled))
 		return;
@@ -143,15 +142,14 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 	if (bit < 0)
 		return;
 
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 	preempt_disable_notrace();
 
 	cpu = smp_processor_id();
 	data = per_cpu_ptr(tr->array_buffer.data, cpu);
-	if (!atomic_read(&data->disabled)) {
-		local_save_flags(flags);
-		trace_function(tr, ip, parent_ip, flags, pc);
-	}
+	if (!atomic_read(&data->disabled))
+		trace_function(tr, ip, parent_ip, trace_ctx);
+
 	ftrace_test_recursion_unlock(bit);
 	preempt_enable_notrace();
 }
@@ -183,7 +181,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	unsigned long flags;
 	long disabled;
 	int cpu;
-	int pc;
+	unsigned int trace_ctx;
 
 	if (unlikely(!tr->function_enabled))
 		return;
@@ -198,9 +196,9 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	disabled = atomic_inc_return(&data->disabled);
 
 	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		trace_function(tr, ip, parent_ip, flags, pc);
-		__trace_stack(tr, flags, STACK_SKIP, pc);
+		trace_ctx = tracing_gen_ctx_flags(flags);
+		trace_function(tr, ip, parent_ip, trace_ctx);
+		__trace_stack(tr, trace_ctx, STACK_SKIP);
 	}
 
 	atomic_dec(&data->disabled);
@@ -403,13 +401,11 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
 
 static __always_inline void trace_stack(struct trace_array *tr)
 {
-	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 
-	local_save_flags(flags);
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 
-	__trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
+	__trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
 }
 
 static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d874dec87131..0aa6e6faa943 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -96,8 +96,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
 
 int __trace_graph_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
-				unsigned long flags,
-				int pc)
+				unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_funcgraph_entry;
 	struct ring_buffer_event *event;
@@ -105,7 +104,7 @@ int __trace_graph_entry(struct trace_array *tr,
 	struct ftrace_graph_ent_entry *entry;
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event)
 		return 0;
 	entry	= ring_buffer_event_data(event);
@@ -129,10 +128,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	struct trace_array *tr = graph_array;
 	struct trace_array_cpu *data;
 	unsigned long flags;
+	unsigned int trace_ctx;
 	long disabled;
 	int ret;
 	int cpu;
-	int pc;
 
 	if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
 		return 0;
@@ -174,8 +173,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	data = per_cpu_ptr(tr->array_buffer.data, cpu);
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		ret = __trace_graph_entry(tr, trace, flags, pc);
+		trace_ctx = tracing_gen_ctx_flags(flags);
+		ret = __trace_graph_entry(tr, trace, trace_ctx);
 	} else {
 		ret = 0;
 	}
@@ -188,7 +187,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 
 static void
 __trace_graph_function(struct trace_array *tr,
-		unsigned long ip, unsigned long flags, int pc)
+		unsigned long ip, unsigned int trace_ctx)
 {
 	u64 time = trace_clock_local();
 	struct ftrace_graph_ent ent = {
@@ -202,22 +201,21 @@ __trace_graph_function(struct trace_array *tr,
 		.rettime  = time,
 	};
 
-	__trace_graph_entry(tr, &ent, flags, pc);
-	__trace_graph_return(tr, &ret, flags, pc);
+	__trace_graph_entry(tr, &ent, trace_ctx);
+	__trace_graph_return(tr, &ret, trace_ctx);
 }
 
 void
 trace_graph_function(struct trace_array *tr,
 		unsigned long ip, unsigned long parent_ip,
-		unsigned long flags, int pc)
+		unsigned int trace_ctx)
 {
-	__trace_graph_function(tr, ip, flags, pc);
+	__trace_graph_function(tr, ip, trace_ctx);
 }
 
 void __trace_graph_return(struct trace_array *tr,
 				struct ftrace_graph_ret *trace,
-				unsigned long flags,
-				int pc)
+				unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_funcgraph_exit;
 	struct ring_buffer_event *event;
@@ -225,7 +223,7 @@ void __trace_graph_return(struct trace_array *tr,
 	struct ftrace_graph_ret_entry *entry;
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -239,9 +237,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 	struct trace_array *tr = graph_array;
 	struct trace_array_cpu *data;
 	unsigned long flags;
+	unsigned int trace_ctx;
 	long disabled;
 	int cpu;
-	int pc;
 
 	ftrace_graph_addr_finish(trace);
 
@@ -255,8 +253,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 	data = per_cpu_ptr(tr->array_buffer.data, cpu);
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		__trace_graph_return(tr, trace, flags, pc);
+		trace_ctx = tracing_gen_ctx_flags(flags);
+		__trace_graph_return(tr, trace, trace_ctx);
 	}
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index c0df9b97f147..34dc1a712dcb 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -108,14 +108,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct hwlat_entry *entry;
-	unsigned long flags;
-	int pc;
-
-	pc = preempt_count();
-	local_save_flags(flags);
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry),
-					  flags, pc);
+					  tracing_gen_ctx());
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 6756379b661f..590b3d51afae 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -143,11 +143,14 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
+	unsigned int trace_ctx;
 
 	if (!func_prolog_dec(tr, &data, &flags))
 		return;
 
-	trace_function(tr, ip, parent_ip, flags, preempt_count());
+	trace_ctx = tracing_gen_ctx_flags(flags);
+
+	trace_function(tr, ip, parent_ip, trace_ctx);
 
 	atomic_dec(&data->disabled);
 }
@@ -177,8 +180,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
+	unsigned int trace_ctx;
 	int ret;
-	int pc;
 
 	if (ftrace_graph_ignore_func(trace))
 		return 0;
@@ -195,8 +198,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
 	if (!func_prolog_dec(tr, &data, &flags))
 		return 0;
 
-	pc = preempt_count();
-	ret = __trace_graph_entry(tr, trace, flags, pc);
+	trace_ctx = tracing_gen_ctx_flags(flags);
+	ret = __trace_graph_entry(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
 
 	return ret;
@@ -207,15 +210,15 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 
 	ftrace_graph_addr_finish(trace);
 
 	if (!func_prolog_dec(tr, &data, &flags))
 		return;
 
-	pc = preempt_count();
-	__trace_graph_return(tr, trace, flags, pc);
+	trace_ctx = tracing_gen_ctx_flags(flags);
+	__trace_graph_return(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
 }
 
@@ -267,12 +270,12 @@ static void irqsoff_print_header(struct seq_file *s)
 static void
 __trace_function(struct trace_array *tr,
 		 unsigned long ip, unsigned long parent_ip,
-		 unsigned long flags, int pc)
+		 unsigned int trace_ctx)
 {
 	if (is_graph(tr))
-		trace_graph_function(tr, ip, parent_ip, flags, pc);
+		trace_graph_function(tr, ip, parent_ip, trace_ctx);
 	else
-		trace_function(tr, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, trace_ctx);
 }
 
 #else
@@ -322,15 +325,13 @@ check_critical_timing(struct trace_array *tr,
 {
 	u64 T0, T1, delta;
 	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 
 	T0 = data->preempt_timestamp;
 	T1 = ftrace_now(cpu);
 	delta = T1-T0;
 
-	local_save_flags(flags);
-
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 
 	if (!report_latency(tr, delta))
 		goto out;
@@ -341,9 +342,9 @@ check_critical_timing(struct trace_array *tr,
 	if (!report_latency(tr, delta))
 		goto out_unlock;
 
-	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+	__trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx);
 	/* Skip 5 functions to get to the irq/preempt enable function */
-	__trace_stack(tr, flags, 5, pc);
+	__trace_stack(tr, trace_ctx, 5);
 
 	if (data->critical_sequence != max_sequence)
 		goto out_unlock;
@@ -363,16 +364,15 @@ out_unlock:
 out:
 	data->critical_sequence = max_sequence;
 	data->preempt_timestamp = ftrace_now(cpu);
-	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+	__trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx);
 }
 
 static nokprobe_inline void
-start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
 {
 	int cpu;
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
-	unsigned long flags;
 
 	if (!tracer_enabled || !tracing_is_enabled())
 		return;
@@ -393,9 +393,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
 	data->preempt_timestamp = ftrace_now(cpu);
 	data->critical_start = parent_ip ? : ip;
 
-	local_save_flags(flags);
-
-	__trace_function(tr, ip, parent_ip, flags, pc);
+	__trace_function(tr, ip, parent_ip, tracing_gen_ctx());
 
 	per_cpu(tracing_cpu, cpu) = 1;
 
@@ -403,12 +401,12 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
 }
 
 static nokprobe_inline void
-stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 {
 	int cpu;
 	struct trace_array *tr = irqsoff_trace;
 	struct trace_array_cpu *data;
-	unsigned long flags;
+	unsigned int trace_ctx;
 
 	cpu = raw_smp_processor_id();
 	/* Always clear the tracing cpu on stopping the trace */
@@ -428,8 +426,8 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
 
 	atomic_inc(&data->disabled);
 
-	local_save_flags(flags);
-	__trace_function(tr, ip, parent_ip, flags, pc);
+	trace_ctx = tracing_gen_ctx();
+	__trace_function(tr, ip, parent_ip, trace_ctx);
 	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
 	data->critical_start = 0;
 	atomic_dec(&data->disabled);
@@ -438,20 +436,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
 /* start and stop critical timings used to for stoppage (in idle) */
 void start_critical_timings(void)
 {
-	int pc = preempt_count();
-
-	if (preempt_trace(pc) || irq_trace())
-		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
+	if (preempt_trace(preempt_count()) || irq_trace())
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL_GPL(start_critical_timings);
 NOKPROBE_SYMBOL(start_critical_timings);
 
 void stop_critical_timings(void)
 {
-	int pc = preempt_count();
-
-	if (preempt_trace(pc) || irq_trace())
-		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
+	if (preempt_trace(preempt_count()) || irq_trace())
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL_GPL(stop_critical_timings);
 NOKPROBE_SYMBOL(stop_critical_timings);
@@ -613,19 +607,15 @@ static void irqsoff_tracer_stop(struct trace_array *tr)
  */
 void tracer_hardirqs_on(unsigned long a0, unsigned long a1)
 {
-	unsigned int pc = preempt_count();
-
-	if (!preempt_trace(pc) && irq_trace())
-		stop_critical_timing(a0, a1, pc);
+	if (!preempt_trace(preempt_count()) && irq_trace())
+		stop_critical_timing(a0, a1);
 }
 NOKPROBE_SYMBOL(tracer_hardirqs_on);
 
 void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
 {
-	unsigned int pc = preempt_count();
-
-	if (!preempt_trace(pc) && irq_trace())
-		start_critical_timing(a0, a1, pc);
+	if (!preempt_trace(preempt_count()) && irq_trace())
+		start_critical_timing(a0, a1);
 }
 NOKPROBE_SYMBOL(tracer_hardirqs_off);
 
@@ -665,18 +655,14 @@ static struct tracer irqsoff_tracer __read_mostly =
 #ifdef CONFIG_PREEMPT_TRACER
 void tracer_preempt_on(unsigned long a0, unsigned long a1)
 {
-	int pc = preempt_count();
-
-	if (preempt_trace(pc) && !irq_trace())
-		stop_critical_timing(a0, a1, pc);
+	if (preempt_trace(preempt_count()) && !irq_trace())
+		stop_critical_timing(a0, a1);
 }
 
 void tracer_preempt_off(unsigned long a0, unsigned long a1)
 {
-	int pc = preempt_count();
-
-	if (preempt_trace(pc) && !irq_trace())
-		start_critical_timing(a0, a1, pc);
+	if (preempt_trace(preempt_count()) && !irq_trace())
+		start_critical_timing(a0, a1);
 }
 
 static int preemptoff_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 56c7fbff7bd7..f6c459aba8a6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1386,8 +1386,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
-	local_save_flags(fbuffer.flags);
-	fbuffer.pc = preempt_count();
+	fbuffer.trace_ctx = tracing_gen_ctx();
 	fbuffer.trace_file = trace_file;
 
 	dsize = __get_data_size(&tk->tp, regs);
@@ -1396,7 +1395,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 		trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
 					call->event.type,
 					sizeof(*entry) + tk->tp.size + dsize,
-					fbuffer.flags, fbuffer.pc);
+					fbuffer.trace_ctx);
 	if (!fbuffer.event)
 		return;
 
@@ -1434,8 +1433,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
-	local_save_flags(fbuffer.flags);
-	fbuffer.pc = preempt_count();
+	fbuffer.trace_ctx = tracing_gen_ctx();
 	fbuffer.trace_file = trace_file;
 
 	dsize = __get_data_size(&tk->tp, regs);
@@ -1443,7 +1441,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
 					call->event.type,
 					sizeof(*entry) + tk->tp.size + dsize,
-					fbuffer.flags, fbuffer.pc);
+					fbuffer.trace_ctx);
 	if (!fbuffer.event)
 		return;
 
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 84582bf1ed5f..7221ae0b4c47 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -300,10 +300,11 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
-	int pc = preempt_count();
+	unsigned int trace_ctx;
 
+	trace_ctx = tracing_gen_ctx_flags(0);
 	event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
-					  sizeof(*entry), 0, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
@@ -312,7 +313,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 	entry->rw			= *rw;
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
+		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -330,10 +331,11 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
-	int pc = preempt_count();
+	unsigned int trace_ctx;
 
+	trace_ctx = tracing_gen_ctx_flags(0);
 	event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
-					  sizeof(*entry), 0, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
@@ -342,7 +344,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 	entry->map			= *map;
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
+		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index c0181066dbe9..e5778d1d7a5b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -67,7 +67,7 @@ static bool function_enabled;
 static int
 func_prolog_preempt_disable(struct trace_array *tr,
 			    struct trace_array_cpu **data,
-			    int *pc)
+			    unsigned int *trace_ctx)
 {
 	long disabled;
 	int cpu;
@@ -75,7 +75,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
 	if (likely(!wakeup_task))
 		return 0;
 
-	*pc = preempt_count();
+	*trace_ctx = tracing_gen_ctx();
 	preempt_disable_notrace();
 
 	cpu = raw_smp_processor_id();
@@ -116,8 +116,8 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
 {
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
-	unsigned long flags;
-	int pc, ret = 0;
+	unsigned int trace_ctx;
+	int ret = 0;
 
 	if (ftrace_graph_ignore_func(trace))
 		return 0;
@@ -131,11 +131,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
 	if (ftrace_graph_notrace_addr(trace->func))
 		return 1;
 
-	if (!func_prolog_preempt_disable(tr, &data, &pc))
+	if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
 		return 0;
 
-	local_save_flags(flags);
-	ret = __trace_graph_entry(tr, trace, flags, pc);
+	ret = __trace_graph_entry(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
 	preempt_enable_notrace();
 
@@ -146,16 +145,14 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
 {
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
-	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 
 	ftrace_graph_addr_finish(trace);
 
-	if (!func_prolog_preempt_disable(tr, &data, &pc))
+	if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
 		return;
 
-	local_save_flags(flags);
-	__trace_graph_return(tr, trace, flags, pc);
+	__trace_graph_return(tr, trace, trace_ctx);
 	atomic_dec(&data->disabled);
 
 	preempt_enable_notrace();
@@ -217,13 +214,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
 	struct trace_array *tr = wakeup_trace;
 	struct trace_array_cpu *data;
 	unsigned long flags;
-	int pc;
+	unsigned int trace_ctx;
 
-	if (!func_prolog_preempt_disable(tr, &data, &pc))
+	if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
 		return;
 
 	local_irq_save(flags);
-	trace_function(tr, ip, parent_ip, flags, pc);
+	trace_function(tr, ip, parent_ip, trace_ctx);
 	local_irq_restore(flags);
 
 	atomic_dec(&data->disabled);
@@ -303,12 +300,12 @@ static void wakeup_print_header(struct seq_file *s)
 static void
 __trace_function(struct trace_array *tr,
 		 unsigned long ip, unsigned long parent_ip,
-		 unsigned long flags, int pc)
+		 unsigned int trace_ctx)
 {
 	if (is_graph(tr))
-		trace_graph_function(tr, ip, parent_ip, flags, pc);
+		trace_graph_function(tr, ip, parent_ip, trace_ctx);
 	else
-		trace_function(tr, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, trace_ctx);
 }
 
 static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -375,7 +372,7 @@ static void
 tracing_sched_switch_trace(struct trace_array *tr,
 			   struct task_struct *prev,
 			   struct task_struct *next,
-			   unsigned long flags, int pc)
+			   unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_context_switch;
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -383,7 +380,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	struct ctx_switch_entry *entry;
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -396,14 +393,14 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_cpu	= task_cpu(next);
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
+		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 static void
 tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct task_struct *wakee,
 			   struct task_struct *curr,
-			   unsigned long flags, int pc)
+			   unsigned int trace_ctx)
 {
 	struct trace_event_call *call = &event_wakeup;
 	struct ring_buffer_event *event;
@@ -411,7 +408,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	struct trace_buffer *buffer = tr->array_buffer.buffer;
 
 	event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
-					  sizeof(*entry), flags, pc);
+					  sizeof(*entry), trace_ctx);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -424,7 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_cpu			= task_cpu(wakee);
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
+		trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
 }
 
 static void notrace
@@ -436,7 +433,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
 	unsigned long flags;
 	long disabled;
 	int cpu;
-	int pc;
+	unsigned int trace_ctx;
 
 	tracing_record_cmdline(prev);
 
@@ -455,8 +452,6 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
 	if (next != wakeup_task)
 		return;
 
-	pc = preempt_count();
-
 	/* disable local data, not wakeup_cpu data */
 	cpu = raw_smp_processor_id();
 	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
@@ -464,6 +459,8 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
 		goto out;
 
 	local_irq_save(flags);
+	trace_ctx = tracing_gen_ctx_flags(flags);
+
 	arch_spin_lock(&wakeup_lock);
 
 	/* We could race with grabbing wakeup_lock */
@@ -473,9 +470,9 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
 	/* The task we are waiting for is waking up */
 	data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
 
-	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
-	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
-	__trace_stack(wakeup_trace, flags, 0, pc);
+	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, trace_ctx);
+	tracing_sched_switch_trace(wakeup_trace, prev, next, trace_ctx);
+	__trace_stack(wakeup_trace, trace_ctx, 0);
 
 	T0 = data->preempt_timestamp;
 	T1 = ftrace_now(cpu);
@@ -527,9 +524,8 @@ probe_wakeup(void *ignore, struct task_struct *p)
 {
 	struct trace_array_cpu *data;
 	int cpu = smp_processor_id();
-	unsigned long flags;
 	long disabled;
-	int pc;
+	unsigned int trace_ctx;
 
 	if (likely(!tracer_enabled))
 		return;
@@ -550,11 +546,12 @@ probe_wakeup(void *ignore, struct task_struct *p)
 	    (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
 		return;
 
-	pc = preempt_count();
 	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
 	if (unlikely(disabled != 1))
 		goto out;
 
+	trace_ctx = tracing_gen_ctx();
+
 	/* interrupts should be off from try_to_wake_up */
 	arch_spin_lock(&wakeup_lock);
 
@@ -581,19 +578,17 @@ probe_wakeup(void *ignore, struct task_struct *p)
 
 	wakeup_task = get_task_struct(p);
 
-	local_save_flags(flags);
-
 	data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu);
 	data->preempt_timestamp = ftrace_now(cpu);
-	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
-	__trace_stack(wakeup_trace, flags, 0, pc);
+	tracing_sched_wakeup_trace(wakeup_trace, p, current, trace_ctx);
+	__trace_stack(wakeup_trace, trace_ctx, 0);
 
 	/*
 	 * We must be careful in using CALLER_ADDR2. But since wake_up
 	 * is not called by an assembly function  (where as schedule is)
 	 * it should be safe to use it here.
 	 */
-	__trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	__trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, trace_ctx);
 
 out_locked:
 	arch_spin_unlock(&wakeup_lock);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d85a2f0f316b..8bfcd3b09422 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -298,9 +298,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
-	unsigned long irq_flags;
+	unsigned int trace_ctx;
 	unsigned long args[6];
-	int pc;
 	int syscall_nr;
 	int size;
 
@@ -322,12 +321,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 
 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
-	local_save_flags(irq_flags);
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 
 	buffer = tr->array_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer,
-			sys_data->enter_event->event.type, size, irq_flags, pc);
+			sys_data->enter_event->event.type, size, trace_ctx);
 	if (!event)
 		return;
 
@@ -337,7 +335,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
 
 	event_trigger_unlock_commit(trace_file, buffer, event, entry,
-				    irq_flags, pc);
+				    trace_ctx);
 }
 
 static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -348,8 +346,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
-	unsigned long irq_flags;
-	int pc;
+	unsigned int trace_ctx;
 	int syscall_nr;
 
 	syscall_nr = trace_get_syscall_nr(current, regs);
@@ -368,13 +365,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	local_save_flags(irq_flags);
-	pc = preempt_count();
+	trace_ctx = tracing_gen_ctx();
 
 	buffer = tr->array_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer,
 			sys_data->exit_event->event.type, sizeof(*entry),
-			irq_flags, pc);
+			trace_ctx);
 	if (!event)
 		return;
 
@@ -383,7 +379,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	entry->ret = syscall_get_return_value(current, regs);
 
 	event_trigger_unlock_commit(trace_file, buffer, event, entry,
-				    irq_flags, pc);
+				    trace_ctx);
 }
 
 static int reg_event_syscall_enter(struct trace_event_file *file,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3cf7128e1ad3..a1ed96a7a462 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -961,7 +961,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 	size = esize + tu->tp.size + dsize;
 	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
-						call->event.type, size, 0, 0);
+						call->event.type, size, 0);
 	if (!event)
 		return;
 
@@ -977,7 +977,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 
 	memcpy(data, ucb->buf, tu->tp.size + dsize);
 
-	event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
+	event_trigger_unlock_commit(trace_file, buffer, event, entry, 0);
 }
 
 /* uprobe handler */
-- 
cgit v1.2.3


From 0c02006e6f5b0a3e73499bbf5943d9174c5ed640 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Jan 2021 20:45:09 +0100
Subject: tracing: Inline tracing_gen_ctx_flags()

Inline tracing_gen_ctx_flags(). This allows to have one ifdef
CONFIG_TRACE_IRQFLAGS_SUPPORT.

This requires to move `trace_flag_type' so tracing_gen_ctx_flags() can
use it.

Link: https://lkml.kernel.org/r/20210125194511.3924915-3-bigeasy@linutronix.de

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/20210125140323.6b1ff20c@gandalf.local.home
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 54 +++++++++++++++++++++++++++++++++++++++++---
 kernel/trace/trace.c         | 38 ++-----------------------------
 kernel/trace/trace.h         | 19 ----------------
 3 files changed, 53 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 091250b0895a..67ae708de40d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -160,9 +160,57 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry,
 	entry->flags =			trace_ctx >> 16;
 }
 
-unsigned int tracing_gen_ctx_flags(unsigned long irqflags);
-unsigned int tracing_gen_ctx(void);
-unsigned int tracing_gen_ctx_dec(void);
+unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);
+
+enum trace_flag_type {
+	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_IRQS_NOSUPPORT	= 0x02,
+	TRACE_FLAG_NEED_RESCHED		= 0x04,
+	TRACE_FLAG_HARDIRQ		= 0x08,
+	TRACE_FLAG_SOFTIRQ		= 0x10,
+	TRACE_FLAG_PREEMPT_RESCHED	= 0x20,
+	TRACE_FLAG_NMI			= 0x40,
+};
+
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
+static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
+{
+	unsigned int irq_status = irqs_disabled_flags(irqflags) ?
+		TRACE_FLAG_IRQS_OFF : 0;
+	return tracing_gen_ctx_irq_test(irq_status);
+}
+static inline unsigned int tracing_gen_ctx(void)
+{
+	unsigned long irqflags;
+
+	local_save_flags(irqflags);
+	return tracing_gen_ctx_flags(irqflags);
+}
+#else
+
+static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
+{
+	return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT);
+}
+static inline unsigned int tracing_gen_ctx(void)
+{
+	return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT);
+}
+#endif
+
+static inline unsigned int tracing_gen_ctx_dec(void)
+{
+	unsigned int trace_ctx;
+
+	trace_ctx = tracing_gen_ctx();
+	/*
+	 * Subtract one from the preeption counter if preemption is enabled,
+	 * see trace_event_buffer_reserve()for details.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPTION))
+		trace_ctx--;
+	return trace_ctx;
+}
 
 struct trace_event_file;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0b3cce6ecf52..584fa2a1304a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2579,20 +2579,13 @@ enum print_line_t trace_handle_return(struct trace_seq *s)
 }
 EXPORT_SYMBOL_GPL(trace_handle_return);
 
-unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
+unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
 {
-	unsigned int trace_flags = 0;
+	unsigned int trace_flags = irqs_status;
 	unsigned int pc;
 
 	pc = preempt_count();
 
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
-	if (irqs_disabled_flags(irqflags))
-		trace_flags |= TRACE_FLAG_IRQS_OFF;
-#else
-	trace_flags |= TRACE_FLAG_IRQS_NOSUPPORT;
-#endif
-
 	if (pc & NMI_MASK)
 		trace_flags |= TRACE_FLAG_NMI;
 	if (pc & HARDIRQ_MASK)
@@ -2608,33 +2601,6 @@ unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
 	return (trace_flags << 16) | (pc & 0xff);
 }
 
-unsigned int tracing_gen_ctx(void)
-{
-	unsigned long irqflags;
-
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
-	local_save_flags(irqflags);
-#else
-	irqflags = 0;
-#endif
-	return tracing_gen_ctx_flags(irqflags);
-}
-
-unsigned int tracing_gen_ctx_dec(void)
-{
-	unsigned int trace_ctx;
-
-	trace_ctx = tracing_gen_ctx();
-
-	/*
-	 * Subtract one from the preeption counter if preemption is enabled,
-	 * see trace_event_buffer_reserve()for details.
-	 */
-	if (IS_ENABLED(CONFIG_PREEMPTION))
-		trace_ctx--;
-	return trace_ctx;
-}
-
 struct ring_buffer_event *
 trace_buffer_lock_reserve(struct trace_buffer *buffer,
 			  int type,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8daf3a0758b1..93fb08ab8bb6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -136,25 +136,6 @@ struct kretprobe_trace_entry_head {
 	unsigned long		ret_ip;
 };
 
-/*
- * trace_flag_type is an enumeration that holds different
- * states when a trace occurs. These are:
- *  IRQS_OFF		- interrupts were disabled
- *  IRQS_NOSUPPORT	- arch does not support irqs_disabled_flags
- *  NEED_RESCHED	- reschedule is requested
- *  HARDIRQ		- inside an interrupt handler
- *  SOFTIRQ		- inside a softirq handler
- */
-enum trace_flag_type {
-	TRACE_FLAG_IRQS_OFF		= 0x01,
-	TRACE_FLAG_IRQS_NOSUPPORT	= 0x02,
-	TRACE_FLAG_NEED_RESCHED		= 0x04,
-	TRACE_FLAG_HARDIRQ		= 0x08,
-	TRACE_FLAG_SOFTIRQ		= 0x10,
-	TRACE_FLAG_PREEMPT_RESCHED	= 0x20,
-	TRACE_FLAG_NMI			= 0x40,
-};
-
 #define TRACE_BUF_SIZE		1024
 
 struct trace_array;
-- 
cgit v1.2.3


From 5817708493bec547914d23dcdd064919ac409f33 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Jan 2021 20:45:11 +0100
Subject: tracing: Remove NULL check from current in
 tracing_generic_entry_update().

I can't imagine when or why `current' would return a NULL pointer. This
check was added in commit
      72829bc3d63cd ("ftrace: move enums to ftrace.h and make helper function global")

but it doesn't give me hint why it was needed.

Assume `current' never returns a NULL pointer and remove the check.

Link: https://lkml.kernel.org/r/20210125194511.3924915-5-bigeasy@linutronix.de

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 67ae708de40d..5d1eeac4bfbe 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -152,10 +152,8 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry,
 						unsigned short type,
 						unsigned int trace_ctx)
 {
-	struct task_struct *tsk = current;
-
 	entry->preempt_count		= trace_ctx & 0xff;
-	entry->pid			= (tsk) ? tsk->pid : 0;
+	entry->pid			= current->pid;
 	entry->type			= type;
 	entry->flags =			trace_ctx >> 16;
 }
-- 
cgit v1.2.3


From 0e1d6f55a12e47942ce207dfb93e23049b454c9e Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso@google.com>
Date: Wed, 3 Feb 2021 00:17:27 +0800
Subject: usb: pd: Update VDO definitions

"PD Spec Revision 3.0 Version 2.0 + ECNs 2020-12-10" introduces several
changes regarding the ID Header VDO and the Product Type VDOs.

Signed-off-by: Kyle Tso <kyletso@google.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20210202161733.932215-3-kyletso@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/class.c  |   8 +-
 include/linux/usb/pd_vdo.h | 308 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 242 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index a7d1bc83c2d4..b4a5d9d4564f 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -88,7 +88,7 @@ static const char * const typec_accessory_modes[] = {
 
 /* Product types defined in USB PD Specification R3.0 V2.0 */
 static const char * const product_type_ufp[8] = {
-	[IDH_PTYPE_UNDEF]		= "undefined",
+	[IDH_PTYPE_NOT_UFP]		= "not_ufp",
 	[IDH_PTYPE_HUB]			= "hub",
 	[IDH_PTYPE_PERIPH]		= "peripheral",
 	[IDH_PTYPE_PSD]			= "psd",
@@ -96,17 +96,17 @@ static const char * const product_type_ufp[8] = {
 };
 
 static const char * const product_type_dfp[8] = {
-	[IDH_PTYPE_DFP_UNDEF]		= "undefined",
+	[IDH_PTYPE_NOT_DFP]		= "not_dfp",
 	[IDH_PTYPE_DFP_HUB]		= "hub",
 	[IDH_PTYPE_DFP_HOST]		= "host",
 	[IDH_PTYPE_DFP_PB]		= "power_brick",
-	[IDH_PTYPE_DFP_AMC]		= "amc",
 };
 
 static const char * const product_type_cable[8] = {
-	[IDH_PTYPE_UNDEF]		= "undefined",
+	[IDH_PTYPE_NOT_CABLE]		= "not_cable",
 	[IDH_PTYPE_PCABLE]		= "passive",
 	[IDH_PTYPE_ACABLE]		= "active",
+	[IDH_PTYPE_VPD]			= "vpd",
 };
 
 static struct usb_pd_identity *get_pd_identity(struct device *dev)
diff --git a/include/linux/usb/pd_vdo.h b/include/linux/usb/pd_vdo.h
index 8c08eeb9a74b..e9b6822c54c2 100644
--- a/include/linux/usb/pd_vdo.h
+++ b/include/linux/usb/pd_vdo.h
@@ -103,34 +103,46 @@
  * --------------------
  * <31>     :: data capable as a USB host
  * <30>     :: data capable as a USB device
- * <29:27>  :: product type (UFP / Cable)
+ * <29:27>  :: product type (UFP / Cable / VPD)
  * <26>     :: modal operation supported (1b == yes)
- * <25:16>  :: product type (DFP)
+ * <25:23>  :: product type (DFP) (SVDM version 2.0+ only; set to zero in version 1.0)
+ * <22:21>  :: connector type (SVDM version 2.0+ only; set to zero in version 1.0)
+ * <20:16>  :: Reserved, Shall be set to zero
  * <15:0>   :: USB-IF assigned VID for this cable vendor
  */
-#define IDH_PTYPE_UNDEF		0
+/* SOP Product Type (UFP) */
+#define IDH_PTYPE_NOT_UFP	0
 #define IDH_PTYPE_HUB		1
 #define IDH_PTYPE_PERIPH	2
 #define IDH_PTYPE_PSD		3
 #define IDH_PTYPE_AMA		5
 
+/* SOP' Product Type (Cable Plug / VPD) */
+#define IDH_PTYPE_NOT_CABLE	0
 #define IDH_PTYPE_PCABLE	3
 #define IDH_PTYPE_ACABLE	4
+#define IDH_PTYPE_VPD		6
 
-#define IDH_PTYPE_DFP_UNDEF	0
+/* SOP Product Type (DFP) */
+#define IDH_PTYPE_NOT_DFP	0
 #define IDH_PTYPE_DFP_HUB	1
 #define IDH_PTYPE_DFP_HOST	2
 #define IDH_PTYPE_DFP_PB	3
-#define IDH_PTYPE_DFP_AMC	4
 
-#define VDO_IDH(usbh, usbd, ptype, is_modal, vid)		\
-	((usbh) << 31 | (usbd) << 30 | ((ptype) & 0x7) << 27	\
-	 | (is_modal) << 26 | ((vid) & 0xffff))
+/* ID Header Mask */
+#define IDH_DFP_MASK		GENMASK(25, 23)
+#define IDH_CONN_MASK		GENMASK(22, 21)
+
+#define VDO_IDH(usbh, usbd, ufp_cable, is_modal, dfp, conn, vid)		\
+	((usbh) << 31 | (usbd) << 30 | ((ufp_cable) & 0x7) << 27		\
+	 | (is_modal) << 26 | ((dfp) & 0x7) << 23 | ((conn) & 0x3) << 21	\
+	 | ((vid) & 0xffff))
 
 #define PD_IDH_PTYPE(vdo)	(((vdo) >> 27) & 0x7)
 #define PD_IDH_VID(vdo)		((vdo) & 0xffff)
 #define PD_IDH_MODAL_SUPP(vdo)	((vdo) & (1 << 26))
 #define PD_IDH_DFP_PTYPE(vdo)	(((vdo) >> 23) & 0x7)
+#define PD_IDH_CONN_TYPE(vdo)	(((vdo) >> 21) & 0x3)
 
 /*
  * Cert Stat VDO
@@ -138,6 +150,7 @@
  * <31:0>  : USB-IF assigned XID for this cable
  */
 #define PD_CSTAT_XID(vdo)	(vdo)
+#define VDO_CERT(xid)		((xid) & 0xffffffff)
 
 /*
  * Product VDO
@@ -149,112 +162,267 @@
 #define PD_PRODUCT_PID(vdo)	(((vdo) >> 16) & 0xffff)
 
 /*
- * UFP VDO1
+ * UFP VDO (PD Revision 3.0+ only)
  * --------
  * <31:29> :: UFP VDO version
  * <28>    :: Reserved
  * <27:24> :: Device capability
- * <23:6>  :: Reserved
+ * <23:22> :: Connector type (10b == receptacle, 11b == captive plug)
+ * <21:11> :: Reserved
+ * <10:8>  :: Vconn power (AMA only)
+ * <7>     :: Vconn required (AMA only, 0b == no, 1b == yes)
+ * <6>     :: Vbus required (AMA only, 0b == yes, 1b == no)
  * <5:3>   :: Alternate modes
  * <2:0>   :: USB highest speed
  */
-#define PD_VDO1_UFP_DEVCAP(vdo)	(((vdo) & GENMASK(27, 24)) >> 24)
+#define PD_VDO_UFP_DEVCAP(vdo)	(((vdo) & GENMASK(27, 24)) >> 24)
+
+/* UFP VDO Version */
+#define UFP_VDO_VER1_2		2
 
+/* Device Capability */
 #define DEV_USB2_CAPABLE	BIT(0)
 #define DEV_USB2_BILLBOARD	BIT(1)
 #define DEV_USB3_CAPABLE	BIT(2)
 #define DEV_USB4_CAPABLE	BIT(3)
 
+/* Connector Type */
+#define UFP_RECEPTACLE		2
+#define UFP_CAPTIVE		3
+
+/* Vconn Power (AMA only, set to AMA_VCONN_NOT_REQ if Vconn is not required) */
+#define AMA_VCONN_PWR_1W	0
+#define AMA_VCONN_PWR_1W5	1
+#define AMA_VCONN_PWR_2W	2
+#define AMA_VCONN_PWR_3W	3
+#define AMA_VCONN_PWR_4W	4
+#define AMA_VCONN_PWR_5W	5
+#define AMA_VCONN_PWR_6W	6
+
+/* Vconn Required (AMA only) */
+#define AMA_VCONN_NOT_REQ	0
+#define AMA_VCONN_REQ		1
+
+/* Vbus Required (AMA only) */
+#define AMA_VBUS_REQ		0
+#define AMA_VBUS_NOT_REQ	1
+
+/* Alternate Modes */
+#define UFP_ALTMODE_NOT_SUPP	0
+#define UFP_ALTMODE_TBT3	BIT(0)
+#define UFP_ALTMODE_RECFG	BIT(1)
+#define UFP_ALTMODE_NO_RECFG	BIT(2)
+
+/* USB Highest Speed */
+#define UFP_USB2_ONLY		0
+#define UFP_USB32_GEN1		1
+#define UFP_USB32_4_GEN2	2
+#define UFP_USB4_GEN3		3
+
+#define VDO_UFP(ver, cap, conn, vcpwr, vcr, vbr, alt, spd)			\
+	(((ver) & 0x7) << 29 | ((cap) & 0xf) << 24 | ((conn) & 0x3) << 22	\
+	 | ((vcpwr) & 0x7) << 8 | (vcr) << 7 | (vbr) << 6 | ((alt) & 0x7) << 3	\
+	 | ((spd) & 0x7))
+
 /*
- * DFP VDO
+ * DFP VDO (PD Revision 3.0+ only)
  * --------
  * <31:29> :: DFP VDO version
  * <28:27> :: Reserved
  * <26:24> :: Host capability
- * <23:5>  :: Reserved
+ * <23:22> :: Connector type (10b == receptacle, 11b == captive plug)
+ * <21:5>  :: Reserved
  * <4:0>   :: Port number
  */
 #define PD_VDO_DFP_HOSTCAP(vdo)	(((vdo) & GENMASK(26, 24)) >> 24)
 
+#define DFP_VDO_VER1_1		1
 #define HOST_USB2_CAPABLE	BIT(0)
 #define HOST_USB3_CAPABLE	BIT(1)
 #define HOST_USB4_CAPABLE	BIT(2)
+#define DFP_RECEPTACLE		2
+#define DFP_CAPTIVE		3
+
+#define VDO_DFP(ver, cap, conn, pnum)						\
+	(((ver) & 0x7) << 29 | ((cap) & 0x7) << 24 | ((conn) & 0x3) << 22	\
+	 | ((pnum) & 0x1f))
 
 /*
- * Cable VDO
+ * Passive Cable VDO
  * ---------
  * <31:28> :: Cable HW version
  * <27:24> :: Cable FW version
- * <23:20> :: Reserved, Shall be set to zero
- * <19:18> :: type-C to Type-A/B/C/Captive (00b == A, 01 == B, 10 == C, 11 == Captive)
- * <17>    :: Type-C to Plug/Receptacle (0b == plug, 1b == receptacle)
+ * <23:21> :: VDO version
+ * <20>    :: Reserved, Shall be set to zero
+ * <19:18> :: Type-C to Type-C/Captive (10b == C, 11b == Captive)
+ * <17>    :: Reserved, Shall be set to zero
  * <16:13> :: cable latency (0001 == <10ns(~1m length))
- * <12:11> :: cable termination type (11b == both ends active VCONN req)
- * <10>    :: SSTX1 Directionality support (0b == fixed, 1b == cfgable)
- * <9>     :: SSTX2 Directionality support
- * <8>     :: SSRX1 Directionality support
- * <7>     :: SSRX2 Directionality support
- * <6:5>   :: Vbus current handling capability
+ * <12:11> :: cable termination type (10b == Vconn not req, 01b == Vconn req)
+ * <10:9>  :: Maximum Vbus voltage (00b == 20V, 01b == 30V, 10b == 40V, 11b == 50V)
+ * <8:7>   :: Reserved, Shall be set to zero
+ * <6:5>   :: Vbus current handling capability (01b == 3A, 10b == 5A)
+ * <4:3>   :: Reserved, Shall be set to zero
+ * <2:0>   :: USB highest speed
+ *
+ * Active Cable VDO 1
+ * ---------
+ * <31:28> :: Cable HW version
+ * <27:24> :: Cable FW version
+ * <23:21> :: VDO version
+ * <20>    :: Reserved, Shall be set to zero
+ * <19:18> :: Connector type (10b == C, 11b == Captive)
+ * <17>    :: Reserved, Shall be set to zero
+ * <16:13> :: cable latency (0001 == <10ns(~1m length))
+ * <12:11> :: cable termination type (10b == one end active, 11b == both ends active VCONN req)
+ * <10:9>  :: Maximum Vbus voltage (00b == 20V, 01b == 30V, 10b == 40V, 11b == 50V)
+ * <8>     :: SBU supported (0b == supported, 1b == not supported)
+ * <7>     :: SBU type (0b == passive, 1b == active)
+ * <6:5>   :: Vbus current handling capability (01b == 3A, 10b == 5A)
  * <4>     :: Vbus through cable (0b == no, 1b == yes)
  * <3>     :: SOP" controller present? (0b == no, 1b == yes)
- * <2:0>   :: USB SS Signaling support
+ * <2:0>   :: USB highest speed
  */
-#define CABLE_ATYPE		0
-#define CABLE_BTYPE		1
+/* Cable VDO Version */
+#define CABLE_VDO_VER1_0	0
+#define CABLE_VDO_VER1_3	3
+
+/* Connector Type */
 #define CABLE_CTYPE		2
 #define CABLE_CAPTIVE		3
-#define CABLE_PLUG		0
-#define CABLE_RECEPTACLE	1
-#define CABLE_CURR_1A5		0
+
+/* Cable Latency */
+#define CABLE_LATENCY_1M	1
+#define CABLE_LATENCY_2M	2
+#define CABLE_LATENCY_3M	3
+#define CABLE_LATENCY_4M	4
+#define CABLE_LATENCY_5M	5
+#define CABLE_LATENCY_6M	6
+#define CABLE_LATENCY_7M	7
+#define CABLE_LATENCY_7M_PLUS	8
+
+/* Cable Termination Type */
+#define PCABLE_VCONN_NOT_REQ	0
+#define PCABLE_VCONN_REQ	1
+#define ACABLE_ONE_END		2
+#define ACABLE_BOTH_END		3
+
+/* Maximum Vbus Voltage */
+#define CABLE_MAX_VBUS_20V	0
+#define CABLE_MAX_VBUS_30V	1
+#define CABLE_MAX_VBUS_40V	2
+#define CABLE_MAX_VBUS_50V	3
+
+/* Active Cable SBU Supported/Type */
+#define ACABLE_SBU_SUPP		0
+#define ACABLE_SBU_NOT_SUPP	1
+#define ACABLE_SBU_PASSIVE	0
+#define ACABLE_SBU_ACTIVE	1
+
+/* Vbus Current Handling Capability */
+#define CABLE_CURR_DEF		0
 #define CABLE_CURR_3A		1
 #define CABLE_CURR_5A		2
-#define CABLE_USBSS_U2_ONLY	0
-#define CABLE_USBSS_U31_GEN1	1
-#define CABLE_USBSS_U31_GEN2	2
-#define VDO_CABLE(hw, fw, cbl, gdr, lat, term, tx1d, tx2d, rx1d, rx2d, cur,\
-		  vps, sopp, usbss) \
-	(((hw) & 0x7) << 28 | ((fw) & 0x7) << 24 | ((cbl) & 0x3) << 18	\
-	 | (gdr) << 17 | ((lat) & 0x7) << 13 | ((term) & 0x3) << 11	\
-	 | (tx1d) << 10 | (tx2d) << 9 | (rx1d) << 8 | (rx2d) << 7	\
-	 | ((cur) & 0x3) << 5 | (vps) << 4 | (sopp) << 3		\
-	 | ((usbss) & 0x7))
+
+/* USB Highest Speed */
+#define CABLE_USB2_ONLY		0
+#define CABLE_USB32_GEN1	1
+#define CABLE_USB32_4_GEN2	2
+#define CABLE_USB4_GEN3		3
+
+#define VDO_PCABLE(hw, fw, ver, conn, lat, term, vbm, cur, spd)			\
+	(((hw) & 0xf) << 28 | ((fw) & 0xf) << 24 | ((ver) & 0x7) << 21		\
+	 | ((conn) & 0x3) << 18 | ((lat) & 0xf) << 13 | ((term) & 0x3) << 11	\
+	 | ((vbm) & 0x3) << 9 | ((cur) & 0x3) << 5 | ((spd) & 0x7))
+#define VDO_ACABLE1(hw, fw, ver, conn, lat, term, vbm, sbu, sbut, cur, vbt, sopp, spd) \
+	(((hw) & 0xf) << 28 | ((fw) & 0xf) << 24 | ((ver) & 0x7) << 21		\
+	 | ((conn) & 0x3) << 18	| ((lat) & 0xf) << 13 | ((term) & 0x3) << 11	\
+	 | ((vbm) & 0x3) << 9 | (sbu) << 8 | (sbut) << 7 | ((cur) & 0x3) << 5	\
+	 | (vbt) << 4 | (sopp) << 3 | ((spd) & 0x7))
+
 #define VDO_TYPEC_CABLE_TYPE(vdo)	(((vdo) >> 18) & 0x3)
 
 /*
- * AMA VDO
+ * Active Cable VDO 2
  * ---------
- * <31:28> :: Cable HW version
- * <27:24> :: Cable FW version
- * <23:12> :: Reserved, Shall be set to zero
- * <11>    :: SSTX1 Directionality support (0b == fixed, 1b == cfgable)
- * <10>    :: SSTX2 Directionality support
- * <9>     :: SSRX1 Directionality support
- * <8>     :: SSRX2 Directionality support
- * <7:5>   :: Vconn power
- * <4>     :: Vconn power required
- * <3>     :: Vbus power required
- * <2:0>   :: USB SS Signaling support
+ * <31:24> :: Maximum operating temperature
+ * <23:16> :: Shutdown temperature
+ * <15>    :: Reserved, Shall be set to zero
+ * <14:12> :: U3/CLd power
+ * <11>    :: U3 to U0 transition mode (0b == direct, 1b == through U3S)
+ * <10>    :: Physical connection (0b == copper, 1b == optical)
+ * <9>     :: Active element (0b == redriver, 1b == retimer)
+ * <8>     :: USB4 supported (0b == yes, 1b == no)
+ * <7:6>   :: USB2 hub hops consumed
+ * <5>     :: USB2 supported (0b == yes, 1b == no)
+ * <4>     :: USB3.2 supported (0b == yes, 1b == no)
+ * <3>     :: USB lanes supported (0b == one lane, 1b == two lanes)
+ * <2>     :: Optically isolated active cable (0b == no, 1b == yes)
+ * <1>     :: Reserved, Shall be set to zero
+ * <0>     :: USB gen (0b == gen1, 1b == gen2+)
  */
-#define VDO_AMA(hw, fw, tx1d, tx2d, rx1d, rx2d, vcpwr, vcr, vbr, usbss) \
-	(((hw) & 0x7) << 28 | ((fw) & 0x7) << 24			\
-	 | (tx1d) << 11 | (tx2d) << 10 | (rx1d) << 9 | (rx2d) << 8	\
-	 | ((vcpwr) & 0x7) << 5 | (vcr) << 4 | (vbr) << 3		\
-	 | ((usbss) & 0x7))
-
-#define PD_VDO_AMA_VCONN_REQ(vdo)	(((vdo) >> 4) & 1)
-#define PD_VDO_AMA_VBUS_REQ(vdo)	(((vdo) >> 3) & 1)
+/* U3/CLd Power*/
+#define ACAB2_U3_CLD_10MW_PLUS	0
+#define ACAB2_U3_CLD_10MW	1
+#define ACAB2_U3_CLD_5MW	2
+#define ACAB2_U3_CLD_1MW	3
+#define ACAB2_U3_CLD_500UW	4
+#define ACAB2_U3_CLD_200UW	5
+#define ACAB2_U3_CLD_50UW	6
+
+/* Other Active Cable VDO 2 Fields */
+#define ACAB2_U3U0_DIRECT	0
+#define ACAB2_U3U0_U3S		1
+#define ACAB2_PHY_COPPER	0
+#define ACAB2_PHY_OPTICAL	1
+#define ACAB2_REDRIVER		0
+#define ACAB2_RETIMER		1
+#define ACAB2_USB4_SUPP		0
+#define ACAB2_USB4_NOT_SUPP	1
+#define ACAB2_USB2_SUPP		0
+#define ACAB2_USB2_NOT_SUPP	1
+#define ACAB2_USB32_SUPP	0
+#define ACAB2_USB32_NOT_SUPP	1
+#define ACAB2_LANES_ONE		0
+#define ACAB2_LANES_TWO		1
+#define ACAB2_OPT_ISO_NO	0
+#define ACAB2_OPT_ISO_YES	1
+#define ACAB2_GEN_1		0
+#define ACAB2_GEN_2_PLUS	1
+
+#define VDO_ACABLE2(mtemp, stemp, u3p, trans, phy, ele, u4, hops, u2, u32, lane, iso, gen)	\
+	(((mtemp) & 0xff) << 24 | ((stemp) & 0xff) << 16 | ((u3p) & 0x7) << 12	\
+	 | (trans) << 11 | (phy) << 10 | (ele) << 9 | (u4) << 8			\
+	 | ((hops) & 0x3) << 6 | (u2) << 5 | (u32) << 4 | (lane) << 3		\
+	 | (iso) << 2 | (gen))
 
-#define AMA_VCONN_PWR_1W	0
-#define AMA_VCONN_PWR_1W5	1
-#define AMA_VCONN_PWR_2W	2
-#define AMA_VCONN_PWR_3W	3
-#define AMA_VCONN_PWR_4W	4
-#define AMA_VCONN_PWR_5W	5
-#define AMA_VCONN_PWR_6W	6
-#define AMA_USBSS_U2_ONLY	0
-#define AMA_USBSS_U31_GEN1	1
-#define AMA_USBSS_U31_GEN2	2
-#define AMA_USBSS_BBONLY	3
+/*
+ * VPD VDO
+ * ---------
+ * <31:28> :: HW version
+ * <27:24> :: FW version
+ * <23:21> :: VDO version
+ * <20:17> :: Reserved, Shall be set to zero
+ * <16:15> :: Maximum Vbus voltage (00b == 20V, 01b == 30V, 10b == 40V, 11b == 50V)
+ * <14>    :: Charge through current support (0b == 3A, 1b == 5A)
+ * <13>    :: Reserved, Shall be set to zero
+ * <12:7>  :: Vbus impedance
+ * <6:1>   :: Ground impedance
+ * <0>     :: Charge through support (0b == no, 1b == yes)
+ */
+#define VPD_VDO_VER1_0		0
+#define VPD_MAX_VBUS_20V	0
+#define VPD_MAX_VBUS_30V	1
+#define VPD_MAX_VBUS_40V	2
+#define VPD_MAX_VBUS_50V	3
+#define VPDCT_CURR_3A		0
+#define VPDCT_CURR_5A		1
+#define VPDCT_NOT_SUPP		0
+#define VPDCT_SUPP		1
+
+#define VDO_VPD(hw, fw, ver, vbm, curr, vbi, gi, ct)			\
+	(((hw) & 0xf) << 28 | ((fw) & 0xf) << 24 | ((ver) & 0x7) << 21	\
+	 | ((vbm) & 0x3) << 15 | (curr) << 14 | ((vbi) & 0x3f) << 7	\
+	 | ((gi) & 0x3f) << 1 | (ct))
 
 /*
  * SVDM Discover SVIDs request -> response
-- 
cgit v1.2.3


From 01531ac354051667f80d23bb85bf2643ae11260a Mon Sep 17 00:00:00 2001
From: BingJing Chang <bingjingc@synology.com>
Date: Fri, 29 Jan 2021 12:52:42 +0800
Subject: parser: add unsigned int parser

Will be used by fs parsing options

Link: https://lore.kernel.org/r/20210129045242.10268-1-bingjingc@synology.com
Reviewed-by: Robbie Ko<robbieko@synology.com>
Reviewed-by: Chung-Chiang Cheng <cccheng@synology.com>
Signed-off-by: BingJing Chang <bingjingc@synology.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/parser.h |  1 +
 lib/parser.c           | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/parser.h b/include/linux/parser.h
index 89e2b23fb888..dd79f45a37b8 100644
--- a/include/linux/parser.h
+++ b/include/linux/parser.h
@@ -29,6 +29,7 @@ typedef struct {
 
 int match_token(char *, const match_table_t table, substring_t args[]);
 int match_int(substring_t *, int *result);
+int match_uint(substring_t *s, unsigned int *result);
 int match_u64(substring_t *, u64 *result);
 int match_octal(substring_t *, int *result);
 int match_hex(substring_t *, int *result);
diff --git a/lib/parser.c b/lib/parser.c
index f5b3e5d7a7f9..7785e41fa5eb 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -188,6 +188,28 @@ int match_int(substring_t *s, int *result)
 }
 EXPORT_SYMBOL(match_int);
 
+/*
+ * match_uint - scan a decimal representation of an integer from a substring_t
+ * @s: substring_t to be scanned
+ * @result: resulting integer on success
+ *
+ * Description: Attempts to parse the &substring_t @s as a decimal integer. On
+ * success, sets @result to the integer represented by the string and returns 0.
+ * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ */
+int match_uint(substring_t *s, unsigned int *result)
+{
+	int err = -ENOMEM;
+	char *buf = match_strdup(s);
+
+	if (buf) {
+		err = kstrtouint(buf, 10, result);
+		kfree(buf);
+	}
+	return err;
+}
+EXPORT_SYMBOL(match_uint);
+
 /**
  * match_u64: - scan a decimal representation of a u64 from
  *                  a substring_t
-- 
cgit v1.2.3


From 4f4e54366eae20d5867864001db57c5d90693d8c Mon Sep 17 00:00:00 2001
From: Emil Renner Berthing <kernel@esmil.dk>
Date: Sun, 31 Jan 2021 00:46:37 +0100
Subject: net: usb: cdc_ncm: use new API for bh tasklet

This converts the driver to use the new tasklet API introduced in
commit 12cc923f1ccc ("tasklet: Introduce new initialization API")

It is unfortunate that we need to add a pointer to the driver context to
get back to the usbnet device, but the space will be reclaimed once
there are no more users of the old API left and we can remove the data
value and flag from the tasklet struct.

Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
Link: https://lore.kernel.org/r/20210130234637.26505-1-kernel@esmil.dk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/usb/cdc_ncm.c   | 12 +++++++-----
 include/linux/usb/cdc_ncm.h |  2 ++
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 291e76d32abe..4087c9e33781 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -61,7 +61,7 @@ static bool prefer_mbim;
 module_param(prefer_mbim, bool, 0644);
 MODULE_PARM_DESC(prefer_mbim, "Prefer MBIM setting on dual NCM/MBIM functions");
 
-static void cdc_ncm_txpath_bh(unsigned long param);
+static void cdc_ncm_txpath_bh(struct tasklet_struct *t);
 static void cdc_ncm_tx_timeout_start(struct cdc_ncm_ctx *ctx);
 static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *hr_timer);
 static struct usb_driver cdc_ncm_driver;
@@ -813,9 +813,11 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_
 	if (!ctx)
 		return -ENOMEM;
 
+	ctx->dev = dev;
+
 	hrtimer_init(&ctx->tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	ctx->tx_timer.function = &cdc_ncm_tx_timer_cb;
-	tasklet_init(&ctx->bh, cdc_ncm_txpath_bh, (unsigned long)dev);
+	tasklet_setup(&ctx->bh, cdc_ncm_txpath_bh);
 	atomic_set(&ctx->stop, 0);
 	spin_lock_init(&ctx->mtx);
 
@@ -1472,10 +1474,10 @@ static enum hrtimer_restart cdc_ncm_tx_timer_cb(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static void cdc_ncm_txpath_bh(unsigned long param)
+static void cdc_ncm_txpath_bh(struct tasklet_struct *t)
 {
-	struct usbnet *dev = (struct usbnet *)param;
-	struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0];
+	struct cdc_ncm_ctx *ctx = from_tasklet(ctx, t, bh);
+	struct usbnet *dev = ctx->dev;
 
 	spin_lock_bh(&ctx->mtx);
 	if (ctx->tx_timer_pending != 0) {
diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h
index 0ce4377545f8..f7cb3ddce7fb 100644
--- a/include/linux/usb/cdc_ncm.h
+++ b/include/linux/usb/cdc_ncm.h
@@ -98,6 +98,8 @@ struct cdc_ncm_ctx {
 	struct hrtimer tx_timer;
 	struct tasklet_struct bh;
 
+	struct usbnet *dev;
+
 	const struct usb_cdc_ncm_desc *func_desc;
 	const struct usb_cdc_mbim_desc *mbim_desc;
 	const struct usb_cdc_mbim_extended_desc *mbim_extended_desc;
-- 
cgit v1.2.3


From 012ce4dd3102a0f4d80167de343e9d44b257c1b8 Mon Sep 17 00:00:00 2001
From: Danielle Ratson <danieller@nvidia.com>
Date: Tue, 2 Feb 2021 20:06:06 +0200
Subject: ethtool: Extend link modes settings uAPI with lanes

Currently, when auto negotiation is on, the user can advertise all the
linkmodes which correspond to a specific speed, but does not have a
similar selector for the number of lanes. This is significant when a
specific speed can be achieved using different number of lanes.  For
example, 2x50 or 4x25.

Add 'ETHTOOL_A_LINKMODES_LANES' attribute and expand 'struct
ethtool_link_settings' with lanes field in order to implement a new
lanes-selector that will enable the user to advertise a specific number
of lanes as well.

When auto negotiation is off, lanes parameter can be forced only if the
driver supports it. Add a capability bit in 'struct ethtool_ops' that
allows ethtool know if the driver can handle the lanes parameter when
auto negotiation is off, so if it does not, an error message will be
returned when trying to set lanes.

Example:

$ ethtool -s swp1 lanes 4
$ ethtool swp1
  Settings for swp1:
	Supported ports: [ FIBRE ]
        Supported link modes:   1000baseKX/Full
                                10000baseKR/Full
                                40000baseCR4/Full
				40000baseSR4/Full
				40000baseLR4/Full
                                25000baseCR/Full
                                25000baseSR/Full
				50000baseCR2/Full
                                100000baseSR4/Full
				100000baseCR4/Full
        Supported pause frame use: Symmetric Receive-only
        Supports auto-negotiation: Yes
        Supported FEC modes: Not reported
        Advertised link modes:  40000baseCR4/Full
				40000baseSR4/Full
				40000baseLR4/Full
                                100000baseSR4/Full
				100000baseCR4/Full
        Advertised pause frame use: No
        Advertised auto-negotiation: Yes
        Advertised FEC modes: Not reported
        Speed: Unknown!
        Duplex: Unknown! (255)
        Auto-negotiation: on
        Port: Direct Attach Copper
        PHYAD: 0
        Transceiver: internal
        Link detected: no

Signed-off-by: Danielle Ratson <danieller@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ethtool-netlink.rst | 11 ++--
 include/linux/ethtool.h                      |  4 ++
 include/uapi/linux/ethtool_netlink.h         |  1 +
 net/ethtool/linkmodes.c                      | 96 +++++++++++++++++++++++-----
 net/ethtool/netlink.h                        |  2 +-
 5 files changed, 93 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 30b98245979f..05073482db05 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -431,16 +431,17 @@ Request contents:
   ``ETHTOOL_A_LINKMODES_SPEED``               u32     link speed (Mb/s)
   ``ETHTOOL_A_LINKMODES_DUPLEX``              u8      duplex mode
   ``ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG``    u8      Master/slave port mode
+  ``ETHTOOL_A_LINKMODES_LANES``               u32     lanes
   ==========================================  ======  ==========================
 
 ``ETHTOOL_A_LINKMODES_OURS`` bit set allows setting advertised link modes. If
 autonegotiation is on (either set now or kept from before), advertised modes
 are not changed (no ``ETHTOOL_A_LINKMODES_OURS`` attribute) and at least one
-of speed and duplex is specified, kernel adjusts advertised modes to all
-supported modes matching speed, duplex or both (whatever is specified). This
-autoselection is done on ethtool side with ioctl interface, netlink interface
-is supposed to allow requesting changes without knowing what exactly kernel
-supports.
+of speed, duplex and lanes is specified, kernel adjusts advertised modes to all
+supported modes matching speed, duplex, lanes or all (whatever is specified).
+This autoselection is done on ethtool side with ioctl interface, netlink
+interface is supposed to allow requesting changes without knowing what exactly
+kernel supports.
 
 
 LINKSTATE_GET
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index e3da25b51ae4..1ab13c5dfb2f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -128,6 +128,7 @@ struct ethtool_link_ksettings {
 		__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
 		__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising);
 	} link_modes;
+	u32	lanes;
 };
 
 /**
@@ -265,6 +266,8 @@ struct ethtool_pause_stats {
 
 /**
  * struct ethtool_ops - optional netdev operations
+ * @cap_link_lanes_supported: indicates if the driver supports lanes
+ *	parameter.
  * @supported_coalesce_params: supported types of interrupt coalescing.
  * @get_drvinfo: Report driver/device information.  Should only set the
  *	@driver, @version, @fw_version and @bus_info fields.  If not
@@ -420,6 +423,7 @@ struct ethtool_pause_stats {
  * of the generic netdev features interface.
  */
 struct ethtool_ops {
+	u32     cap_link_lanes_supported:1;
 	u32	supported_coalesce_params;
 	void	(*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
 	int	(*get_regs_len)(struct net_device *);
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index e2bf36e6964b..a286635ac9b8 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -227,6 +227,7 @@ enum {
 	ETHTOOL_A_LINKMODES_DUPLEX,		/* u8 */
 	ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG,	/* u8 */
 	ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE,	/* u8 */
+	ETHTOOL_A_LINKMODES_LANES,		/* u32 */
 
 	/* add new constants above here */
 	__ETHTOOL_A_LINKMODES_CNT,
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
index bb8a3351fb72..db3e31fc6709 100644
--- a/net/ethtool/linkmodes.c
+++ b/net/ethtool/linkmodes.c
@@ -152,12 +152,47 @@ const struct ethnl_request_ops ethnl_linkmodes_request_ops = {
 
 struct link_mode_info {
 	int				speed;
+	u8				lanes;
 	u8				duplex;
 };
 
-#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \
-	[ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \
-		.speed	= SPEED_ ## _speed, \
+#define __LINK_MODE_LANES_CR		1
+#define __LINK_MODE_LANES_CR2		2
+#define __LINK_MODE_LANES_CR4		4
+#define __LINK_MODE_LANES_CR8		8
+#define __LINK_MODE_LANES_DR		1
+#define __LINK_MODE_LANES_DR2		2
+#define __LINK_MODE_LANES_DR4		4
+#define __LINK_MODE_LANES_DR8		8
+#define __LINK_MODE_LANES_KR		1
+#define __LINK_MODE_LANES_KR2		2
+#define __LINK_MODE_LANES_KR4		4
+#define __LINK_MODE_LANES_KR8		8
+#define __LINK_MODE_LANES_SR		1
+#define __LINK_MODE_LANES_SR2		2
+#define __LINK_MODE_LANES_SR4		4
+#define __LINK_MODE_LANES_SR8		8
+#define __LINK_MODE_LANES_ER		1
+#define __LINK_MODE_LANES_KX		1
+#define __LINK_MODE_LANES_KX4		4
+#define __LINK_MODE_LANES_LR		1
+#define __LINK_MODE_LANES_LR4		4
+#define __LINK_MODE_LANES_LR4_ER4	4
+#define __LINK_MODE_LANES_LR_ER_FR	1
+#define __LINK_MODE_LANES_LR2_ER2_FR2	2
+#define __LINK_MODE_LANES_LR4_ER4_FR4	4
+#define __LINK_MODE_LANES_LR8_ER8_FR8	8
+#define __LINK_MODE_LANES_LRM		1
+#define __LINK_MODE_LANES_MLD2		2
+#define __LINK_MODE_LANES_T		1
+#define __LINK_MODE_LANES_T1		1
+#define __LINK_MODE_LANES_X		1
+#define __LINK_MODE_LANES_FX		1
+
+#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex)	\
+	[ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = {		\
+		.speed  = SPEED_ ## _speed, \
+		.lanes  = __LINK_MODE_LANES_ ## _type, \
 		.duplex	= __DUPLEX_ ## _duplex \
 	}
 #define __DUPLEX_Half DUPLEX_HALF
@@ -165,6 +200,7 @@ struct link_mode_info {
 #define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \
 	[ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \
 		.speed	= SPEED_UNKNOWN, \
+		.lanes	= 0, \
 		.duplex	= DUPLEX_UNKNOWN, \
 	}
 
@@ -274,16 +310,17 @@ const struct nla_policy ethnl_linkmodes_set_policy[] = {
 	[ETHTOOL_A_LINKMODES_SPEED]		= { .type = NLA_U32 },
 	[ETHTOOL_A_LINKMODES_DUPLEX]		= { .type = NLA_U8 },
 	[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]	= { .type = NLA_U8 },
+	[ETHTOOL_A_LINKMODES_LANES]		= NLA_POLICY_RANGE(NLA_U32, 1, 8),
 };
 
-/* Set advertised link modes to all supported modes matching requested speed
- * and duplex values. Called when autonegotiation is on, speed or duplex is
- * requested but no link mode change. This is done in userspace with ioctl()
- * interface, move it into kernel for netlink.
+/* Set advertised link modes to all supported modes matching requested speed,
+ * lanes and duplex values. Called when autonegotiation is on, speed, lanes or
+ * duplex is requested but no link mode change. This is done in userspace with
+ * ioctl() interface, move it into kernel for netlink.
  * Returns true if advertised modes bitmap was modified.
  */
 static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings,
-				 bool req_speed, bool req_duplex)
+				 bool req_speed, bool req_lanes, bool req_duplex)
 {
 	unsigned long *advertising = ksettings->link_modes.advertising;
 	unsigned long *supported = ksettings->link_modes.supported;
@@ -302,6 +339,7 @@ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings,
 			continue;
 		if (test_bit(i, supported) &&
 		    (!req_speed || info->speed == ksettings->base.speed) &&
+		    (!req_lanes || info->lanes == ksettings->lanes) &&
 		    (!req_duplex || info->duplex == ksettings->base.duplex))
 			set_bit(i, advertising);
 		else
@@ -327,7 +365,7 @@ static bool ethnl_validate_master_slave_cfg(u8 cfg)
 
 static int ethnl_check_linkmodes(struct genl_info *info, struct nlattr **tb)
 {
-	const struct nlattr *master_slave_cfg;
+	const struct nlattr *master_slave_cfg, *lanes_cfg;
 
 	master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG];
 	if (master_slave_cfg &&
@@ -337,16 +375,23 @@ static int ethnl_check_linkmodes(struct genl_info *info, struct nlattr **tb)
 		return -EOPNOTSUPP;
 	}
 
+	lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES];
+	if (lanes_cfg && !is_power_of_2(nla_get_u32(lanes_cfg))) {
+		NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg,
+				    "lanes value is invalid");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
 static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb,
 				  struct ethtool_link_ksettings *ksettings,
-				  bool *mod)
+				  bool *mod, const struct net_device *dev)
 {
 	struct ethtool_link_settings *lsettings = &ksettings->base;
-	bool req_speed, req_duplex;
-	const struct nlattr *master_slave_cfg;
+	bool req_speed, req_lanes, req_duplex;
+	const struct nlattr *master_slave_cfg, *lanes_cfg;
 	int ret;
 
 	master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG];
@@ -360,10 +405,30 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb,
 
 	*mod = false;
 	req_speed = tb[ETHTOOL_A_LINKMODES_SPEED];
+	req_lanes = tb[ETHTOOL_A_LINKMODES_LANES];
 	req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX];
 
 	ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG],
 			mod);
+
+	lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES];
+	if (lanes_cfg) {
+		/* If autoneg is off and lanes parameter is not supported by the
+		 * driver, return an error.
+		 */
+		if (!lsettings->autoneg &&
+		    !dev->ethtool_ops->cap_link_lanes_supported) {
+			NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg,
+					    "lanes configuration not supported by device");
+			return -EOPNOTSUPP;
+		}
+	} else if (!lsettings->autoneg) {
+		/* If autoneg is off and lanes parameter is not passed from user,
+		 * set the lanes parameter to 0.
+		 */
+		ksettings->lanes = 0;
+	}
+
 	ret = ethnl_update_bitset(ksettings->link_modes.advertising,
 				  __ETHTOOL_LINK_MODE_MASK_NBITS,
 				  tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names,
@@ -372,13 +437,14 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb,
 		return ret;
 	ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED],
 			 mod);
+	ethnl_update_u32(&ksettings->lanes, lanes_cfg, mod);
 	ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX],
 			mod);
 	ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod);
 
 	if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg &&
-	    (req_speed || req_duplex) &&
-	    ethnl_auto_linkmodes(ksettings, req_speed, req_duplex))
+	    (req_speed || req_lanes || req_duplex) &&
+	    ethnl_auto_linkmodes(ksettings, req_speed, req_lanes, req_duplex))
 		*mod = true;
 
 	return 0;
@@ -420,7 +486,7 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info)
 		goto out_ops;
 	}
 
-	ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod);
+	ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod, dev);
 	if (ret < 0)
 		goto out_ops;
 
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index d8efec516d86..6eabd58d81bf 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -351,7 +351,7 @@ extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_O
 extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1];
 extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1];
 extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1];
-extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG + 1];
+extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_LANES + 1];
 extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1];
 extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1];
 extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1];
-- 
cgit v1.2.3


From c8907043c6ac9ed58e6c1a76f2824be714b42228 Mon Sep 17 00:00:00 2001
From: Danielle Ratson <danieller@nvidia.com>
Date: Tue, 2 Feb 2021 20:06:07 +0200
Subject: ethtool: Get link mode in use instead of speed and duplex parameters

Currently, when user space queries the link's parameters, as speed and
duplex, each parameter is passed from the driver to ethtool.

Instead, get the link mode bit in use, and derive each of the parameters
from it in ethtool.

Signed-off-by: Danielle Ratson <danieller@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h |   1 +
 net/ethtool/common.c    | 147 +++++++++++++++++++++++++++++++++++++++++++++
 net/ethtool/common.h    |   7 +++
 net/ethtool/ioctl.c     |  18 +++++-
 net/ethtool/linkmodes.c | 157 +-----------------------------------------------
 5 files changed, 174 insertions(+), 156 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 1ab13c5dfb2f..ec4cd3921c67 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -129,6 +129,7 @@ struct ethtool_link_ksettings {
 		__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising);
 	} link_modes;
 	u32	lanes;
+	enum ethtool_link_mode_bit_indices link_mode;
 };
 
 /**
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 181220101a6e..835b9bba3e7e 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -198,6 +198,153 @@ const char link_mode_names[][ETH_GSTRING_LEN] = {
 };
 static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
 
+#define __LINK_MODE_LANES_CR		1
+#define __LINK_MODE_LANES_CR2		2
+#define __LINK_MODE_LANES_CR4		4
+#define __LINK_MODE_LANES_CR8		8
+#define __LINK_MODE_LANES_DR		1
+#define __LINK_MODE_LANES_DR2		2
+#define __LINK_MODE_LANES_DR4		4
+#define __LINK_MODE_LANES_DR8		8
+#define __LINK_MODE_LANES_KR		1
+#define __LINK_MODE_LANES_KR2		2
+#define __LINK_MODE_LANES_KR4		4
+#define __LINK_MODE_LANES_KR8		8
+#define __LINK_MODE_LANES_SR		1
+#define __LINK_MODE_LANES_SR2		2
+#define __LINK_MODE_LANES_SR4		4
+#define __LINK_MODE_LANES_SR8		8
+#define __LINK_MODE_LANES_ER		1
+#define __LINK_MODE_LANES_KX		1
+#define __LINK_MODE_LANES_KX4		4
+#define __LINK_MODE_LANES_LR		1
+#define __LINK_MODE_LANES_LR4		4
+#define __LINK_MODE_LANES_LR4_ER4	4
+#define __LINK_MODE_LANES_LR_ER_FR	1
+#define __LINK_MODE_LANES_LR2_ER2_FR2	2
+#define __LINK_MODE_LANES_LR4_ER4_FR4	4
+#define __LINK_MODE_LANES_LR8_ER8_FR8	8
+#define __LINK_MODE_LANES_LRM		1
+#define __LINK_MODE_LANES_MLD2		2
+#define __LINK_MODE_LANES_T		1
+#define __LINK_MODE_LANES_T1		1
+#define __LINK_MODE_LANES_X		1
+#define __LINK_MODE_LANES_FX		1
+
+#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex)	\
+	[ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = {		\
+		.speed  = SPEED_ ## _speed, \
+		.lanes  = __LINK_MODE_LANES_ ## _type, \
+		.duplex	= __DUPLEX_ ## _duplex \
+	}
+#define __DUPLEX_Half DUPLEX_HALF
+#define __DUPLEX_Full DUPLEX_FULL
+#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \
+	[ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \
+		.speed	= SPEED_UNKNOWN, \
+		.lanes	= 0, \
+		.duplex	= DUPLEX_UNKNOWN, \
+	}
+
+const struct link_mode_info link_mode_params[] = {
+	__DEFINE_LINK_MODE_PARAMS(10, T, Half),
+	__DEFINE_LINK_MODE_PARAMS(10, T, Full),
+	__DEFINE_LINK_MODE_PARAMS(100, T, Half),
+	__DEFINE_LINK_MODE_PARAMS(100, T, Full),
+	__DEFINE_LINK_MODE_PARAMS(1000, T, Half),
+	__DEFINE_LINK_MODE_PARAMS(1000, T, Full),
+	__DEFINE_SPECIAL_MODE_PARAMS(Autoneg),
+	__DEFINE_SPECIAL_MODE_PARAMS(TP),
+	__DEFINE_SPECIAL_MODE_PARAMS(AUI),
+	__DEFINE_SPECIAL_MODE_PARAMS(MII),
+	__DEFINE_SPECIAL_MODE_PARAMS(FIBRE),
+	__DEFINE_SPECIAL_MODE_PARAMS(BNC),
+	__DEFINE_LINK_MODE_PARAMS(10000, T, Full),
+	__DEFINE_SPECIAL_MODE_PARAMS(Pause),
+	__DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause),
+	__DEFINE_LINK_MODE_PARAMS(2500, X, Full),
+	__DEFINE_SPECIAL_MODE_PARAMS(Backplane),
+	__DEFINE_LINK_MODE_PARAMS(1000, KX, Full),
+	__DEFINE_LINK_MODE_PARAMS(10000, KX4, Full),
+	__DEFINE_LINK_MODE_PARAMS(10000, KR, Full),
+	[ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = {
+		.speed	= SPEED_10000,
+		.duplex = DUPLEX_FULL,
+	},
+	__DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full),
+	__DEFINE_LINK_MODE_PARAMS(20000, KR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(40000, KR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(40000, CR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(40000, SR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(40000, LR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(56000, KR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(56000, CR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(56000, SR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(56000, LR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(25000, CR, Full),
+	__DEFINE_LINK_MODE_PARAMS(25000, KR, Full),
+	__DEFINE_LINK_MODE_PARAMS(25000, SR, Full),
+	__DEFINE_LINK_MODE_PARAMS(50000, CR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(50000, KR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, KR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, SR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, CR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full),
+	__DEFINE_LINK_MODE_PARAMS(50000, SR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(1000, X, Full),
+	__DEFINE_LINK_MODE_PARAMS(10000, CR, Full),
+	__DEFINE_LINK_MODE_PARAMS(10000, SR, Full),
+	__DEFINE_LINK_MODE_PARAMS(10000, LR, Full),
+	__DEFINE_LINK_MODE_PARAMS(10000, LRM, Full),
+	__DEFINE_LINK_MODE_PARAMS(10000, ER, Full),
+	__DEFINE_LINK_MODE_PARAMS(2500, T, Full),
+	__DEFINE_LINK_MODE_PARAMS(5000, T, Full),
+	__DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE),
+	__DEFINE_SPECIAL_MODE_PARAMS(FEC_RS),
+	__DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER),
+	__DEFINE_LINK_MODE_PARAMS(50000, KR, Full),
+	__DEFINE_LINK_MODE_PARAMS(50000, SR, Full),
+	__DEFINE_LINK_MODE_PARAMS(50000, CR, Full),
+	__DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full),
+	__DEFINE_LINK_MODE_PARAMS(50000, DR, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, KR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, SR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, CR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, DR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(200000, KR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(200000, SR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(200000, DR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(200000, CR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(100, T1, Full),
+	__DEFINE_LINK_MODE_PARAMS(1000, T1, Full),
+	__DEFINE_LINK_MODE_PARAMS(400000, KR8, Full),
+	__DEFINE_LINK_MODE_PARAMS(400000, SR8, Full),
+	__DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full),
+	__DEFINE_LINK_MODE_PARAMS(400000, DR8, Full),
+	__DEFINE_LINK_MODE_PARAMS(400000, CR8, Full),
+	__DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS),
+	__DEFINE_LINK_MODE_PARAMS(100000, KR, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, SR, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, DR, Full),
+	__DEFINE_LINK_MODE_PARAMS(100000, CR, Full),
+	__DEFINE_LINK_MODE_PARAMS(200000, KR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(200000, SR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(200000, DR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(200000, CR2, Full),
+	__DEFINE_LINK_MODE_PARAMS(400000, KR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(400000, SR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(400000, DR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(400000, CR4, Full),
+	__DEFINE_LINK_MODE_PARAMS(100, FX, Half),
+	__DEFINE_LINK_MODE_PARAMS(100, FX, Full),
+};
+static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS);
+
 const char netif_msg_class_names[][ETH_GSTRING_LEN] = {
 	[NETIF_MSG_DRV_BIT]		= "drv",
 	[NETIF_MSG_PROBE_BIT]		= "probe",
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
index 3d9251c95a8b..a9d071248698 100644
--- a/net/ethtool/common.h
+++ b/net/ethtool/common.h
@@ -14,6 +14,12 @@
 
 #define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1)
 
+struct link_mode_info {
+	int				speed;
+	u8				lanes;
+	u8				duplex;
+};
+
 extern const char
 netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN];
 extern const char
@@ -23,6 +29,7 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN];
 extern const char
 phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN];
 extern const char link_mode_names[][ETH_GSTRING_LEN];
+extern const struct link_mode_info link_mode_params[];
 extern const char netif_msg_class_names[][ETH_GSTRING_LEN];
 extern const char wol_mode_names[][ETH_GSTRING_LEN];
 extern const char sof_timestamping_names[][ETH_GSTRING_LEN];
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 771688e1b0da..24783b71c584 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -426,13 +426,29 @@ struct ethtool_link_usettings {
 int __ethtool_get_link_ksettings(struct net_device *dev,
 				 struct ethtool_link_ksettings *link_ksettings)
 {
+	const struct link_mode_info *link_info;
+	int err;
+
 	ASSERT_RTNL();
 
 	if (!dev->ethtool_ops->get_link_ksettings)
 		return -EOPNOTSUPP;
 
 	memset(link_ksettings, 0, sizeof(*link_ksettings));
-	return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings);
+
+	link_ksettings->link_mode = -1;
+	err = dev->ethtool_ops->get_link_ksettings(dev, link_ksettings);
+	if (err)
+		return err;
+
+	if (link_ksettings->link_mode != -1) {
+		link_info = &link_mode_params[link_ksettings->link_mode];
+		link_ksettings->base.speed = link_info->speed;
+		link_ksettings->lanes = link_info->lanes;
+		link_ksettings->base.duplex = link_info->duplex;
+	}
+
+	return 0;
 }
 EXPORT_SYMBOL(__ethtool_get_link_ksettings);
 
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
index db3e31fc6709..fc986d035b01 100644
--- a/net/ethtool/linkmodes.c
+++ b/net/ethtool/linkmodes.c
@@ -4,6 +4,8 @@
 #include "common.h"
 #include "bitset.h"
 
+/* LINKMODES_GET */
+
 struct linkmodes_req_info {
 	struct ethnl_req_info		base;
 };
@@ -150,158 +152,6 @@ const struct ethnl_request_ops ethnl_linkmodes_request_ops = {
 
 /* LINKMODES_SET */
 
-struct link_mode_info {
-	int				speed;
-	u8				lanes;
-	u8				duplex;
-};
-
-#define __LINK_MODE_LANES_CR		1
-#define __LINK_MODE_LANES_CR2		2
-#define __LINK_MODE_LANES_CR4		4
-#define __LINK_MODE_LANES_CR8		8
-#define __LINK_MODE_LANES_DR		1
-#define __LINK_MODE_LANES_DR2		2
-#define __LINK_MODE_LANES_DR4		4
-#define __LINK_MODE_LANES_DR8		8
-#define __LINK_MODE_LANES_KR		1
-#define __LINK_MODE_LANES_KR2		2
-#define __LINK_MODE_LANES_KR4		4
-#define __LINK_MODE_LANES_KR8		8
-#define __LINK_MODE_LANES_SR		1
-#define __LINK_MODE_LANES_SR2		2
-#define __LINK_MODE_LANES_SR4		4
-#define __LINK_MODE_LANES_SR8		8
-#define __LINK_MODE_LANES_ER		1
-#define __LINK_MODE_LANES_KX		1
-#define __LINK_MODE_LANES_KX4		4
-#define __LINK_MODE_LANES_LR		1
-#define __LINK_MODE_LANES_LR4		4
-#define __LINK_MODE_LANES_LR4_ER4	4
-#define __LINK_MODE_LANES_LR_ER_FR	1
-#define __LINK_MODE_LANES_LR2_ER2_FR2	2
-#define __LINK_MODE_LANES_LR4_ER4_FR4	4
-#define __LINK_MODE_LANES_LR8_ER8_FR8	8
-#define __LINK_MODE_LANES_LRM		1
-#define __LINK_MODE_LANES_MLD2		2
-#define __LINK_MODE_LANES_T		1
-#define __LINK_MODE_LANES_T1		1
-#define __LINK_MODE_LANES_X		1
-#define __LINK_MODE_LANES_FX		1
-
-#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex)	\
-	[ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = {		\
-		.speed  = SPEED_ ## _speed, \
-		.lanes  = __LINK_MODE_LANES_ ## _type, \
-		.duplex	= __DUPLEX_ ## _duplex \
-	}
-#define __DUPLEX_Half DUPLEX_HALF
-#define __DUPLEX_Full DUPLEX_FULL
-#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \
-	[ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \
-		.speed	= SPEED_UNKNOWN, \
-		.lanes	= 0, \
-		.duplex	= DUPLEX_UNKNOWN, \
-	}
-
-static const struct link_mode_info link_mode_params[] = {
-	__DEFINE_LINK_MODE_PARAMS(10, T, Half),
-	__DEFINE_LINK_MODE_PARAMS(10, T, Full),
-	__DEFINE_LINK_MODE_PARAMS(100, T, Half),
-	__DEFINE_LINK_MODE_PARAMS(100, T, Full),
-	__DEFINE_LINK_MODE_PARAMS(1000, T, Half),
-	__DEFINE_LINK_MODE_PARAMS(1000, T, Full),
-	__DEFINE_SPECIAL_MODE_PARAMS(Autoneg),
-	__DEFINE_SPECIAL_MODE_PARAMS(TP),
-	__DEFINE_SPECIAL_MODE_PARAMS(AUI),
-	__DEFINE_SPECIAL_MODE_PARAMS(MII),
-	__DEFINE_SPECIAL_MODE_PARAMS(FIBRE),
-	__DEFINE_SPECIAL_MODE_PARAMS(BNC),
-	__DEFINE_LINK_MODE_PARAMS(10000, T, Full),
-	__DEFINE_SPECIAL_MODE_PARAMS(Pause),
-	__DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause),
-	__DEFINE_LINK_MODE_PARAMS(2500, X, Full),
-	__DEFINE_SPECIAL_MODE_PARAMS(Backplane),
-	__DEFINE_LINK_MODE_PARAMS(1000, KX, Full),
-	__DEFINE_LINK_MODE_PARAMS(10000, KX4, Full),
-	__DEFINE_LINK_MODE_PARAMS(10000, KR, Full),
-	[ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = {
-		.speed	= SPEED_10000,
-		.duplex = DUPLEX_FULL,
-	},
-	__DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full),
-	__DEFINE_LINK_MODE_PARAMS(20000, KR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(40000, KR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(40000, CR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(40000, SR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(40000, LR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(56000, KR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(56000, CR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(56000, SR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(56000, LR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(25000, CR, Full),
-	__DEFINE_LINK_MODE_PARAMS(25000, KR, Full),
-	__DEFINE_LINK_MODE_PARAMS(25000, SR, Full),
-	__DEFINE_LINK_MODE_PARAMS(50000, CR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(50000, KR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, KR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, SR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, CR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full),
-	__DEFINE_LINK_MODE_PARAMS(50000, SR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(1000, X, Full),
-	__DEFINE_LINK_MODE_PARAMS(10000, CR, Full),
-	__DEFINE_LINK_MODE_PARAMS(10000, SR, Full),
-	__DEFINE_LINK_MODE_PARAMS(10000, LR, Full),
-	__DEFINE_LINK_MODE_PARAMS(10000, LRM, Full),
-	__DEFINE_LINK_MODE_PARAMS(10000, ER, Full),
-	__DEFINE_LINK_MODE_PARAMS(2500, T, Full),
-	__DEFINE_LINK_MODE_PARAMS(5000, T, Full),
-	__DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE),
-	__DEFINE_SPECIAL_MODE_PARAMS(FEC_RS),
-	__DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER),
-	__DEFINE_LINK_MODE_PARAMS(50000, KR, Full),
-	__DEFINE_LINK_MODE_PARAMS(50000, SR, Full),
-	__DEFINE_LINK_MODE_PARAMS(50000, CR, Full),
-	__DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full),
-	__DEFINE_LINK_MODE_PARAMS(50000, DR, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, KR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, SR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, CR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, DR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(200000, KR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(200000, SR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(200000, DR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(200000, CR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(100, T1, Full),
-	__DEFINE_LINK_MODE_PARAMS(1000, T1, Full),
-	__DEFINE_LINK_MODE_PARAMS(400000, KR8, Full),
-	__DEFINE_LINK_MODE_PARAMS(400000, SR8, Full),
-	__DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full),
-	__DEFINE_LINK_MODE_PARAMS(400000, DR8, Full),
-	__DEFINE_LINK_MODE_PARAMS(400000, CR8, Full),
-	__DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS),
-	__DEFINE_LINK_MODE_PARAMS(100000, KR, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, SR, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, DR, Full),
-	__DEFINE_LINK_MODE_PARAMS(100000, CR, Full),
-	__DEFINE_LINK_MODE_PARAMS(200000, KR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(200000, SR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(200000, DR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(200000, CR2, Full),
-	__DEFINE_LINK_MODE_PARAMS(400000, KR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(400000, SR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(400000, DR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(400000, CR4, Full),
-	__DEFINE_LINK_MODE_PARAMS(100, FX, Half),
-	__DEFINE_LINK_MODE_PARAMS(100, FX, Full),
-};
-
 const struct nla_policy ethnl_linkmodes_set_policy[] = {
 	[ETHTOOL_A_LINKMODES_HEADER]		=
 		NLA_POLICY_NESTED(ethnl_header_policy),
@@ -327,9 +177,6 @@ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings,
 	DECLARE_BITMAP(old_adv, __ETHTOOL_LINK_MODE_MASK_NBITS);
 	unsigned int i;
 
-	BUILD_BUG_ON(ARRAY_SIZE(link_mode_params) !=
-		     __ETHTOOL_LINK_MODE_MASK_NBITS);
-
 	bitmap_copy(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS);
 
 	for (i = 0; i < __ETHTOOL_LINK_MODE_MASK_NBITS; i++) {
-- 
cgit v1.2.3


From 2a1673f0f1de78b146bfdbe8c8a773c4a0499790 Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso@google.com>
Date: Thu, 4 Feb 2021 08:50:36 +0800
Subject: usb: pd: Reland VDO definitions of PD2.0

Reland VDO definitions of PD Revision 2.0 as they are still used in
PD2.0 products.

Fixes: 0e1d6f55a12e ("usb: pd: Update VDO definitions")
Signed-off-by: Kyle Tso <kyletso@google.com>
Link: https://lore.kernel.org/r/20210204005036.1555294-1-kyletso@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/pd_vdo.h | 69 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/pd_vdo.h b/include/linux/usb/pd_vdo.h
index e9b6822c54c2..5de7f550f93e 100644
--- a/include/linux/usb/pd_vdo.h
+++ b/include/linux/usb/pd_vdo.h
@@ -110,6 +110,10 @@
  * <20:16>  :: Reserved, Shall be set to zero
  * <15:0>   :: USB-IF assigned VID for this cable vendor
  */
+
+/* PD Rev2.0 definition */
+#define IDH_PTYPE_UNDEF		0
+
 /* SOP Product Type (UFP) */
 #define IDH_PTYPE_NOT_UFP	0
 #define IDH_PTYPE_HUB		1
@@ -248,7 +252,25 @@
 	 | ((pnum) & 0x1f))
 
 /*
- * Passive Cable VDO
+ * Cable VDO (for both Passive and Active Cable VDO in PD Rev2.0)
+ * ---------
+ * <31:28> :: Cable HW version
+ * <27:24> :: Cable FW version
+ * <23:20> :: Reserved, Shall be set to zero
+ * <19:18> :: type-C to Type-A/B/C/Captive (00b == A, 01 == B, 10 == C, 11 == Captive)
+ * <17>    :: Reserved, Shall be set to zero
+ * <16:13> :: cable latency (0001 == <10ns(~1m length))
+ * <12:11> :: cable termination type (11b == both ends active VCONN req)
+ * <10>    :: SSTX1 Directionality support (0b == fixed, 1b == cfgable)
+ * <9>     :: SSTX2 Directionality support
+ * <8>     :: SSRX1 Directionality support
+ * <7>     :: SSRX2 Directionality support
+ * <6:5>   :: Vbus current handling capability (01b == 3A, 10b == 5A)
+ * <4>     :: Vbus through cable (0b == no, 1b == yes)
+ * <3>     :: SOP" controller present? (0b == no, 1b == yes)
+ * <2:0>   :: USB SS Signaling support
+ *
+ * Passive Cable VDO (PD Rev3.0+)
  * ---------
  * <31:28> :: Cable HW version
  * <27:24> :: Cable FW version
@@ -264,7 +286,7 @@
  * <4:3>   :: Reserved, Shall be set to zero
  * <2:0>   :: USB highest speed
  *
- * Active Cable VDO 1
+ * Active Cable VDO 1 (PD Rev3.0+)
  * ---------
  * <31:28> :: Cable HW version
  * <27:24> :: Cable FW version
@@ -286,7 +308,9 @@
 #define CABLE_VDO_VER1_0	0
 #define CABLE_VDO_VER1_3	3
 
-/* Connector Type */
+/* Connector Type (_ATYPE and _BTYPE are for PD Rev2.0 only) */
+#define CABLE_ATYPE		0
+#define CABLE_BTYPE		1
 #define CABLE_CTYPE		2
 #define CABLE_CAPTIVE		3
 
@@ -323,12 +347,22 @@
 #define CABLE_CURR_3A		1
 #define CABLE_CURR_5A		2
 
+/* USB SuperSpeed Signaling Support (PD Rev2.0) */
+#define CABLE_USBSS_U2_ONLY	0
+#define CABLE_USBSS_U31_GEN1	1
+#define CABLE_USBSS_U31_GEN2	2
+
 /* USB Highest Speed */
 #define CABLE_USB2_ONLY		0
 #define CABLE_USB32_GEN1	1
 #define CABLE_USB32_4_GEN2	2
 #define CABLE_USB4_GEN3		3
 
+#define VDO_CABLE(hw, fw, cbl, lat, term, tx1d, tx2d, rx1d, rx2d, cur, vps, sopp, usbss) \
+	(((hw) & 0x7) << 28 | ((fw) & 0x7) << 24 | ((cbl) & 0x3) << 18		\
+	 | ((lat) & 0x7) << 13 | ((term) & 0x3) << 11 | (tx1d) << 10		\
+	 | (tx2d) << 9 | (rx1d) << 8 | (rx2d) << 7 | ((cur) & 0x3) << 5		\
+	 | (vps) << 4 | (sopp) << 3 | ((usbss) & 0x7))
 #define VDO_PCABLE(hw, fw, ver, conn, lat, term, vbm, cur, spd)			\
 	(((hw) & 0xf) << 28 | ((fw) & 0xf) << 24 | ((ver) & 0x7) << 21		\
 	 | ((conn) & 0x3) << 18 | ((lat) & 0xf) << 13 | ((term) & 0x3) << 11	\
@@ -395,6 +429,35 @@
 	 | ((hops) & 0x3) << 6 | (u2) << 5 | (u32) << 4 | (lane) << 3		\
 	 | (iso) << 2 | (gen))
 
+/*
+ * AMA VDO (PD Rev2.0)
+ * ---------
+ * <31:28> :: Cable HW version
+ * <27:24> :: Cable FW version
+ * <23:12> :: Reserved, Shall be set to zero
+ * <11>    :: SSTX1 Directionality support (0b == fixed, 1b == cfgable)
+ * <10>    :: SSTX2 Directionality support
+ * <9>     :: SSRX1 Directionality support
+ * <8>     :: SSRX2 Directionality support
+ * <7:5>   :: Vconn power
+ * <4>     :: Vconn power required
+ * <3>     :: Vbus power required
+ * <2:0>   :: USB SS Signaling support
+ */
+#define VDO_AMA(hw, fw, tx1d, tx2d, rx1d, rx2d, vcpwr, vcr, vbr, usbss) \
+	(((hw) & 0x7) << 28 | ((fw) & 0x7) << 24			\
+	 | (tx1d) << 11 | (tx2d) << 10 | (rx1d) << 9 | (rx2d) << 8	\
+	 | ((vcpwr) & 0x7) << 5 | (vcr) << 4 | (vbr) << 3		\
+	 | ((usbss) & 0x7))
+
+#define PD_VDO_AMA_VCONN_REQ(vdo)	(((vdo) >> 4) & 1)
+#define PD_VDO_AMA_VBUS_REQ(vdo)	(((vdo) >> 3) & 1)
+
+#define AMA_USBSS_U2_ONLY	0
+#define AMA_USBSS_U31_GEN1	1
+#define AMA_USBSS_U31_GEN2	2
+#define AMA_USBSS_BBONLY	3
+
 /*
  * VPD VDO
  * ---------
-- 
cgit v1.2.3


From 3cd542e6e6afb6fa6c34d4094d498f42e22110f5 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Thu, 3 Sep 2020 13:13:21 +0300
Subject: thunderbolt: Add support for PCIe tunneling disabled (SL5)

Recent Intel Thunderbolt firmware connection manager has support for
another security level, SL5, that disables PCIe tunneling. This option
can be turned on from the BIOS.

When this is set the driver exposes a new security level "nopcie" to the
userspace and hides the authorized attribute under connected devices.

While there we also hide it when "dponly" security level is enabled
since it is not really usable in that case anyway.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Yehezkel Bernat <YehezkelShB@gmail.com>
---
 Documentation/ABI/testing/sysfs-bus-thunderbolt |  2 ++
 Documentation/admin-guide/thunderbolt.rst       |  7 +++++++
 drivers/thunderbolt/domain.c                    | 12 +++++++++++-
 drivers/thunderbolt/switch.c                    |  6 +++++-
 include/linux/thunderbolt.h                     |  3 +++
 5 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-thunderbolt b/Documentation/ABI/testing/sysfs-bus-thunderbolt
index 581dea95245b..d7f09d011b6d 100644
--- a/Documentation/ABI/testing/sysfs-bus-thunderbolt
+++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt
@@ -85,6 +85,8 @@ Description:	This attribute holds current Thunderbolt security level
 		usbonly  Automatically tunnel USB controller of the
 			 connected Thunderbolt dock (and Display Port). All
 			 PCIe links downstream of the dock are removed.
+		nopcie   USB4 system where PCIe tunneling is disabled from
+			 the BIOS.
 		=======  ==================================================
 
 What: /sys/bus/thunderbolt/devices/.../authorized
diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst
index 0d4348445f91..f18e881373c4 100644
--- a/Documentation/admin-guide/thunderbolt.rst
+++ b/Documentation/admin-guide/thunderbolt.rst
@@ -47,6 +47,9 @@ be DMA masters and thus read contents of the host memory without CPU and OS
 knowing about it. There are ways to prevent this by setting up an IOMMU but
 it is not always available for various reasons.
 
+Some USB4 systems have a BIOS setting to disable PCIe tunneling. This is
+treated as another security level (nopcie).
+
 The security levels are as follows:
 
   none
@@ -77,6 +80,10 @@ The security levels are as follows:
     Display Port in a dock. All PCIe links downstream of the dock are
     removed.
 
+  nopcie
+    PCIe tunneling is disabled/forbidden from the BIOS. Available in some
+    USB4 systems.
+
 The current security level can be read from
 ``/sys/bus/thunderbolt/devices/domainX/security`` where ``domainX`` is
 the Thunderbolt domain the host controller manages. There is typically
diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c
index 9ba2181464cc..a1c79c9c4f66 100644
--- a/drivers/thunderbolt/domain.c
+++ b/drivers/thunderbolt/domain.c
@@ -118,6 +118,7 @@ static const char * const tb_security_names[] = {
 	[TB_SECURITY_SECURE] = "secure",
 	[TB_SECURITY_DPONLY] = "dponly",
 	[TB_SECURITY_USBONLY] = "usbonly",
+	[TB_SECURITY_NOPCIE] = "nopcie",
 };
 
 static ssize_t boot_acl_show(struct device *dev, struct device_attribute *attr,
@@ -243,8 +244,14 @@ static ssize_t deauthorization_show(struct device *dev,
 				    char *buf)
 {
 	const struct tb *tb = container_of(dev, struct tb, dev);
+	bool deauthorization = false;
 
-	return sprintf(buf, "%d\n", !!tb->cm_ops->disapprove_switch);
+	/* Only meaningful if authorization is supported */
+	if (tb->security_level == TB_SECURITY_USER ||
+	    tb->security_level == TB_SECURITY_SECURE)
+		deauthorization = !!tb->cm_ops->disapprove_switch;
+
+	return sprintf(buf, "%d\n", deauthorization);
 }
 static DEVICE_ATTR_RO(deauthorization);
 
@@ -452,6 +459,9 @@ int tb_domain_add(struct tb *tb)
 			goto err_ctl_stop;
 	}
 
+	tb_dbg(tb, "security level set to %s\n",
+	       tb_security_names[tb->security_level]);
+
 	ret = device_add(&tb->dev);
 	if (ret)
 		goto err_ctl_stop;
diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c
index 5377d0a3390f..b63fecca6c2a 100644
--- a/drivers/thunderbolt/switch.c
+++ b/drivers/thunderbolt/switch.c
@@ -1774,7 +1774,11 @@ static umode_t switch_attr_is_visible(struct kobject *kobj,
 	struct device *dev = kobj_to_dev(kobj);
 	struct tb_switch *sw = tb_to_switch(dev);
 
-	if (attr == &dev_attr_device.attr) {
+	if (attr == &dev_attr_authorized.attr) {
+		if (sw->tb->security_level == TB_SECURITY_NOPCIE ||
+		    sw->tb->security_level == TB_SECURITY_DPONLY)
+			return 0;
+	} else if (attr == &dev_attr_device.attr) {
 		if (!sw->device)
 			return 0;
 	} else if (attr == &dev_attr_device_name.attr) {
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 034dccf93955..659a0a810fa1 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -45,6 +45,8 @@ enum tb_cfg_pkg_type {
  * @TB_SECURITY_USBONLY: Only tunnel USB controller of the connected
  *			 Thunderbolt dock (and Display Port). All PCIe
  *			 links downstream of the dock are removed.
+ * @TB_SECURITY_NOPCIE: For USB4 systems this level is used when the
+ *			PCIe tunneling is disabled from the BIOS.
  */
 enum tb_security_level {
 	TB_SECURITY_NONE,
@@ -52,6 +54,7 @@ enum tb_security_level {
 	TB_SECURITY_SECURE,
 	TB_SECURITY_DPONLY,
 	TB_SECURITY_USBONLY,
+	TB_SECURITY_NOPCIE,
 };
 
 /**
-- 
cgit v1.2.3


From 5a6a2c0f0f43676df27632d657a3f18b151a7ef8 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 18 Feb 2020 16:02:45 +0200
Subject: ACPI: Add support for native USB4 control _OSC

ACPI 6.4 introduced a new _OSC capability that is used negotiate native
connection manager support. Connection manager is the entity that is
responsible for tunneling over the USB4 fabric. If the platform rejects
the native access then firmware based connection manager is used.

The new _OSC also includes a set of bits that can be used to disable
certain tunnel types such as PCIe for security reasons for instance.

This implements the new USB4 _OSC so that we try to negotiate native
USB4 support if the Thunderbolt/USB4 (CONFIG_USB4) driver is enabled.
Drivers can determine what was negotiated by checking two new variables
exposed in this patch.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c   | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h | 10 +++++++
 2 files changed, 86 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index a52cb28c40d8..9c3fe08e8f18 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -281,6 +281,12 @@ bool osc_sb_apei_support_acked;
 bool osc_pc_lpi_support_confirmed;
 EXPORT_SYMBOL_GPL(osc_pc_lpi_support_confirmed);
 
+/*
+ * ACPI 6.4 Operating System Capabilities for USB.
+ */
+bool osc_sb_native_usb4_support_confirmed;
+EXPORT_SYMBOL_GPL(osc_sb_native_usb4_support_confirmed);
+
 static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48";
 static void acpi_bus_osc_negotiate_platform_control(void)
 {
@@ -317,6 +323,9 @@ static void acpi_bus_osc_negotiate_platform_control(void)
 	if (IS_ENABLED(CONFIG_SCHED_MC_PRIO))
 		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_DIVERSE_HIGH_SUPPORT;
 
+	if (IS_ENABLED(CONFIG_USB4))
+		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_NATIVE_USB4_SUPPORT;
+
 	if (!ghes_disable)
 		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_APEI_SUPPORT;
 	if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)))
@@ -348,8 +357,74 @@ static void acpi_bus_osc_negotiate_platform_control(void)
 			capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_APEI_SUPPORT;
 		osc_pc_lpi_support_confirmed =
 			capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_PCLPI_SUPPORT;
+		osc_sb_native_usb4_support_confirmed =
+			capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_NATIVE_USB4_SUPPORT;
+	}
+
+	kfree(context.ret.pointer);
+}
+
+/*
+ * Native control of USB4 capabilities. If any of the tunneling bits is
+ * set it means OS is in control and we use software based connection
+ * manager.
+ */
+u32 osc_sb_native_usb4_control;
+EXPORT_SYMBOL_GPL(osc_sb_native_usb4_control);
+
+static void acpi_bus_decode_usb_osc(const char *msg, u32 bits)
+{
+	printk(KERN_INFO PREFIX "%s USB3%c DisplayPort%c PCIe%c XDomain%c\n", msg,
+	       (bits & OSC_USB_USB3_TUNNELING) ? '+' : '-',
+	       (bits & OSC_USB_DP_TUNNELING) ? '+' : '-',
+	       (bits & OSC_USB_PCIE_TUNNELING) ? '+' : '-',
+	       (bits & OSC_USB_XDOMAIN) ? '+' : '-');
+}
+
+static u8 sb_usb_uuid_str[] = "23A0D13A-26AB-486C-9C5F-0FFA525A575A";
+static void acpi_bus_osc_negotiate_usb_control(void)
+{
+	u32 capbuf[3];
+	struct acpi_osc_context context = {
+		.uuid_str = sb_usb_uuid_str,
+		.rev = 1,
+		.cap.length = sizeof(capbuf),
+		.cap.pointer = capbuf,
+	};
+	acpi_handle handle;
+	acpi_status status;
+	u32 control;
+
+	if (!osc_sb_native_usb4_support_confirmed)
+		return;
+
+	if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)))
+		return;
+
+	control = OSC_USB_USB3_TUNNELING | OSC_USB_DP_TUNNELING |
+		  OSC_USB_PCIE_TUNNELING | OSC_USB_XDOMAIN;
+
+	capbuf[OSC_QUERY_DWORD] = 0;
+	capbuf[OSC_SUPPORT_DWORD] = 0;
+	capbuf[OSC_CONTROL_DWORD] = control;
+
+	status = acpi_run_osc(handle, &context);
+	if (ACPI_FAILURE(status))
+		return;
+
+	if (context.ret.length != sizeof(capbuf)) {
+		printk(KERN_INFO PREFIX "USB4 _OSC: returned invalid length buffer\n");
+		goto out_free;
 	}
 
+	osc_sb_native_usb4_control =
+		control & ((u32 *)context.ret.pointer)[OSC_CONTROL_DWORD];
+
+	acpi_bus_decode_usb_osc("USB4 _OSC: OS supports", control);
+	acpi_bus_decode_usb_osc("USB4 _OSC: OS controls",
+				osc_sb_native_usb4_control);
+
+out_free:
 	kfree(context.ret.pointer);
 }
 
@@ -1188,6 +1263,7 @@ static int __init acpi_bus_init(void)
 	 * so it must be run after ACPI_FULL_INITIALIZATION
 	 */
 	acpi_bus_osc_negotiate_platform_control();
+	acpi_bus_osc_negotiate_usb_control();
 
 	/*
 	 * _PDC control method may load dynamic SSDT tables,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 2630c2e953f7..ac68c2d4e393 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -546,9 +546,19 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_OSLPI_SUPPORT			0x00000100
 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT		0x00001000
 #define OSC_SB_GENERIC_INITIATOR_SUPPORT	0x00002000
+#define OSC_SB_NATIVE_USB4_SUPPORT		0x00040000
 
 extern bool osc_sb_apei_support_acked;
 extern bool osc_pc_lpi_support_confirmed;
+extern bool osc_sb_native_usb4_support_confirmed;
+
+/* USB4 Capabilities */
+#define OSC_USB_USB3_TUNNELING			0x00000001
+#define OSC_USB_DP_TUNNELING			0x00000002
+#define OSC_USB_PCIE_TUNNELING			0x00000004
+#define OSC_USB_XDOMAIN				0x00000008
+
+extern u32 osc_sb_native_usb4_control;
 
 /* PCI Host Bridge _OSC: Capabilities DWORD 2: Support Field */
 #define OSC_PCI_EXT_CONFIG_SUPPORT		0x00000001
-- 
cgit v1.2.3


From 2c07ded06427dd3339278487a1413d5e478f05f9 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Jan 2021 09:17:49 -0600
Subject: KVM/SVM: add support for SEV attestation command

The SEV FW version >= 0.23 added a new command that can be used to query
the attestation report containing the SHA-256 digest of the guest memory
encrypted through the KVM_SEV_LAUNCH_UPDATE_{DATA, VMSA} commands and
sign the report with the Platform Endorsement Key (PEK).

See the SEV FW API spec section 6.8 for more details.

Note there already exist a command (KVM_SEV_LAUNCH_MEASURE) that can be
used to get the SHA-256 digest. The main difference between the
KVM_SEV_LAUNCH_MEASURE and KVM_SEV_ATTESTATION_REPORT is that the latter
can be called while the guest is running and the measurement value is
signed with PEK.

Cc: James Bottomley <jejb@linux.ibm.com>
Cc: Tom Lendacky <Thomas.Lendacky@amd.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: John Allen <john.allen@amd.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: linux-crypto@vger.kernel.org
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: James Bottomley <jejb@linux.ibm.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Message-Id: <20210104151749.30248-1-brijesh.singh@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/amd-memory-encryption.rst | 21 +++++++
 arch/x86/kvm/svm/sev.c                           | 71 ++++++++++++++++++++++++
 drivers/crypto/ccp/sev-dev.c                     |  1 +
 include/linux/psp-sev.h                          | 17 ++++++
 include/uapi/linux/kvm.h                         |  8 +++
 5 files changed, 118 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst
index 09a8f2a34e39..469a6308765b 100644
--- a/Documentation/virt/kvm/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/amd-memory-encryption.rst
@@ -263,6 +263,27 @@ Returns: 0 on success, -negative on error
                 __u32 trans_len;
         };
 
+10. KVM_SEV_GET_ATTESTATION_REPORT
+----------------------------------
+
+The KVM_SEV_GET_ATTESTATION_REPORT command can be used by the hypervisor to query the attestation
+report containing the SHA-256 digest of the guest memory and VMSA passed through the KVM_SEV_LAUNCH
+commands and signed with the PEK. The digest returned by the command should match the digest
+used by the guest owner with the KVM_SEV_LAUNCH_MEASURE.
+
+Parameters (in): struct kvm_sev_attestation
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_attestation_report {
+                __u8 mnonce[16];        /* A random mnonce that will be placed in the report */
+
+                __u64 uaddr;            /* userspace address where the report should be copied */
+                __u32 len;
+        };
+
 References
 ==========
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 48017fef1cd9..8dfe8988be8d 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1041,6 +1041,74 @@ e_unpin_memory:
 	return ret;
 }
 
+static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	void __user *report = (void __user *)(uintptr_t)argp->data;
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_attestation_report *data;
+	struct kvm_sev_attestation_report params;
+	void __user *p;
+	void *blob = NULL;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+		return -EFAULT;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+	if (!data)
+		return -ENOMEM;
+
+	/* User wants to query the blob length */
+	if (!params.len)
+		goto cmd;
+
+	p = (void __user *)(uintptr_t)params.uaddr;
+	if (p) {
+		if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+			ret = -EINVAL;
+			goto e_free;
+		}
+
+		ret = -ENOMEM;
+		blob = kmalloc(params.len, GFP_KERNEL);
+		if (!blob)
+			goto e_free;
+
+		data->address = __psp_pa(blob);
+		data->len = params.len;
+		memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+	}
+cmd:
+	data->handle = sev->handle;
+	ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+	/*
+	 * If we query the session length, FW responded with expected data.
+	 */
+	if (!params.len)
+		goto done;
+
+	if (ret)
+		goto e_free_blob;
+
+	if (blob) {
+		if (copy_to_user(p, blob, params.len))
+			ret = -EFAULT;
+	}
+
+done:
+	params.len = data->len;
+	if (copy_to_user(report, &params, sizeof(params)))
+		ret = -EFAULT;
+e_free_blob:
+	kfree(blob);
+e_free:
+	kfree(data);
+	return ret;
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -1091,6 +1159,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_LAUNCH_SECRET:
 		r = sev_launch_secret(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_GET_ATTESTATION_REPORT:
+		r = sev_get_attestation_report(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 476113e12489..cb9b4c4e371e 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -128,6 +128,7 @@ static int sev_cmd_buffer_len(int cmd)
 	case SEV_CMD_LAUNCH_UPDATE_SECRET:	return sizeof(struct sev_data_launch_secret);
 	case SEV_CMD_DOWNLOAD_FIRMWARE:		return sizeof(struct sev_data_download_firmware);
 	case SEV_CMD_GET_ID:			return sizeof(struct sev_data_get_id);
+	case SEV_CMD_ATTESTATION_REPORT:	return sizeof(struct sev_data_attestation_report);
 	default:				return 0;
 	}
 
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 49d155cd2dfe..b801ead1e2bb 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -66,6 +66,7 @@ enum sev_cmd {
 	SEV_CMD_LAUNCH_MEASURE		= 0x033,
 	SEV_CMD_LAUNCH_UPDATE_SECRET	= 0x034,
 	SEV_CMD_LAUNCH_FINISH		= 0x035,
+	SEV_CMD_ATTESTATION_REPORT	= 0x036,
 
 	/* Guest migration commands (outgoing) */
 	SEV_CMD_SEND_START		= 0x040,
@@ -483,6 +484,22 @@ struct sev_data_dbg {
 	u32 len;				/* In */
 } __packed;
 
+/**
+ * struct sev_data_attestation_report - SEV_ATTESTATION_REPORT command parameters
+ *
+ * @handle: handle of the VM
+ * @mnonce: a random nonce that will be included in the report.
+ * @address: physical address where the report will be copied.
+ * @len: length of the physical buffer.
+ */
+struct sev_data_attestation_report {
+	u32 handle;				/* In */
+	u32 reserved;
+	u64 address;				/* In */
+	u8 mnonce[16];				/* In */
+	u32 len;				/* In/Out */
+} __packed;
+
 #ifdef CONFIG_CRYPTO_DEV_SP_PSP
 
 /**
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 374c67875cdb..07c194e2c302 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1593,6 +1593,8 @@ enum sev_cmd_id {
 	KVM_SEV_DBG_ENCRYPT,
 	/* Guest certificates commands */
 	KVM_SEV_CERT_EXPORT,
+	/* Attestation report */
+	KVM_SEV_GET_ATTESTATION_REPORT,
 
 	KVM_SEV_NR_MAX,
 };
@@ -1645,6 +1647,12 @@ struct kvm_sev_dbg {
 	__u32 len;
 };
 
+struct kvm_sev_attestation_report {
+	__u8 mnonce[16];
+	__u64 uaddr;
+	__u32 len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-- 
cgit v1.2.3


From 26128cb6c7e6731fe644c687af97733adfdb5ee9 Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon@google.com>
Date: Tue, 2 Feb 2021 10:57:12 -0800
Subject: locking/rwlocks: Add contention detection for rwlocks

rwlocks do not currently have any facility to detect contention
like spinlocks do. In order to allow users of rwlocks to better manage
latency, add contention detection for queued rwlocks.

CC: Ingo Molnar <mingo@redhat.com>
CC: Will Deacon <will@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20210202185734.1680553-7-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/asm-generic/qrwlock.h | 24 ++++++++++++++++++------
 include/linux/rwlock.h        |  7 +++++++
 2 files changed, 25 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 84ce841ce735..0020d3b820a7 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -14,6 +14,7 @@
 #include <asm/processor.h>
 
 #include <asm-generic/qrwlock_types.h>
+#include <asm-generic/qspinlock.h>
 
 /*
  * Writer states & reader shift and bias.
@@ -116,15 +117,26 @@ static inline void queued_write_unlock(struct qrwlock *lock)
 	smp_store_release(&lock->wlocked, 0);
 }
 
+/**
+ * queued_rwlock_is_contended - check if the lock is contended
+ * @lock : Pointer to queue rwlock structure
+ * Return: 1 if lock contended, 0 otherwise
+ */
+static inline int queued_rwlock_is_contended(struct qrwlock *lock)
+{
+	return arch_spin_is_locked(&lock->wait_lock);
+}
+
 /*
  * Remapping rwlock architecture specific functions to the corresponding
  * queue rwlock functions.
  */
-#define arch_read_lock(l)	queued_read_lock(l)
-#define arch_write_lock(l)	queued_write_lock(l)
-#define arch_read_trylock(l)	queued_read_trylock(l)
-#define arch_write_trylock(l)	queued_write_trylock(l)
-#define arch_read_unlock(l)	queued_read_unlock(l)
-#define arch_write_unlock(l)	queued_write_unlock(l)
+#define arch_read_lock(l)		queued_read_lock(l)
+#define arch_write_lock(l)		queued_write_lock(l)
+#define arch_read_trylock(l)		queued_read_trylock(l)
+#define arch_write_trylock(l)		queued_write_trylock(l)
+#define arch_read_unlock(l)		queued_read_unlock(l)
+#define arch_write_unlock(l)		queued_write_unlock(l)
+#define arch_rwlock_is_contended(l)	queued_rwlock_is_contended(l)
 
 #endif /* __ASM_GENERIC_QRWLOCK_H */
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index 3dcd617e65ae..7ce9a51ae5c0 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -128,4 +128,11 @@ do {								\
 	1 : ({ local_irq_restore(flags); 0; }); \
 })
 
+#ifdef arch_rwlock_is_contended
+#define rwlock_is_contended(lock) \
+	 arch_rwlock_is_contended(&(lock)->raw_lock)
+#else
+#define rwlock_is_contended(lock)	((void)(lock), 0)
+#endif /* arch_rwlock_is_contended */
+
 #endif /* __LINUX_RWLOCK_H */
-- 
cgit v1.2.3


From a09a689a534183c48f200bc2de1ae61ae9c462ad Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon@google.com>
Date: Tue, 2 Feb 2021 10:57:13 -0800
Subject: sched: Add needbreak for rwlocks

Contention awareness while holding a spin lock is essential for reducing
latency when long running kernel operations can hold that lock. Add the
same contention detection interface for read/write spin locks.

CC: Ingo Molnar <mingo@redhat.com>
CC: Will Deacon <will@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20210202185734.1680553-8-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/sched.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e3a5eeec509..5d1378e5a040 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1912,6 +1912,23 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
+/*
+ * Check if a rwlock is contended.
+ * Returns non-zero if there is another task waiting on the rwlock.
+ * Returns zero if the lock is not contended or the system / underlying
+ * rwlock implementation does not support contention detection.
+ * Technically does not depend on CONFIG_PREEMPTION, but a general need
+ * for low latency.
+ */
+static inline int rwlock_needbreak(rwlock_t *lock)
+{
+#ifdef CONFIG_PREEMPTION
+	return rwlock_is_contended(lock);
+#else
+	return 0;
+#endif
+}
+
 static __always_inline bool need_resched(void)
 {
 	return unlikely(tif_need_resched());
-- 
cgit v1.2.3


From f3d4b4b1dc1c5fb9ea17cac14133463bfe72f170 Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon@google.com>
Date: Tue, 2 Feb 2021 10:57:14 -0800
Subject: sched: Add cond_resched_rwlock

Safely rescheduling while holding a spin lock is essential for keeping
long running kernel operations running smoothly. Add the facility to
cond_resched rwlocks.

CC: Ingo Molnar <mingo@redhat.com>
CC: Will Deacon <will@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20210202185734.1680553-9-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/sched.h | 12 ++++++++++++
 kernel/sched/core.c   | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5d1378e5a040..3052d16da3cf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1883,12 +1883,24 @@ static inline int _cond_resched(void) { return 0; }
 })
 
 extern int __cond_resched_lock(spinlock_t *lock);
+extern int __cond_resched_rwlock_read(rwlock_t *lock);
+extern int __cond_resched_rwlock_write(rwlock_t *lock);
 
 #define cond_resched_lock(lock) ({				\
 	___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
 	__cond_resched_lock(lock);				\
 })
 
+#define cond_resched_rwlock_read(lock) ({			\
+	__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
+	__cond_resched_rwlock_read(lock);			\
+})
+
+#define cond_resched_rwlock_write(lock) ({			\
+	__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
+	__cond_resched_rwlock_write(lock);			\
+})
+
 static inline void cond_resched_rcu(void)
 {
 #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 15d2562118d1..ade357642279 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6695,6 +6695,46 @@ int __cond_resched_lock(spinlock_t *lock)
 }
 EXPORT_SYMBOL(__cond_resched_lock);
 
+int __cond_resched_rwlock_read(rwlock_t *lock)
+{
+	int resched = should_resched(PREEMPT_LOCK_OFFSET);
+	int ret = 0;
+
+	lockdep_assert_held_read(lock);
+
+	if (rwlock_needbreak(lock) || resched) {
+		read_unlock(lock);
+		if (resched)
+			preempt_schedule_common();
+		else
+			cpu_relax();
+		ret = 1;
+		read_lock(lock);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_read);
+
+int __cond_resched_rwlock_write(rwlock_t *lock)
+{
+	int resched = should_resched(PREEMPT_LOCK_OFFSET);
+	int ret = 0;
+
+	lockdep_assert_held_write(lock);
+
+	if (rwlock_needbreak(lock) || resched) {
+		write_unlock(lock);
+		if (resched)
+			preempt_schedule_common();
+		else
+			cpu_relax();
+		ret = 1;
+		write_lock(lock);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_write);
+
 /**
  * yield - yield the current processor to other threads.
  *
-- 
cgit v1.2.3


From 531810caa9f4bc99ffbb90e09256792c56a6b07a Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon@google.com>
Date: Tue, 2 Feb 2021 10:57:24 -0800
Subject: KVM: x86/mmu: Use an rwlock for the x86 MMU

Add a read / write lock to be used in place of the MMU spinlock on x86.
The rwlock will enable the TDP MMU to handle page faults, and other
operations in parallel in future commits.

Reviewed-by: Peter Feiner <pfeiner@google.com>
Signed-off-by: Ben Gardon <bgardon@google.com>

Message-Id: <20210202185734.1680553-19-bgardon@google.com>
[Introduce virt/kvm/mmu_lock.h - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +
 arch/x86/kvm/mmu/mmu.c          | 90 ++++++++++++++++++++---------------------
 arch/x86/kvm/mmu/page_track.c   |  8 ++--
 arch/x86/kvm/mmu/paging_tmpl.h  |  8 ++--
 arch/x86/kvm/mmu/tdp_mmu.c      | 20 ++++-----
 arch/x86/kvm/x86.c              |  4 +-
 include/linux/kvm_host.h        |  5 +++
 virt/kvm/dirty_ring.c           |  5 ++-
 virt/kvm/kvm_main.c             | 37 +++++++++--------
 virt/kvm/mmu_lock.h             | 23 +++++++++++
 10 files changed, 118 insertions(+), 84 deletions(-)
 create mode 100644 virt/kvm/mmu_lock.h

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fa7b2df6422b..c445a51244d3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -348,6 +348,8 @@ struct kvm_mmu_root_info {
 
 #define KVM_MMU_NUM_PREV_ROOTS 3
 
+#define KVM_HAVE_MMU_RWLOCK
+
 struct kvm_mmu_page;
 
 /*
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 5b364ff9c115..329930d57774 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2010,9 +2010,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 			flush |= kvm_sync_page(vcpu, sp, &invalid_list);
 			mmu_pages_clear_parents(&parents);
 		}
-		if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
 			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
-			cond_resched_lock(&vcpu->kvm->mmu_lock);
+			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
 			flush = false;
 		}
 	}
@@ -2464,7 +2464,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
  */
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
 {
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 
 	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
 		kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
@@ -2475,7 +2475,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
 
 	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
 
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 }
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2486,7 +2486,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 
 	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
 	r = 0;
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
 		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 			 sp->role.word);
@@ -2494,7 +2494,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 
 	return r;
 }
@@ -3186,7 +3186,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			return;
 	}
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3209,7 +3209,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	}
 
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
 
@@ -3230,16 +3230,16 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 {
 	struct kvm_mmu_page *sp;
 
-	spin_lock(&vcpu->kvm->mmu_lock);
+	write_lock(&vcpu->kvm->mmu_lock);
 
 	if (make_mmu_pages_available(vcpu)) {
-		spin_unlock(&vcpu->kvm->mmu_lock);
+		write_unlock(&vcpu->kvm->mmu_lock);
 		return INVALID_PAGE;
 	}
 	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
 	++sp->root_count;
 
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	write_unlock(&vcpu->kvm->mmu_lock);
 	return __pa(sp->spt);
 }
 
@@ -3410,17 +3410,17 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 		    !smp_load_acquire(&sp->unsync_children))
 			return;
 
-		spin_lock(&vcpu->kvm->mmu_lock);
+		write_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 
 		mmu_sync_children(vcpu, sp);
 
 		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
-		spin_unlock(&vcpu->kvm->mmu_lock);
+		write_unlock(&vcpu->kvm->mmu_lock);
 		return;
 	}
 
-	spin_lock(&vcpu->kvm->mmu_lock);
+	write_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 
 	for (i = 0; i < 4; ++i) {
@@ -3434,7 +3434,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	}
 
 	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	write_unlock(&vcpu->kvm->mmu_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
@@ -3718,7 +3718,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 		return r;
 
 	r = RET_PF_RETRY;
-	spin_lock(&vcpu->kvm->mmu_lock);
+	write_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	r = make_mmu_pages_available(vcpu);
@@ -3733,7 +3733,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 				 prefault, is_tdp);
 
 out_unlock:
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	write_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
 	return r;
 }
@@ -4959,7 +4959,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	 */
 	mmu_topup_memory_caches(vcpu, true);
 
-	spin_lock(&vcpu->kvm->mmu_lock);
+	write_lock(&vcpu->kvm->mmu_lock);
 
 	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
 
@@ -4991,7 +4991,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
 	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	write_unlock(&vcpu->kvm->mmu_lock);
 }
 
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -5189,14 +5189,14 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		if (iterator.rmap)
 			flush |= fn(kvm, iterator.rmap);
 
-		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
 			if (flush && lock_flush_tlb) {
 				kvm_flush_remote_tlbs_with_address(kvm,
 						start_gfn,
 						iterator.gfn - start_gfn + 1);
 				flush = false;
 			}
-			cond_resched_lock(&kvm->mmu_lock);
+			cond_resched_rwlock_write(&kvm->mmu_lock);
 		}
 	}
 
@@ -5346,7 +5346,7 @@ restart:
 		 * be in active use by the guest.
 		 */
 		if (batch >= BATCH_ZAP_PAGES &&
-		    cond_resched_lock(&kvm->mmu_lock)) {
+		    cond_resched_rwlock_write(&kvm->mmu_lock)) {
 			batch = 0;
 			goto restart;
 		}
@@ -5379,7 +5379,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
 	lockdep_assert_held(&kvm->slots_lock);
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 	trace_kvm_mmu_zap_all_fast(kvm);
 
 	/*
@@ -5406,7 +5406,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 	if (kvm->arch.tdp_mmu_enabled)
 		kvm_tdp_mmu_zap_all(kvm);
 
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 }
 
 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5448,7 +5448,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 	int i;
 	bool flush;
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		slots = __kvm_memslots(kvm, i);
 		kvm_for_each_memslot(memslot, slots) {
@@ -5472,7 +5472,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 			kvm_flush_remote_tlbs(kvm);
 	}
 
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5487,12 +5487,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 {
 	bool flush;
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 	flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
 				start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
 	if (kvm->arch.tdp_mmu_enabled)
 		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 
 	/*
 	 * We can flush all the TLBs out of the mmu lock without TLB
@@ -5552,13 +5552,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot)
 {
 	/* FIXME: const-ify all uses of struct kvm_memory_slot.  */
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
 			 kvm_mmu_zap_collapsible_spte, true);
 
 	if (kvm->arch.tdp_mmu_enabled)
 		kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
@@ -5581,11 +5581,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 {
 	bool flush;
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
 	if (kvm->arch.tdp_mmu_enabled)
 		flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 
 	/*
 	 * It's also safe to flush TLBs out of mmu lock here as currently this
@@ -5603,12 +5603,12 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
 {
 	bool flush;
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
 					false);
 	if (kvm->arch.tdp_mmu_enabled)
 		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 
 	if (flush)
 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5620,11 +5620,11 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 {
 	bool flush;
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
 	if (kvm->arch.tdp_mmu_enabled)
 		flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 
 	if (flush)
 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5637,14 +5637,14 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 	LIST_HEAD(invalid_list);
 	int ign;
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
 		if (WARN_ON(sp->role.invalid))
 			continue;
 		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
 			goto restart;
-		if (cond_resched_lock(&kvm->mmu_lock))
+		if (cond_resched_rwlock_write(&kvm->mmu_lock))
 			goto restart;
 	}
 
@@ -5653,7 +5653,7 @@ restart:
 	if (kvm->arch.tdp_mmu_enabled)
 		kvm_tdp_mmu_zap_all(kvm);
 
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
@@ -5713,7 +5713,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 			continue;
 
 		idx = srcu_read_lock(&kvm->srcu);
-		spin_lock(&kvm->mmu_lock);
+		write_lock(&kvm->mmu_lock);
 
 		if (kvm_has_zapped_obsolete_pages(kvm)) {
 			kvm_mmu_commit_zap_page(kvm,
@@ -5724,7 +5724,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
 
 unlock:
-		spin_unlock(&kvm->mmu_lock);
+		write_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 
 		/*
@@ -5944,7 +5944,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
 	ulong to_zap;
 
 	rcu_idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 
 	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
 	to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
@@ -5969,14 +5969,14 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
 			WARN_ON_ONCE(sp->lpage_disallowed);
 		}
 
-		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
 			kvm_mmu_commit_zap_page(kvm, &invalid_list);
-			cond_resched_lock(&kvm->mmu_lock);
+			cond_resched_rwlock_write(&kvm->mmu_lock);
 		}
 	}
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 	srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
 
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 8443a675715b..34bb0ec69bd8 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -184,9 +184,9 @@ kvm_page_track_register_notifier(struct kvm *kvm,
 
 	head = &kvm->arch.track_notifier_head;
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 	hlist_add_head_rcu(&n->node, &head->track_notifier_list);
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
 
@@ -202,9 +202,9 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,
 
 	head = &kvm->arch.track_notifier_head;
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 	hlist_del_rcu(&n->node);
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 	synchronize_srcu(&head->track_srcu);
 }
 EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50e268eb8e1a..d9f66cc459e8 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -868,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	}
 
 	r = RET_PF_RETRY;
-	spin_lock(&vcpu->kvm->mmu_lock);
+	write_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 
@@ -881,7 +881,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 
 out_unlock:
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	write_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
 	return r;
 }
@@ -919,7 +919,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
 		return;
 	}
 
-	spin_lock(&vcpu->kvm->mmu_lock);
+	write_lock(&vcpu->kvm->mmu_lock);
 	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
 		level = iterator.level;
 		sptep = iterator.sptep;
@@ -954,7 +954,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
 		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
 			break;
 	}
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	write_unlock(&vcpu->kvm->mmu_lock);
 }
 
 /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 9e4009068920..f1fbed72e149 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -59,7 +59,7 @@ static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
 					   struct kvm_mmu_page *root)
 {
-	lockdep_assert_held(&kvm->mmu_lock);
+	lockdep_assert_held_write(&kvm->mmu_lock);
 
 	if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
 		return false;
@@ -117,7 +117,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
 	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
 
-	lockdep_assert_held(&kvm->mmu_lock);
+	lockdep_assert_held_write(&kvm->mmu_lock);
 
 	WARN_ON(root->root_count);
 	WARN_ON(!root->tdp_mmu_page);
@@ -170,13 +170,13 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
 
 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
 
-	spin_lock(&kvm->mmu_lock);
+	write_lock(&kvm->mmu_lock);
 
 	/* Check for an existing root before allocating a new one. */
 	for_each_tdp_mmu_root(kvm, root) {
 		if (root->role.word == role.word) {
 			kvm_mmu_get_root(kvm, root);
-			spin_unlock(&kvm->mmu_lock);
+			write_unlock(&kvm->mmu_lock);
 			return root;
 		}
 	}
@@ -186,7 +186,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
 
 	list_add(&root->link, &kvm->arch.tdp_mmu_roots);
 
-	spin_unlock(&kvm->mmu_lock);
+	write_unlock(&kvm->mmu_lock);
 
 	return root;
 }
@@ -421,7 +421,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
 	int as_id = kvm_mmu_page_as_id(root);
 
-	lockdep_assert_held(&kvm->mmu_lock);
+	lockdep_assert_held_write(&kvm->mmu_lock);
 
 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
@@ -492,13 +492,13 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
 	if (iter->next_last_level_gfn == iter->yielded_gfn)
 		return false;
 
-	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
 		rcu_read_unlock();
 
 		if (flush)
 			kvm_flush_remote_tlbs(kvm);
 
-		cond_resched_lock(&kvm->mmu_lock);
+		cond_resched_rwlock_write(&kvm->mmu_lock);
 		rcu_read_lock();
 
 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -1103,7 +1103,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 	struct kvm_mmu_page *root;
 	int root_as_id;
 
-	lockdep_assert_held(&kvm->mmu_lock);
+	lockdep_assert_held_write(&kvm->mmu_lock);
 	for_each_tdp_mmu_root(kvm, root) {
 		root_as_id = kvm_mmu_page_as_id(root);
 		if (root_as_id != slot->as_id)
@@ -1268,7 +1268,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 	int root_as_id;
 	bool spte_set = false;
 
-	lockdep_assert_held(&kvm->mmu_lock);
+	lockdep_assert_held_write(&kvm->mmu_lock);
 	for_each_tdp_mmu_root(kvm, root) {
 		root_as_id = kvm_mmu_page_as_id(root);
 		if (root_as_id != slot->as_id)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 912fc418ce99..b3b1cce939ab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7112,9 +7112,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	if (vcpu->arch.mmu->direct_map) {
 		unsigned int indirect_shadow_pages;
 
-		spin_lock(&vcpu->kvm->mmu_lock);
+		write_lock(&vcpu->kvm->mmu_lock);
 		indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
-		spin_unlock(&vcpu->kvm->mmu_lock);
+		write_unlock(&vcpu->kvm->mmu_lock);
 
 		if (indirect_shadow_pages)
 			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f3b1013fb22c..f417447129b9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -451,7 +451,12 @@ struct kvm_memslots {
 };
 
 struct kvm {
+#ifdef KVM_HAVE_MMU_RWLOCK
+	rwlock_t mmu_lock;
+#else
 	spinlock_t mmu_lock;
+#endif /* KVM_HAVE_MMU_RWLOCK */
+
 	struct mutex slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index 790f17325f8d..7aafefc50aa7 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -9,6 +9,7 @@
 #include <linux/vmalloc.h>
 #include <linux/kvm_dirty_ring.h>
 #include <trace/events/kvm.h>
+#include "mmu_lock.h"
 
 int __weak kvm_cpu_dirty_log_size(void)
 {
@@ -60,9 +61,9 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
 	if (!memslot || (offset + __fls(mask)) >= memslot->npages)
 		return;
 
-	spin_lock(&kvm->mmu_lock);
+	KVM_MMU_LOCK(kvm);
 	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
-	spin_unlock(&kvm->mmu_lock);
+	KVM_MMU_UNLOCK(kvm);
 }
 
 int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 335a1a2b8edc..48ccdf4e3d04 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -58,6 +58,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "mmu_lock.h"
 #include "vfio.h"
 
 #define CREATE_TRACE_POINTS
@@ -459,13 +460,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	int idx;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
+
+	KVM_MMU_LOCK(kvm);
+
 	kvm->mmu_notifier_seq++;
 
 	if (kvm_set_spte_hva(kvm, address, pte))
 		kvm_flush_remote_tlbs(kvm);
 
-	spin_unlock(&kvm->mmu_lock);
+	KVM_MMU_UNLOCK(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
@@ -476,7 +479,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	int need_tlb_flush = 0, idx;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
+	KVM_MMU_LOCK(kvm);
 	/*
 	 * The count increase must become visible at unlock time as no
 	 * spte can be established without taking the mmu_lock and
@@ -489,7 +492,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	if (need_tlb_flush || kvm->tlbs_dirty)
 		kvm_flush_remote_tlbs(kvm);
 
-	spin_unlock(&kvm->mmu_lock);
+	KVM_MMU_UNLOCK(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return 0;
@@ -500,7 +503,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 
-	spin_lock(&kvm->mmu_lock);
+	KVM_MMU_LOCK(kvm);
 	/*
 	 * This sequence increase will notify the kvm page fault that
 	 * the page that is going to be mapped in the spte could have
@@ -514,7 +517,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	 * in conjunction with the smp_rmb in mmu_notifier_retry().
 	 */
 	kvm->mmu_notifier_count--;
-	spin_unlock(&kvm->mmu_lock);
+	KVM_MMU_UNLOCK(kvm);
 
 	BUG_ON(kvm->mmu_notifier_count < 0);
 }
@@ -528,13 +531,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	int young, idx;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
+	KVM_MMU_LOCK(kvm);
 
 	young = kvm_age_hva(kvm, start, end);
 	if (young)
 		kvm_flush_remote_tlbs(kvm);
 
-	spin_unlock(&kvm->mmu_lock);
+	KVM_MMU_UNLOCK(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return young;
@@ -549,7 +552,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 	int young, idx;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
+	KVM_MMU_LOCK(kvm);
 	/*
 	 * Even though we do not flush TLB, this will still adversely
 	 * affect performance on pre-Haswell Intel EPT, where there is
@@ -564,7 +567,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 	 * more sophisticated heuristic later.
 	 */
 	young = kvm_age_hva(kvm, start, end);
-	spin_unlock(&kvm->mmu_lock);
+	KVM_MMU_UNLOCK(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return young;
@@ -578,9 +581,9 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 	int young, idx;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
+	KVM_MMU_LOCK(kvm);
 	young = kvm_test_age_hva(kvm, address);
-	spin_unlock(&kvm->mmu_lock);
+	KVM_MMU_UNLOCK(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return young;
@@ -745,7 +748,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	if (!kvm)
 		return ERR_PTR(-ENOMEM);
 
-	spin_lock_init(&kvm->mmu_lock);
+	KVM_MMU_LOCK_INIT(kvm);
 	mmgrab(current->mm);
 	kvm->mm = current->mm;
 	kvm_eventfd_init(kvm);
@@ -1525,7 +1528,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
 		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
 		memset(dirty_bitmap_buffer, 0, n);
 
-		spin_lock(&kvm->mmu_lock);
+		KVM_MMU_LOCK(kvm);
 		for (i = 0; i < n / sizeof(long); i++) {
 			unsigned long mask;
 			gfn_t offset;
@@ -1541,7 +1544,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
 								offset, mask);
 		}
-		spin_unlock(&kvm->mmu_lock);
+		KVM_MMU_UNLOCK(kvm);
 	}
 
 	if (flush)
@@ -1636,7 +1639,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
 	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
 		return -EFAULT;
 
-	spin_lock(&kvm->mmu_lock);
+	KVM_MMU_LOCK(kvm);
 	for (offset = log->first_page, i = offset / BITS_PER_LONG,
 		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
 	     i++, offset += BITS_PER_LONG) {
@@ -1659,7 +1662,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
 								offset, mask);
 		}
 	}
-	spin_unlock(&kvm->mmu_lock);
+	KVM_MMU_UNLOCK(kvm);
 
 	if (flush)
 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
diff --git a/virt/kvm/mmu_lock.h b/virt/kvm/mmu_lock.h
new file mode 100644
index 000000000000..9e1308f9734c
--- /dev/null
+++ b/virt/kvm/mmu_lock.h
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#ifndef KVM_MMU_LOCK_H
+#define KVM_MMU_LOCK_H 1
+
+/*
+ * Architectures can choose whether to use an rwlock or spinlock
+ * for the mmu_lock.  These macros, for use in common code
+ * only, avoids using #ifdefs in places that must deal with
+ * multiple architectures.
+ */
+
+#ifdef KVM_HAVE_MMU_RWLOCK
+#define KVM_MMU_LOCK_INIT(kvm) rwlock_init(&(kvm)->mmu_lock)
+#define KVM_MMU_LOCK(kvm)      write_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_UNLOCK(kvm)    write_unlock(&(kvm)->mmu_lock)
+#else
+#define KVM_MMU_LOCK_INIT(kvm) spin_lock_init(&(kvm)->mmu_lock)
+#define KVM_MMU_LOCK(kvm)      spin_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_UNLOCK(kvm)    spin_unlock(&(kvm)->mmu_lock)
+#endif /* KVM_HAVE_MMU_RWLOCK */
+
+#endif
-- 
cgit v1.2.3


From 7d8658ef65a4f891d0cff6340fa717b378384642 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Thu, 4 Feb 2021 16:14:22 +0800
Subject: OPP: Add function to look up required OPP's for a given OPP

Add a function that allows looking up required OPPs given a source OPP
table, destination OPP table and the source OPP.

Signed-off-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
[ Viresh: Rearranged code, fixed return errors ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h |  7 +++++++
 2 files changed, 62 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index dc95d29e94c1..c3f3d9249cc5 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2398,6 +2398,61 @@ devm_pm_opp_attach_genpd(struct device *dev, const char **names,
 }
 EXPORT_SYMBOL_GPL(devm_pm_opp_attach_genpd);
 
+/**
+ * dev_pm_opp_xlate_required_opp() - Find required OPP for @src_table OPP.
+ * @src_table: OPP table which has @dst_table as one of its required OPP table.
+ * @dst_table: Required OPP table of the @src_table.
+ * @src_opp: OPP from the @src_table.
+ *
+ * This function returns the OPP (present in @dst_table) pointed out by the
+ * "required-opps" property of the @src_opp (present in @src_table).
+ *
+ * The callers are required to call dev_pm_opp_put() for the returned OPP after
+ * use.
+ *
+ * Return: pointer to 'struct dev_pm_opp' on success and errorno otherwise.
+ */
+struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table,
+						 struct opp_table *dst_table,
+						 struct dev_pm_opp *src_opp)
+{
+	struct dev_pm_opp *opp, *dest_opp = ERR_PTR(-ENODEV);
+	int i;
+
+	if (!src_table || !dst_table || !src_opp ||
+	    !src_table->required_opp_tables)
+		return ERR_PTR(-EINVAL);
+
+	/* required-opps not fully initialized yet */
+	if (lazy_linking_pending(src_table))
+		return ERR_PTR(-EBUSY);
+
+	for (i = 0; i < src_table->required_opp_count; i++) {
+		if (src_table->required_opp_tables[i] == dst_table) {
+			mutex_lock(&src_table->lock);
+
+			list_for_each_entry(opp, &src_table->opp_list, node) {
+				if (opp == src_opp) {
+					dest_opp = opp->required_opps[i];
+					dev_pm_opp_get(dest_opp);
+					break;
+				}
+			}
+
+			mutex_unlock(&src_table->lock);
+			break;
+		}
+	}
+
+	if (IS_ERR(dest_opp)) {
+		pr_err("%s: Couldn't find matching OPP (%p: %p)\n", __func__,
+		       src_table, dst_table);
+	}
+
+	return dest_opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_xlate_required_opp);
+
 /**
  * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for src_table.
  * @src_table: OPP table which has dst_table as one of its required OPP table.
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index ab1d15ce559d..c0371efa4a0f 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -156,6 +156,7 @@ struct opp_table *devm_pm_opp_register_set_opp_helper(struct device *dev, int (*
 struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
 void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
 struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
+struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
 int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp);
@@ -367,6 +368,12 @@ static inline struct opp_table *devm_pm_opp_attach_genpd(struct device *dev,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table,
+				struct opp_table *dst_table, struct dev_pm_opp *src_opp)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 26f9c7cc42a6dc036edf871544fd0e6b3a0601c1 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Thu, 4 Feb 2021 16:14:23 +0800
Subject: PM / devfreq: Cache OPP table reference in devfreq

The OPP table can be used often in devfreq. Trying to get it each time can
be expensive, so cache it in the devfreq struct.

Signed-off-by: Saravana Kannan <saravanak@google.com>
Acked-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
[ Viresh: Added a blank line ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/devfreq/devfreq.c | 7 +++++++
 include/linux/devfreq.h   | 2 ++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 6aa10de792b3..cefe84a10824 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -757,6 +757,9 @@ static void devfreq_dev_release(struct device *dev)
 	if (devfreq->profile->exit)
 		devfreq->profile->exit(devfreq->dev.parent);
 
+	if (devfreq->opp_table)
+		dev_pm_opp_put_opp_table(devfreq->opp_table);
+
 	mutex_destroy(&devfreq->lock);
 	kfree(devfreq);
 }
@@ -844,6 +847,10 @@ struct devfreq *devfreq_add_device(struct device *dev,
 	}
 
 	devfreq->suspend_freq = dev_pm_opp_get_suspend_opp_freq(dev);
+	devfreq->opp_table = dev_pm_opp_get_opp_table(dev);
+	if (IS_ERR(devfreq->opp_table))
+		devfreq->opp_table = NULL;
+
 	atomic_set(&devfreq->suspend_count, 0);
 
 	dev_set_name(&devfreq->dev, "%s", dev_name(dev));
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index b6d3bae1c74d..26ea0850be9b 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -137,6 +137,7 @@ struct devfreq_stats {
  *		using devfreq.
  * @profile:	device-specific devfreq profile
  * @governor:	method how to choose frequency based on the usage.
+ * @opp_table:	Reference to OPP table of dev.parent, if one exists.
  * @nb:		notifier block used to notify devfreq object that it should
  *		reevaluate operable frequencies. Devfreq users may use
  *		devfreq.nb to the corresponding register notifier call chain.
@@ -173,6 +174,7 @@ struct devfreq {
 	struct device dev;
 	struct devfreq_dev_profile *profile;
 	const struct devfreq_governor *governor;
+	struct opp_table *opp_table;
 	struct notifier_block nb;
 	struct delayed_work work;
 
-- 
cgit v1.2.3


From ed8188a0c1f0f49739c727a53df1174826c1a80b Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 4 Feb 2021 09:43:55 +0800
Subject: iommu/vt-d: Fix 'physical' typos

Fix misspellings of "physical".

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/linux-iommu/20210126211738.2920789-1-helgaas@kernel.org
Link: https://lore.kernel.org/r/20210204014401.2846425-2-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/intel-iommu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index ecb35fdff03e..f8ab154c979d 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -664,7 +664,7 @@ static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
  * 7: super page
  * 8-10: available
  * 11: snoop behavior
- * 12-63: Host physcial address
+ * 12-63: Host physical address
  */
 struct dma_pte {
 	u64 val;
-- 
cgit v1.2.3


From ad3d19029979b19378ece2011fc8ce07be98c905 Mon Sep 17 00:00:00 2001
From: Kyung Min Park <kyung.min.park@intel.com>
Date: Thu, 4 Feb 2021 09:43:56 +0800
Subject: iommu/vt-d: Audit IOMMU Capabilities and add helper functions

Audit IOMMU Capability/Extended Capability and check if the IOMMUs have
the consistent value for features. Report out or scale to the lowest
supported when IOMMU features have incompatibility among IOMMUs.

Report out features when below features are mismatched:
  - First Level 5 Level Paging Support (FL5LP)
  - First Level 1 GByte Page Support (FL1GP)
  - Read Draining (DRD)
  - Write Draining (DWD)
  - Page Selective Invalidation (PSI)
  - Zero Length Read (ZLR)
  - Caching Mode (CM)
  - Protected High/Low-Memory Region (PHMR/PLMR)
  - Required Write-Buffer Flushing (RWBF)
  - Advanced Fault Logging (AFL)
  - RID-PASID Support (RPS)
  - Scalable Mode Page Walk Coherency (SMPWC)
  - First Level Translation Support (FLTS)
  - Second Level Translation Support (SLTS)
  - No Write Flag Support (NWFS)
  - Second Level Accessed/Dirty Support (SLADS)
  - Virtual Command Support (VCS)
  - Scalable Mode Translation Support (SMTS)
  - Device TLB Invalidation Throttle (DIT)
  - Page Drain Support (PDS)
  - Process Address Space ID Support (PASID)
  - Extended Accessed Flag Support (EAFS)
  - Supervisor Request Support (SRS)
  - Execute Request Support (ERS)
  - Page Request Support (PRS)
  - Nested Translation Support (NEST)
  - Snoop Control (SC)
  - Pass Through (PT)
  - Device TLB Support (DT)
  - Queued Invalidation (QI)
  - Page walk Coherency (C)

Set capability to the lowest supported when below features are mismatched:
  - Maximum Address Mask Value (MAMV)
  - Number of Fault Recording Registers (NFR)
  - Second Level Large Page Support (SLLPS)
  - Fault Recording Offset (FRO)
  - Maximum Guest Address Width (MGAW)
  - Supported Adjusted Guest Address Width (SAGAW)
  - Number of Domains supported (NDOMS)
  - Pasid Size Supported (PSS)
  - Maximum Handle Mask Value (MHMV)
  - IOTLB Register Offset (IRO)

Signed-off-by: Kyung Min Park <kyung.min.park@intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210130184452.31711-1-kyung.min.park@intel.com
Link: https://lore.kernel.org/r/20210204014401.2846425-3-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/Makefile        |   2 +-
 drivers/iommu/intel/cap_audit.c     | 185 ++++++++++++++++++++++++++++++++++++
 drivers/iommu/intel/cap_audit.h     | 110 +++++++++++++++++++++
 drivers/iommu/intel/iommu.c         |   9 ++
 drivers/iommu/intel/irq_remapping.c |   8 ++
 include/linux/intel-iommu.h         |  39 ++++----
 6 files changed, 334 insertions(+), 19 deletions(-)
 create mode 100644 drivers/iommu/intel/cap_audit.c
 create mode 100644 drivers/iommu/intel/cap_audit.h

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
index ae570810a35e..ae236ec7d219 100644
--- a/drivers/iommu/intel/Makefile
+++ b/drivers/iommu/intel/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o
-obj-$(CONFIG_DMAR_TABLE) += trace.o
+obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o
 obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
 obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o
 obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o
diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c
new file mode 100644
index 000000000000..049bc0c76a00
--- /dev/null
+++ b/drivers/iommu/intel/cap_audit.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * cap_audit.c - audit iommu capabilities for boot time and hot plug
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * Author: Kyung Min Park <kyung.min.park@intel.com>
+ *         Lu Baolu <baolu.lu@linux.intel.com>
+ */
+
+#define pr_fmt(fmt)	"DMAR: " fmt
+
+#include <linux/intel-iommu.h>
+#include "cap_audit.h"
+
+static u64 intel_iommu_cap_sanity;
+static u64 intel_iommu_ecap_sanity;
+
+static inline void check_irq_capabilities(struct intel_iommu *a,
+					  struct intel_iommu *b)
+{
+	CHECK_FEATURE_MISMATCH(a, b, cap, pi_support, CAP_PI_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, eim_support, ECAP_EIM_MASK);
+}
+
+static inline void check_dmar_capabilities(struct intel_iommu *a,
+					   struct intel_iommu *b)
+{
+	MINIMAL_FEATURE_IOMMU(b, cap, CAP_MAMV_MASK);
+	MINIMAL_FEATURE_IOMMU(b, cap, CAP_NFR_MASK);
+	MINIMAL_FEATURE_IOMMU(b, cap, CAP_SLLPS_MASK);
+	MINIMAL_FEATURE_IOMMU(b, cap, CAP_FRO_MASK);
+	MINIMAL_FEATURE_IOMMU(b, cap, CAP_MGAW_MASK);
+	MINIMAL_FEATURE_IOMMU(b, cap, CAP_SAGAW_MASK);
+	MINIMAL_FEATURE_IOMMU(b, cap, CAP_NDOMS_MASK);
+	MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_PSS_MASK);
+	MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_MHMV_MASK);
+	MINIMAL_FEATURE_IOMMU(b, ecap, ECAP_IRO_MASK);
+
+	CHECK_FEATURE_MISMATCH(a, b, cap, 5lp_support, CAP_FL5LP_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, cap, fl1gp_support, CAP_FL1GP_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, cap, read_drain, CAP_RD_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, cap, write_drain, CAP_WD_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, cap, pgsel_inv, CAP_PSI_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, cap, zlr, CAP_ZLR_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, cap, caching_mode, CAP_CM_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, cap, phmr, CAP_PHMR_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, cap, plmr, CAP_PLMR_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, cap, rwbf, CAP_RWBF_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, cap, afl, CAP_AFL_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, rps, ECAP_RPS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, smpwc, ECAP_SMPWC_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, flts, ECAP_FLTS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, slts, ECAP_SLTS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, nwfs, ECAP_NWFS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, slads, ECAP_SLADS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, vcs, ECAP_VCS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, smts, ECAP_SMTS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, pds, ECAP_PDS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, dit, ECAP_DIT_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, pasid, ECAP_PASID_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, eafs, ECAP_EAFS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, srs, ECAP_SRS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, ers, ECAP_ERS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, prs, ECAP_PRS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, nest, ECAP_NEST_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, mts, ECAP_MTS_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, sc_support, ECAP_SC_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, pass_through, ECAP_PT_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, dev_iotlb_support, ECAP_DT_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, qis, ECAP_QI_MASK);
+	CHECK_FEATURE_MISMATCH(a, b, ecap, coherent, ECAP_C_MASK);
+}
+
+static int cap_audit_hotplug(struct intel_iommu *iommu, enum cap_audit_type type)
+{
+	bool mismatch = false;
+	u64 old_cap = intel_iommu_cap_sanity;
+	u64 old_ecap = intel_iommu_ecap_sanity;
+
+	if (type == CAP_AUDIT_HOTPLUG_IRQR) {
+		CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pi_support, CAP_PI_MASK);
+		CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eim_support, ECAP_EIM_MASK);
+		goto out;
+	}
+
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, 5lp_support, CAP_FL5LP_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, fl1gp_support, CAP_FL1GP_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, read_drain, CAP_RD_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, write_drain, CAP_WD_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, pgsel_inv, CAP_PSI_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, zlr, CAP_ZLR_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, caching_mode, CAP_CM_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, phmr, CAP_PHMR_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, plmr, CAP_PLMR_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, rwbf, CAP_RWBF_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, cap, afl, CAP_AFL_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, rps, ECAP_RPS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smpwc, ECAP_SMPWC_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, flts, ECAP_FLTS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slts, ECAP_SLTS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nwfs, ECAP_NWFS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, slads, ECAP_SLADS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, vcs, ECAP_VCS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, smts, ECAP_SMTS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pds, ECAP_PDS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dit, ECAP_DIT_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pasid, ECAP_PASID_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, eafs, ECAP_EAFS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, srs, ECAP_SRS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, ers, ECAP_ERS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, prs, ECAP_PRS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, nest, ECAP_NEST_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, mts, ECAP_MTS_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, sc_support, ECAP_SC_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, pass_through, ECAP_PT_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, dev_iotlb_support, ECAP_DT_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, qis, ECAP_QI_MASK);
+	CHECK_FEATURE_MISMATCH_HOTPLUG(iommu, ecap, coherent, ECAP_C_MASK);
+
+	/* Abort hot plug if the hot plug iommu feature is smaller than global */
+	MINIMAL_FEATURE_HOTPLUG(iommu, cap, max_amask_val, CAP_MAMV_MASK, mismatch);
+	MINIMAL_FEATURE_HOTPLUG(iommu, cap, num_fault_regs, CAP_NFR_MASK, mismatch);
+	MINIMAL_FEATURE_HOTPLUG(iommu, cap, super_page_val, CAP_SLLPS_MASK, mismatch);
+	MINIMAL_FEATURE_HOTPLUG(iommu, cap, fault_reg_offset, CAP_FRO_MASK, mismatch);
+	MINIMAL_FEATURE_HOTPLUG(iommu, cap, mgaw, CAP_MGAW_MASK, mismatch);
+	MINIMAL_FEATURE_HOTPLUG(iommu, cap, sagaw, CAP_SAGAW_MASK, mismatch);
+	MINIMAL_FEATURE_HOTPLUG(iommu, cap, ndoms, CAP_NDOMS_MASK, mismatch);
+	MINIMAL_FEATURE_HOTPLUG(iommu, ecap, pss, ECAP_PSS_MASK, mismatch);
+	MINIMAL_FEATURE_HOTPLUG(iommu, ecap, max_handle_mask, ECAP_MHMV_MASK, mismatch);
+	MINIMAL_FEATURE_HOTPLUG(iommu, ecap, iotlb_offset, ECAP_IRO_MASK, mismatch);
+
+out:
+	if (mismatch) {
+		intel_iommu_cap_sanity = old_cap;
+		intel_iommu_ecap_sanity = old_ecap;
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type)
+{
+	struct dmar_drhd_unit *d;
+	struct intel_iommu *i;
+
+	rcu_read_lock();
+	if (list_empty(&dmar_drhd_units))
+		goto out;
+
+	for_each_active_iommu(i, d) {
+		if (!iommu) {
+			intel_iommu_ecap_sanity = i->ecap;
+			intel_iommu_cap_sanity = i->cap;
+			iommu = i;
+			continue;
+		}
+
+		if (type == CAP_AUDIT_STATIC_DMAR)
+			check_dmar_capabilities(iommu, i);
+		else
+			check_irq_capabilities(iommu, i);
+	}
+
+out:
+	rcu_read_unlock();
+	return 0;
+}
+
+int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu)
+{
+	switch (type) {
+	case CAP_AUDIT_STATIC_DMAR:
+	case CAP_AUDIT_STATIC_IRQR:
+		return cap_audit_static(iommu, type);
+	case CAP_AUDIT_HOTPLUG_DMAR:
+	case CAP_AUDIT_HOTPLUG_IRQR:
+		return cap_audit_hotplug(iommu, type);
+	default:
+		break;
+	}
+
+	return -EFAULT;
+}
diff --git a/drivers/iommu/intel/cap_audit.h b/drivers/iommu/intel/cap_audit.h
new file mode 100644
index 000000000000..a6a1530441b7
--- /dev/null
+++ b/drivers/iommu/intel/cap_audit.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * cap_audit.h - audit iommu capabilities header
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * Author: Kyung Min Park <kyung.min.park@intel.com>
+ */
+
+/*
+ * Capability Register Mask
+ */
+#define CAP_FL5LP_MASK		BIT_ULL(60)
+#define CAP_PI_MASK		BIT_ULL(59)
+#define CAP_FL1GP_MASK		BIT_ULL(56)
+#define CAP_RD_MASK		BIT_ULL(55)
+#define CAP_WD_MASK		BIT_ULL(54)
+#define CAP_MAMV_MASK		GENMASK_ULL(53, 48)
+#define CAP_NFR_MASK		GENMASK_ULL(47, 40)
+#define CAP_PSI_MASK		BIT_ULL(39)
+#define CAP_SLLPS_MASK		GENMASK_ULL(37, 34)
+#define CAP_FRO_MASK		GENMASK_ULL(33, 24)
+#define CAP_ZLR_MASK		BIT_ULL(22)
+#define CAP_MGAW_MASK		GENMASK_ULL(21, 16)
+#define CAP_SAGAW_MASK		GENMASK_ULL(12, 8)
+#define CAP_CM_MASK		BIT_ULL(7)
+#define CAP_PHMR_MASK		BIT_ULL(6)
+#define CAP_PLMR_MASK		BIT_ULL(5)
+#define CAP_RWBF_MASK		BIT_ULL(4)
+#define CAP_AFL_MASK		BIT_ULL(3)
+#define CAP_NDOMS_MASK		GENMASK_ULL(2, 0)
+
+/*
+ * Extended Capability Register Mask
+ */
+#define ECAP_RPS_MASK		BIT_ULL(49)
+#define ECAP_SMPWC_MASK		BIT_ULL(48)
+#define ECAP_FLTS_MASK		BIT_ULL(47)
+#define ECAP_SLTS_MASK		BIT_ULL(46)
+#define ECAP_SLADS_MASK		BIT_ULL(45)
+#define ECAP_VCS_MASK		BIT_ULL(44)
+#define ECAP_SMTS_MASK		BIT_ULL(43)
+#define ECAP_PDS_MASK		BIT_ULL(42)
+#define ECAP_DIT_MASK		BIT_ULL(41)
+#define ECAP_PASID_MASK		BIT_ULL(40)
+#define ECAP_PSS_MASK		GENMASK_ULL(39, 35)
+#define ECAP_EAFS_MASK		BIT_ULL(34)
+#define ECAP_NWFS_MASK		BIT_ULL(33)
+#define ECAP_SRS_MASK		BIT_ULL(31)
+#define ECAP_ERS_MASK		BIT_ULL(30)
+#define ECAP_PRS_MASK		BIT_ULL(29)
+#define ECAP_NEST_MASK		BIT_ULL(26)
+#define ECAP_MTS_MASK		BIT_ULL(25)
+#define ECAP_MHMV_MASK		GENMASK_ULL(23, 20)
+#define ECAP_IRO_MASK		GENMASK_ULL(17, 8)
+#define ECAP_SC_MASK		BIT_ULL(7)
+#define ECAP_PT_MASK		BIT_ULL(6)
+#define ECAP_EIM_MASK		BIT_ULL(4)
+#define ECAP_DT_MASK		BIT_ULL(2)
+#define ECAP_QI_MASK		BIT_ULL(1)
+#define ECAP_C_MASK		BIT_ULL(0)
+
+/*
+ * u64 intel_iommu_cap_sanity, intel_iommu_ecap_sanity will be adjusted as each
+ * IOMMU gets audited.
+ */
+#define DO_CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \
+do { \
+	if (cap##_##feature(a) != cap##_##feature(b)) { \
+		intel_iommu_##cap##_sanity &= ~(MASK); \
+		pr_info("IOMMU feature %s inconsistent", #feature); \
+	} \
+} while (0)
+
+#define CHECK_FEATURE_MISMATCH(a, b, cap, feature, MASK) \
+	DO_CHECK_FEATURE_MISMATCH((a)->cap, (b)->cap, cap, feature, MASK)
+
+#define CHECK_FEATURE_MISMATCH_HOTPLUG(b, cap, feature, MASK) \
+do { \
+	if (cap##_##feature(intel_iommu_##cap##_sanity)) \
+		DO_CHECK_FEATURE_MISMATCH(intel_iommu_##cap##_sanity, \
+					  (b)->cap, cap, feature, MASK); \
+} while (0)
+
+#define MINIMAL_FEATURE_IOMMU(iommu, cap, MASK) \
+do { \
+	u64 min_feature = intel_iommu_##cap##_sanity & (MASK); \
+	min_feature = min_t(u64, min_feature, (iommu)->cap & (MASK)); \
+	intel_iommu_##cap##_sanity = (intel_iommu_##cap##_sanity & ~(MASK)) | \
+				     min_feature; \
+} while (0)
+
+#define MINIMAL_FEATURE_HOTPLUG(iommu, cap, feature, MASK, mismatch) \
+do { \
+	if ((intel_iommu_##cap##_sanity & (MASK)) > \
+	    (cap##_##feature((iommu)->cap))) \
+		mismatch = true; \
+	else \
+		(iommu)->cap = ((iommu)->cap & ~(MASK)) | \
+		(intel_iommu_##cap##_sanity & (MASK)); \
+} while (0)
+
+enum cap_audit_type {
+	CAP_AUDIT_STATIC_DMAR,
+	CAP_AUDIT_STATIC_IRQR,
+	CAP_AUDIT_HOTPLUG_DMAR,
+	CAP_AUDIT_HOTPLUG_IRQR,
+};
+
+int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu);
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 87d0ef79ba6a..de82d5da01f9 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -47,6 +47,7 @@
 
 #include "../irq_remapping.h"
 #include "pasid.h"
+#include "cap_audit.h"
 
 #define ROOT_SIZE		VTD_PAGE_SIZE
 #define CONTEXT_SIZE		VTD_PAGE_SIZE
@@ -3206,6 +3207,10 @@ static int __init init_dmars(void)
 		goto error;
 	}
 
+	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
+	if (ret)
+		goto free_iommu;
+
 	for_each_iommu(iommu, drhd) {
 		if (drhd->ignored) {
 			iommu_disable_translation(iommu);
@@ -3757,6 +3762,10 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
 	if (g_iommus[iommu->seq_id])
 		return 0;
 
+	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
+	if (ret)
+		goto out;
+
 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
 		pr_warn("%s: Doesn't support hardware pass through.\n",
 			iommu->name);
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 685200a5cff0..611ef5243cb6 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -22,6 +22,7 @@
 #include <asm/pci-direct.h>
 
 #include "../irq_remapping.h"
+#include "cap_audit.h"
 
 enum irq_mode {
 	IRQ_REMAPPING,
@@ -734,6 +735,9 @@ static int __init intel_prepare_irq_remapping(void)
 	if (dmar_table_init() < 0)
 		return -ENODEV;
 
+	if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL))
+		goto error;
+
 	if (!dmar_ir_support())
 		return -ENODEV;
 
@@ -1439,6 +1443,10 @@ static int dmar_ir_add(struct dmar_drhd_unit *dmaru, struct intel_iommu *iommu)
 	int ret;
 	int eim = x2apic_enabled();
 
+	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_IRQR, iommu);
+	if (ret)
+		return ret;
+
 	if (eim && !ecap_eim_support(iommu->ecap)) {
 		pr_info("DRHD %Lx: EIM not supported by DRHD, ecap %Lx\n",
 			iommu->reg_phys, iommu->ecap);
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index f8ab154c979d..1bc46b88711a 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -170,34 +170,37 @@
  * Extended Capability Register
  */
 
+#define	ecap_rps(e)		(((e) >> 49) & 0x1)
 #define ecap_smpwc(e)		(((e) >> 48) & 0x1)
 #define ecap_flts(e)		(((e) >> 47) & 0x1)
 #define ecap_slts(e)		(((e) >> 46) & 0x1)
+#define ecap_slads(e)		(((e) >> 45) & 0x1)
 #define ecap_vcs(e)		(((e) >> 44) & 0x1)
 #define ecap_smts(e)		(((e) >> 43) & 0x1)
-#define ecap_dit(e)		((e >> 41) & 0x1)
-#define ecap_pasid(e)		((e >> 40) & 0x1)
-#define ecap_pss(e)		((e >> 35) & 0x1f)
-#define ecap_eafs(e)		((e >> 34) & 0x1)
-#define ecap_nwfs(e)		((e >> 33) & 0x1)
-#define ecap_srs(e)		((e >> 31) & 0x1)
-#define ecap_ers(e)		((e >> 30) & 0x1)
-#define ecap_prs(e)		((e >> 29) & 0x1)
-#define ecap_broken_pasid(e)	((e >> 28) & 0x1)
-#define ecap_dis(e)		((e >> 27) & 0x1)
-#define ecap_nest(e)		((e >> 26) & 0x1)
-#define ecap_mts(e)		((e >> 25) & 0x1)
-#define ecap_ecs(e)		((e >> 24) & 0x1)
+#define ecap_dit(e)		(((e) >> 41) & 0x1)
+#define ecap_pds(e)		(((e) >> 42) & 0x1)
+#define ecap_pasid(e)		(((e) >> 40) & 0x1)
+#define ecap_pss(e)		(((e) >> 35) & 0x1f)
+#define ecap_eafs(e)		(((e) >> 34) & 0x1)
+#define ecap_nwfs(e)		(((e) >> 33) & 0x1)
+#define ecap_srs(e)		(((e) >> 31) & 0x1)
+#define ecap_ers(e)		(((e) >> 30) & 0x1)
+#define ecap_prs(e)		(((e) >> 29) & 0x1)
+#define ecap_broken_pasid(e)	(((e) >> 28) & 0x1)
+#define ecap_dis(e)		(((e) >> 27) & 0x1)
+#define ecap_nest(e)		(((e) >> 26) & 0x1)
+#define ecap_mts(e)		(((e) >> 25) & 0x1)
+#define ecap_ecs(e)		(((e) >> 24) & 0x1)
 #define ecap_iotlb_offset(e) 	((((e) >> 8) & 0x3ff) * 16)
 #define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16)
 #define ecap_coherent(e)	((e) & 0x1)
 #define ecap_qis(e)		((e) & 0x2)
-#define ecap_pass_through(e)	((e >> 6) & 0x1)
-#define ecap_eim_support(e)	((e >> 4) & 0x1)
-#define ecap_ir_support(e)	((e >> 3) & 0x1)
+#define ecap_pass_through(e)	(((e) >> 6) & 0x1)
+#define ecap_eim_support(e)	(((e) >> 4) & 0x1)
+#define ecap_ir_support(e)	(((e) >> 3) & 0x1)
 #define ecap_dev_iotlb_support(e)	(((e) >> 2) & 0x1)
-#define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
-#define ecap_sc_support(e)	((e >> 7) & 0x1) /* Snooping Control */
+#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf)
+#define ecap_sc_support(e)	(((e) >> 7) & 0x1) /* Snooping Control */
 
 /* Virtual command interface capability */
 #define vccap_pasid(v)		(((v) & DMA_VCS_PAS)) /* PASID allocation */
-- 
cgit v1.2.3


From 31a75cbbb9274cf8185f402904bf11386917870b Mon Sep 17 00:00:00 2001
From: Yian Chen <yian.chen@intel.com>
Date: Thu, 4 Feb 2021 09:44:00 +0800
Subject: iommu/vt-d: Parse SATC reporting structure

Software should parse every SATC table and all devices in the tables
reported by the BIOS and keep the information in kernel list for further
reference.

Signed-off-by: Yian Chen <yian.chen@intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210203093329.1617808-1-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/r/20210204014401.2846425-7-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/dmar.c  |  8 ++++
 drivers/iommu/intel/iommu.c | 89 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dmar.h        |  2 +
 3 files changed, 99 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index bd51f33642e0..6b8fce1c85c4 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -526,6 +526,7 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
 	struct acpi_dmar_reserved_memory *rmrr;
 	struct acpi_dmar_atsr *atsr;
 	struct acpi_dmar_rhsa *rhsa;
+	struct acpi_dmar_satc *satc;
 
 	switch (header->type) {
 	case ACPI_DMAR_TYPE_HARDWARE_UNIT:
@@ -555,6 +556,10 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
 		/* We don't print this here because we need to sanity-check
 		   it first. So print it in dmar_parse_one_andd() instead. */
 		break;
+	case ACPI_DMAR_TYPE_SATC:
+		satc = container_of(header, struct acpi_dmar_satc, header);
+		pr_info("SATC flags: 0x%x\n", satc->flags);
+		break;
 	}
 }
 
@@ -642,6 +647,7 @@ parse_dmar_table(void)
 		.cb[ACPI_DMAR_TYPE_ROOT_ATS] = &dmar_parse_one_atsr,
 		.cb[ACPI_DMAR_TYPE_HARDWARE_AFFINITY] = &dmar_parse_one_rhsa,
 		.cb[ACPI_DMAR_TYPE_NAMESPACE] = &dmar_parse_one_andd,
+		.cb[ACPI_DMAR_TYPE_SATC] = &dmar_parse_one_satc,
 	};
 
 	/*
@@ -2077,6 +2083,7 @@ static guid_t dmar_hp_guid =
 #define	DMAR_DSM_FUNC_DRHD		1
 #define	DMAR_DSM_FUNC_ATSR		2
 #define	DMAR_DSM_FUNC_RHSA		3
+#define	DMAR_DSM_FUNC_SATC		4
 
 static inline bool dmar_detect_dsm(acpi_handle handle, int func)
 {
@@ -2094,6 +2101,7 @@ static int dmar_walk_dsm_resource(acpi_handle handle, int func,
 		[DMAR_DSM_FUNC_DRHD] = ACPI_DMAR_TYPE_HARDWARE_UNIT,
 		[DMAR_DSM_FUNC_ATSR] = ACPI_DMAR_TYPE_ROOT_ATS,
 		[DMAR_DSM_FUNC_RHSA] = ACPI_DMAR_TYPE_HARDWARE_AFFINITY,
+		[DMAR_DSM_FUNC_SATC] = ACPI_DMAR_TYPE_SATC,
 	};
 
 	if (!dmar_detect_dsm(handle, func))
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 761c1b251c1e..384cc546632a 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -316,8 +316,18 @@ struct dmar_atsr_unit {
 	u8 include_all:1;		/* include all ports */
 };
 
+struct dmar_satc_unit {
+	struct list_head list;		/* list of SATC units */
+	struct acpi_dmar_header *hdr;	/* ACPI header */
+	struct dmar_dev_scope *devices;	/* target devices */
+	struct intel_iommu *iommu;	/* the corresponding iommu */
+	int devices_cnt;		/* target device count */
+	u8 atc_required:1;		/* ATS is required */
+};
+
 static LIST_HEAD(dmar_atsr_units);
 static LIST_HEAD(dmar_rmrr_units);
+static LIST_HEAD(dmar_satc_units);
 
 #define for_each_rmrr_units(rmrr) \
 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
@@ -3716,6 +3726,57 @@ int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
 	return 0;
 }
 
+static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
+{
+	struct dmar_satc_unit *satcu;
+	struct acpi_dmar_satc *tmp;
+
+	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
+				dmar_rcu_check()) {
+		tmp = (struct acpi_dmar_satc *)satcu->hdr;
+		if (satc->segment != tmp->segment)
+			continue;
+		if (satc->header.length != tmp->header.length)
+			continue;
+		if (memcmp(satc, tmp, satc->header.length) == 0)
+			return satcu;
+	}
+
+	return NULL;
+}
+
+int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
+{
+	struct acpi_dmar_satc *satc;
+	struct dmar_satc_unit *satcu;
+
+	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
+		return 0;
+
+	satc = container_of(hdr, struct acpi_dmar_satc, header);
+	satcu = dmar_find_satc(satc);
+	if (satcu)
+		return 0;
+
+	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
+	if (!satcu)
+		return -ENOMEM;
+
+	satcu->hdr = (void *)(satcu + 1);
+	memcpy(satcu->hdr, hdr, hdr->length);
+	satcu->atc_required = satc->flags & 0x1;
+	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
+					      (void *)satc + satc->header.length,
+					      &satcu->devices_cnt);
+	if (satcu->devices_cnt && !satcu->devices) {
+		kfree(satcu);
+		return -ENOMEM;
+	}
+	list_add_rcu(&satcu->list, &dmar_satc_units);
+
+	return 0;
+}
+
 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
 {
 	int sp, ret;
@@ -3823,6 +3884,7 @@ static void intel_iommu_free_dmars(void)
 {
 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
 	struct dmar_atsr_unit *atsru, *atsr_n;
+	struct dmar_satc_unit *satcu, *satc_n;
 
 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
 		list_del(&rmrru->list);
@@ -3834,6 +3896,11 @@ static void intel_iommu_free_dmars(void)
 		list_del(&atsru->list);
 		intel_iommu_free_atsr(atsru);
 	}
+	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
+		list_del(&satcu->list);
+		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
+		kfree(satcu);
+	}
 }
 
 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
@@ -3885,8 +3952,10 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
 	int ret;
 	struct dmar_rmrr_unit *rmrru;
 	struct dmar_atsr_unit *atsru;
+	struct dmar_satc_unit *satcu;
 	struct acpi_dmar_atsr *atsr;
 	struct acpi_dmar_reserved_memory *rmrr;
+	struct acpi_dmar_satc *satc;
 
 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
 		return 0;
@@ -3927,6 +3996,23 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
 				break;
 		}
 	}
+	list_for_each_entry(satcu, &dmar_satc_units, list) {
+		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
+		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
+			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
+					(void *)satc + satc->header.length,
+					satc->segment, satcu->devices,
+					satcu->devices_cnt);
+			if (ret > 0)
+				break;
+			else if (ret < 0)
+				return ret;
+		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
+			if (dmar_remove_dev_scope(info, satc->segment,
+					satcu->devices, satcu->devices_cnt))
+				break;
+		}
+	}
 
 	return 0;
 }
@@ -4270,6 +4356,9 @@ int __init intel_iommu_init(void)
 	if (list_empty(&dmar_atsr_units))
 		pr_info("No ATSR found\n");
 
+	if (list_empty(&dmar_satc_units))
+		pr_info("No SATC found\n");
+
 	if (dmar_map_gfx)
 		intel_iommu_gfx_mapped = 1;
 
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 65565820328a..e04436a7ff27 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -138,6 +138,7 @@ extern void intel_iommu_shutdown(void);
 extern int dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg);
 extern int dmar_parse_one_atsr(struct acpi_dmar_header *header, void *arg);
 extern int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg);
+extern int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg);
 extern int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg);
 extern int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert);
 extern int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info);
@@ -149,6 +150,7 @@ static inline void intel_iommu_shutdown(void) { }
 #define	dmar_parse_one_atsr		dmar_res_noop
 #define	dmar_check_one_atsr		dmar_res_noop
 #define	dmar_release_one_atsr		dmar_res_noop
+#define	dmar_parse_one_satc		dmar_res_noop
 
 static inline int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
 {
-- 
cgit v1.2.3


From e1e4f7fea37572f0ccf3887430e52c491e9accb6 Mon Sep 17 00:00:00 2001
From: CK Hu <ck.hu@mediatek.com>
Date: Tue, 21 Jul 2020 15:46:06 +0800
Subject: soc / drm: mediatek: Move mtk mutex driver to soc folder

mtk mutex is used by DRM and MDP driver, and its function is SoC-specific,
so move it to soc folder.

Signed-off-by: CK Hu <ck.hu@mediatek.com>
Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Acked-by: Matthias Brugger <matthias.bgg@gmail.com>
---
 drivers/gpu/drm/mediatek/Makefile       |   3 +-
 drivers/gpu/drm/mediatek/mtk_drm_crtc.c |   2 +-
 drivers/gpu/drm/mediatek/mtk_drm_drv.c  |   1 -
 drivers/gpu/drm/mediatek/mtk_drm_drv.h  |   1 -
 drivers/gpu/drm/mediatek/mtk_mutex.c    | 463 -------------------------------
 drivers/gpu/drm/mediatek/mtk_mutex.h    |  26 --
 drivers/soc/mediatek/Makefile           |   1 +
 drivers/soc/mediatek/mtk-mutex.c        | 474 ++++++++++++++++++++++++++++++++
 include/linux/soc/mediatek/mtk-mutex.h  |  26 ++
 9 files changed, 503 insertions(+), 494 deletions(-)
 delete mode 100644 drivers/gpu/drm/mediatek/mtk_mutex.c
 delete mode 100644 drivers/gpu/drm/mediatek/mtk_mutex.h
 create mode 100644 drivers/soc/mediatek/mtk-mutex.c
 create mode 100644 include/linux/soc/mediatek/mtk-mutex.h

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/mediatek/Makefile b/drivers/gpu/drm/mediatek/Makefile
index 09979c4c340a..01d06332f767 100644
--- a/drivers/gpu/drm/mediatek/Makefile
+++ b/drivers/gpu/drm/mediatek/Makefile
@@ -9,8 +9,7 @@ mediatek-drm-y := mtk_disp_color.o \
 		  mtk_drm_gem.o \
 		  mtk_drm_plane.o \
 		  mtk_dsi.o \
-		  mtk_dpi.o \
-		  mtk_mutex.o
+		  mtk_dpi.o
 
 obj-$(CONFIG_DRM_MEDIATEK) += mediatek-drm.o
 
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
index 1e3a9450680b..e9b6788d52cd 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
@@ -7,6 +7,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/soc/mediatek/mtk-cmdq.h>
 #include <linux/soc/mediatek/mtk-mmsys.h>
+#include <linux/soc/mediatek/mtk-mutex.h>
 
 #include <asm/barrier.h>
 #include <soc/mediatek/smi.h>
@@ -22,7 +23,6 @@
 #include "mtk_drm_ddp_comp.h"
 #include "mtk_drm_gem.h"
 #include "mtk_drm_plane.h"
-#include "mtk_mutex.h"
 
 /*
  * struct mtk_drm_crtc - MediaTek specific crtc structure.
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c
index b99a06e6834e..5d39dd54255d 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c
@@ -588,7 +588,6 @@ static struct platform_driver mtk_drm_platform_driver = {
 };
 
 static struct platform_driver * const mtk_drm_drivers[] = {
-	&mtk_mutex_driver,
 	&mtk_disp_color_driver,
 	&mtk_disp_ovl_driver,
 	&mtk_disp_rdma_driver,
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.h b/drivers/gpu/drm/mediatek/mtk_drm_drv.h
index ae366868d01a..e8238fa4aa2a 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_drv.h
+++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.h
@@ -46,7 +46,6 @@ struct mtk_drm_private {
 	struct drm_atomic_state *suspend_state;
 };
 
-extern struct platform_driver mtk_mutex_driver;
 extern struct platform_driver mtk_disp_color_driver;
 extern struct platform_driver mtk_disp_ovl_driver;
 extern struct platform_driver mtk_disp_rdma_driver;
diff --git a/drivers/gpu/drm/mediatek/mtk_mutex.c b/drivers/gpu/drm/mediatek/mtk_mutex.c
deleted file mode 100644
index 66344759e622..000000000000
--- a/drivers/gpu/drm/mediatek/mtk_mutex.c
+++ /dev/null
@@ -1,463 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 2015 MediaTek Inc.
- */
-
-#include <linux/clk.h>
-#include <linux/iopoll.h>
-#include <linux/module.h>
-#include <linux/of_device.h>
-#include <linux/platform_device.h>
-#include <linux/regmap.h>
-#include <linux/soc/mediatek/mtk-mmsys.h>
-
-#include "mtk_mutex.h"
-
-#define MT2701_MUTEX0_MOD0			0x2c
-#define MT2701_MUTEX0_SOF0			0x30
-
-#define DISP_REG_MUTEX_EN(n)			(0x20 + 0x20 * (n))
-#define DISP_REG_MUTEX(n)			(0x24 + 0x20 * (n))
-#define DISP_REG_MUTEX_RST(n)			(0x28 + 0x20 * (n))
-#define DISP_REG_MUTEX_MOD(mutex_mod_reg, n)	(mutex_mod_reg + 0x20 * (n))
-#define DISP_REG_MUTEX_SOF(mutex_sof_reg, n)	(mutex_sof_reg + 0x20 * (n))
-#define DISP_REG_MUTEX_MOD2(n)			(0x34 + 0x20 * (n))
-
-#define INT_MUTEX				BIT(1)
-
-#define MT8167_MUTEX_MOD_DISP_PWM		1
-#define MT8167_MUTEX_MOD_DISP_OVL0		6
-#define MT8167_MUTEX_MOD_DISP_OVL1		7
-#define MT8167_MUTEX_MOD_DISP_RDMA0		8
-#define MT8167_MUTEX_MOD_DISP_RDMA1		9
-#define MT8167_MUTEX_MOD_DISP_WDMA0		10
-#define MT8167_MUTEX_MOD_DISP_CCORR		11
-#define MT8167_MUTEX_MOD_DISP_COLOR		12
-#define MT8167_MUTEX_MOD_DISP_AAL		13
-#define MT8167_MUTEX_MOD_DISP_GAMMA		14
-#define MT8167_MUTEX_MOD_DISP_DITHER		15
-#define MT8167_MUTEX_MOD_DISP_UFOE		16
-
-#define MT8173_MUTEX_MOD_DISP_OVL0		11
-#define MT8173_MUTEX_MOD_DISP_OVL1		12
-#define MT8173_MUTEX_MOD_DISP_RDMA0		13
-#define MT8173_MUTEX_MOD_DISP_RDMA1		14
-#define MT8173_MUTEX_MOD_DISP_RDMA2		15
-#define MT8173_MUTEX_MOD_DISP_WDMA0		16
-#define MT8173_MUTEX_MOD_DISP_WDMA1		17
-#define MT8173_MUTEX_MOD_DISP_COLOR0		18
-#define MT8173_MUTEX_MOD_DISP_COLOR1		19
-#define MT8173_MUTEX_MOD_DISP_AAL		20
-#define MT8173_MUTEX_MOD_DISP_GAMMA		21
-#define MT8173_MUTEX_MOD_DISP_UFOE		22
-#define MT8173_MUTEX_MOD_DISP_PWM0		23
-#define MT8173_MUTEX_MOD_DISP_PWM1		24
-#define MT8173_MUTEX_MOD_DISP_OD		25
-
-#define MT2712_MUTEX_MOD_DISP_PWM2		10
-#define MT2712_MUTEX_MOD_DISP_OVL0		11
-#define MT2712_MUTEX_MOD_DISP_OVL1		12
-#define MT2712_MUTEX_MOD_DISP_RDMA0		13
-#define MT2712_MUTEX_MOD_DISP_RDMA1		14
-#define MT2712_MUTEX_MOD_DISP_RDMA2		15
-#define MT2712_MUTEX_MOD_DISP_WDMA0		16
-#define MT2712_MUTEX_MOD_DISP_WDMA1		17
-#define MT2712_MUTEX_MOD_DISP_COLOR0		18
-#define MT2712_MUTEX_MOD_DISP_COLOR1		19
-#define MT2712_MUTEX_MOD_DISP_AAL0		20
-#define MT2712_MUTEX_MOD_DISP_UFOE		22
-#define MT2712_MUTEX_MOD_DISP_PWM0		23
-#define MT2712_MUTEX_MOD_DISP_PWM1		24
-#define MT2712_MUTEX_MOD_DISP_OD0		25
-#define MT2712_MUTEX_MOD2_DISP_AAL1		33
-#define MT2712_MUTEX_MOD2_DISP_OD1		34
-
-#define MT2701_MUTEX_MOD_DISP_OVL		3
-#define MT2701_MUTEX_MOD_DISP_WDMA		6
-#define MT2701_MUTEX_MOD_DISP_COLOR		7
-#define MT2701_MUTEX_MOD_DISP_BLS		9
-#define MT2701_MUTEX_MOD_DISP_RDMA0		10
-#define MT2701_MUTEX_MOD_DISP_RDMA1		12
-
-#define MT2712_MUTEX_SOF_SINGLE_MODE		0
-#define MT2712_MUTEX_SOF_DSI0			1
-#define MT2712_MUTEX_SOF_DSI1			2
-#define MT2712_MUTEX_SOF_DPI0			3
-#define MT2712_MUTEX_SOF_DPI1			4
-#define MT2712_MUTEX_SOF_DSI2			5
-#define MT2712_MUTEX_SOF_DSI3			6
-#define MT8167_MUTEX_SOF_DPI0			2
-#define MT8167_MUTEX_SOF_DPI1			3
-
-struct mtk_mutex {
-	int id;
-	bool claimed;
-};
-
-enum mtk_mutex_sof_id {
-	MUTEX_SOF_SINGLE_MODE,
-	MUTEX_SOF_DSI0,
-	MUTEX_SOF_DSI1,
-	MUTEX_SOF_DPI0,
-	MUTEX_SOF_DPI1,
-	MUTEX_SOF_DSI2,
-	MUTEX_SOF_DSI3,
-};
-
-struct mtk_mutex_data {
-	const unsigned int *mutex_mod;
-	const unsigned int *mutex_sof;
-	const unsigned int mutex_mod_reg;
-	const unsigned int mutex_sof_reg;
-	const bool no_clk;
-};
-
-struct mtk_mutex_ctx {
-	struct device			*dev;
-	struct clk			*clk;
-	void __iomem			*regs;
-	struct mtk_mutex		mutex[10];
-	const struct mtk_mutex_data	*data;
-};
-
-static const unsigned int mt2701_mutex_mod[DDP_COMPONENT_ID_MAX] = {
-	[DDP_COMPONENT_BLS] = MT2701_MUTEX_MOD_DISP_BLS,
-	[DDP_COMPONENT_COLOR0] = MT2701_MUTEX_MOD_DISP_COLOR,
-	[DDP_COMPONENT_OVL0] = MT2701_MUTEX_MOD_DISP_OVL,
-	[DDP_COMPONENT_RDMA0] = MT2701_MUTEX_MOD_DISP_RDMA0,
-	[DDP_COMPONENT_RDMA1] = MT2701_MUTEX_MOD_DISP_RDMA1,
-	[DDP_COMPONENT_WDMA0] = MT2701_MUTEX_MOD_DISP_WDMA,
-};
-
-static const unsigned int mt2712_mutex_mod[DDP_COMPONENT_ID_MAX] = {
-	[DDP_COMPONENT_AAL0] = MT2712_MUTEX_MOD_DISP_AAL0,
-	[DDP_COMPONENT_AAL1] = MT2712_MUTEX_MOD2_DISP_AAL1,
-	[DDP_COMPONENT_COLOR0] = MT2712_MUTEX_MOD_DISP_COLOR0,
-	[DDP_COMPONENT_COLOR1] = MT2712_MUTEX_MOD_DISP_COLOR1,
-	[DDP_COMPONENT_OD0] = MT2712_MUTEX_MOD_DISP_OD0,
-	[DDP_COMPONENT_OD1] = MT2712_MUTEX_MOD2_DISP_OD1,
-	[DDP_COMPONENT_OVL0] = MT2712_MUTEX_MOD_DISP_OVL0,
-	[DDP_COMPONENT_OVL1] = MT2712_MUTEX_MOD_DISP_OVL1,
-	[DDP_COMPONENT_PWM0] = MT2712_MUTEX_MOD_DISP_PWM0,
-	[DDP_COMPONENT_PWM1] = MT2712_MUTEX_MOD_DISP_PWM1,
-	[DDP_COMPONENT_PWM2] = MT2712_MUTEX_MOD_DISP_PWM2,
-	[DDP_COMPONENT_RDMA0] = MT2712_MUTEX_MOD_DISP_RDMA0,
-	[DDP_COMPONENT_RDMA1] = MT2712_MUTEX_MOD_DISP_RDMA1,
-	[DDP_COMPONENT_RDMA2] = MT2712_MUTEX_MOD_DISP_RDMA2,
-	[DDP_COMPONENT_UFOE] = MT2712_MUTEX_MOD_DISP_UFOE,
-	[DDP_COMPONENT_WDMA0] = MT2712_MUTEX_MOD_DISP_WDMA0,
-	[DDP_COMPONENT_WDMA1] = MT2712_MUTEX_MOD_DISP_WDMA1,
-};
-
-static const unsigned int mt8167_mutex_mod[DDP_COMPONENT_ID_MAX] = {
-	[DDP_COMPONENT_AAL0] = MT8167_MUTEX_MOD_DISP_AAL,
-	[DDP_COMPONENT_CCORR] = MT8167_MUTEX_MOD_DISP_CCORR,
-	[DDP_COMPONENT_COLOR0] = MT8167_MUTEX_MOD_DISP_COLOR,
-	[DDP_COMPONENT_DITHER] = MT8167_MUTEX_MOD_DISP_DITHER,
-	[DDP_COMPONENT_GAMMA] = MT8167_MUTEX_MOD_DISP_GAMMA,
-	[DDP_COMPONENT_OVL0] = MT8167_MUTEX_MOD_DISP_OVL0,
-	[DDP_COMPONENT_OVL1] = MT8167_MUTEX_MOD_DISP_OVL1,
-	[DDP_COMPONENT_PWM0] = MT8167_MUTEX_MOD_DISP_PWM,
-	[DDP_COMPONENT_RDMA0] = MT8167_MUTEX_MOD_DISP_RDMA0,
-	[DDP_COMPONENT_RDMA1] = MT8167_MUTEX_MOD_DISP_RDMA1,
-	[DDP_COMPONENT_UFOE] = MT8167_MUTEX_MOD_DISP_UFOE,
-	[DDP_COMPONENT_WDMA0] = MT8167_MUTEX_MOD_DISP_WDMA0,
-};
-
-static const unsigned int mt8173_mutex_mod[DDP_COMPONENT_ID_MAX] = {
-	[DDP_COMPONENT_AAL0] = MT8173_MUTEX_MOD_DISP_AAL,
-	[DDP_COMPONENT_COLOR0] = MT8173_MUTEX_MOD_DISP_COLOR0,
-	[DDP_COMPONENT_COLOR1] = MT8173_MUTEX_MOD_DISP_COLOR1,
-	[DDP_COMPONENT_GAMMA] = MT8173_MUTEX_MOD_DISP_GAMMA,
-	[DDP_COMPONENT_OD0] = MT8173_MUTEX_MOD_DISP_OD,
-	[DDP_COMPONENT_OVL0] = MT8173_MUTEX_MOD_DISP_OVL0,
-	[DDP_COMPONENT_OVL1] = MT8173_MUTEX_MOD_DISP_OVL1,
-	[DDP_COMPONENT_PWM0] = MT8173_MUTEX_MOD_DISP_PWM0,
-	[DDP_COMPONENT_PWM1] = MT8173_MUTEX_MOD_DISP_PWM1,
-	[DDP_COMPONENT_RDMA0] = MT8173_MUTEX_MOD_DISP_RDMA0,
-	[DDP_COMPONENT_RDMA1] = MT8173_MUTEX_MOD_DISP_RDMA1,
-	[DDP_COMPONENT_RDMA2] = MT8173_MUTEX_MOD_DISP_RDMA2,
-	[DDP_COMPONENT_UFOE] = MT8173_MUTEX_MOD_DISP_UFOE,
-	[DDP_COMPONENT_WDMA0] = MT8173_MUTEX_MOD_DISP_WDMA0,
-	[DDP_COMPONENT_WDMA1] = MT8173_MUTEX_MOD_DISP_WDMA1,
-};
-
-static const unsigned int mt2712_mutex_sof[MUTEX_SOF_DSI3 + 1] = {
-	[MUTEX_SOF_SINGLE_MODE] = MUTEX_SOF_SINGLE_MODE,
-	[MUTEX_SOF_DSI0] = MUTEX_SOF_DSI0,
-	[MUTEX_SOF_DSI1] = MUTEX_SOF_DSI1,
-	[MUTEX_SOF_DPI0] = MUTEX_SOF_DPI0,
-	[MUTEX_SOF_DPI1] = MUTEX_SOF_DPI1,
-	[MUTEX_SOF_DSI2] = MUTEX_SOF_DSI2,
-	[MUTEX_SOF_DSI3] = MUTEX_SOF_DSI3,
-};
-
-static const unsigned int mt8167_mutex_sof[MUTEX_SOF_DSI3 + 1] = {
-	[MUTEX_SOF_SINGLE_MODE] = MUTEX_SOF_SINGLE_MODE,
-	[MUTEX_SOF_DSI0] = MUTEX_SOF_DSI0,
-	[MUTEX_SOF_DPI0] = MT8167_MUTEX_SOF_DPI0,
-	[MUTEX_SOF_DPI1] = MT8167_MUTEX_SOF_DPI1,
-};
-
-static const struct mtk_mutex_data mt2701_mutex_driver_data = {
-	.mutex_mod = mt2701_mutex_mod,
-	.mutex_sof = mt2712_mutex_sof,
-	.mutex_mod_reg = MT2701_MUTEX0_MOD0,
-	.mutex_sof_reg = MT2701_MUTEX0_SOF0,
-};
-
-static const struct mtk_mutex_data mt2712_mutex_driver_data = {
-	.mutex_mod = mt2712_mutex_mod,
-	.mutex_sof = mt2712_mutex_sof,
-	.mutex_mod_reg = MT2701_MUTEX0_MOD0,
-	.mutex_sof_reg = MT2701_MUTEX0_SOF0,
-};
-
-static const struct mtk_mutex_data mt8167_mutex_driver_data = {
-	.mutex_mod = mt8167_mutex_mod,
-	.mutex_sof = mt8167_mutex_sof,
-	.mutex_mod_reg = MT2701_MUTEX0_MOD0,
-	.mutex_sof_reg = MT2701_MUTEX0_SOF0,
-	.no_clk = true,
-};
-
-static const struct mtk_mutex_data mt8173_mutex_driver_data = {
-	.mutex_mod = mt8173_mutex_mod,
-	.mutex_sof = mt2712_mutex_sof,
-	.mutex_mod_reg = MT2701_MUTEX0_MOD0,
-	.mutex_sof_reg = MT2701_MUTEX0_SOF0,
-};
-
-struct mtk_mutex *mtk_mutex_get(struct device *dev)
-{
-	struct mtk_mutex_ctx *mtx = dev_get_drvdata(dev);
-	int i;
-
-	for (i = 0; i < 10; i++)
-		if (!mtx->mutex[i].claimed) {
-			mtx->mutex[i].claimed = true;
-			return &mtx->mutex[i];
-		}
-
-	return ERR_PTR(-EBUSY);
-}
-
-void mtk_mutex_put(struct mtk_mutex *mutex)
-{
-	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
-						 mutex[mutex->id]);
-
-	WARN_ON(&mtx->mutex[mutex->id] != mutex);
-
-	mutex->claimed = false;
-}
-
-int mtk_mutex_prepare(struct mtk_mutex *mutex)
-{
-	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
-						 mutex[mutex->id]);
-	return clk_prepare_enable(mtx->clk);
-}
-
-void mtk_mutex_unprepare(struct mtk_mutex *mutex)
-{
-	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
-						 mutex[mutex->id]);
-	clk_disable_unprepare(mtx->clk);
-}
-
-void mtk_mutex_add_comp(struct mtk_mutex *mutex,
-			enum mtk_ddp_comp_id id)
-{
-	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
-						 mutex[mutex->id]);
-	unsigned int reg;
-	unsigned int sof_id;
-	unsigned int offset;
-
-	WARN_ON(&mtx->mutex[mutex->id] != mutex);
-
-	switch (id) {
-	case DDP_COMPONENT_DSI0:
-		sof_id = MUTEX_SOF_DSI0;
-		break;
-	case DDP_COMPONENT_DSI1:
-		sof_id = MUTEX_SOF_DSI0;
-		break;
-	case DDP_COMPONENT_DSI2:
-		sof_id = MUTEX_SOF_DSI2;
-		break;
-	case DDP_COMPONENT_DSI3:
-		sof_id = MUTEX_SOF_DSI3;
-		break;
-	case DDP_COMPONENT_DPI0:
-		sof_id = MUTEX_SOF_DPI0;
-		break;
-	case DDP_COMPONENT_DPI1:
-		sof_id = MUTEX_SOF_DPI1;
-		break;
-	default:
-		if (mtx->data->mutex_mod[id] < 32) {
-			offset = DISP_REG_MUTEX_MOD(mtx->data->mutex_mod_reg,
-						    mutex->id);
-			reg = readl_relaxed(mtx->regs + offset);
-			reg |= 1 << mtx->data->mutex_mod[id];
-			writel_relaxed(reg, mtx->regs + offset);
-		} else {
-			offset = DISP_REG_MUTEX_MOD2(mutex->id);
-			reg = readl_relaxed(mtx->regs + offset);
-			reg |= 1 << (mtx->data->mutex_mod[id] - 32);
-			writel_relaxed(reg, mtx->regs + offset);
-		}
-		return;
-	}
-
-	writel_relaxed(mtx->data->mutex_sof[sof_id],
-		       mtx->regs +
-		       DISP_REG_MUTEX_SOF(mtx->data->mutex_sof_reg, mutex->id));
-}
-
-void mtk_mutex_remove_comp(struct mtk_mutex *mutex,
-			   enum mtk_ddp_comp_id id)
-{
-	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
-						 mutex[mutex->id]);
-	unsigned int reg;
-	unsigned int offset;
-
-	WARN_ON(&mtx->mutex[mutex->id] != mutex);
-
-	switch (id) {
-	case DDP_COMPONENT_DSI0:
-	case DDP_COMPONENT_DSI1:
-	case DDP_COMPONENT_DSI2:
-	case DDP_COMPONENT_DSI3:
-	case DDP_COMPONENT_DPI0:
-	case DDP_COMPONENT_DPI1:
-		writel_relaxed(MUTEX_SOF_SINGLE_MODE,
-			       mtx->regs +
-			       DISP_REG_MUTEX_SOF(mtx->data->mutex_sof_reg,
-						  mutex->id));
-		break;
-	default:
-		if (mtx->data->mutex_mod[id] < 32) {
-			offset = DISP_REG_MUTEX_MOD(mtx->data->mutex_mod_reg,
-						    mutex->id);
-			reg = readl_relaxed(mtx->regs + offset);
-			reg &= ~(1 << mtx->data->mutex_mod[id]);
-			writel_relaxed(reg, mtx->regs + offset);
-		} else {
-			offset = DISP_REG_MUTEX_MOD2(mutex->id);
-			reg = readl_relaxed(mtx->regs + offset);
-			reg &= ~(1 << (mtx->data->mutex_mod[id] - 32));
-			writel_relaxed(reg, mtx->regs + offset);
-		}
-		break;
-	}
-}
-
-void mtk_mutex_enable(struct mtk_mutex *mutex)
-{
-	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
-						 mutex[mutex->id]);
-
-	WARN_ON(&mtx->mutex[mutex->id] != mutex);
-
-	writel(1, mtx->regs + DISP_REG_MUTEX_EN(mutex->id));
-}
-
-void mtk_mutex_disable(struct mtk_mutex *mutex)
-{
-	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
-						 mutex[mutex->id]);
-
-	WARN_ON(&mtx->mutex[mutex->id] != mutex);
-
-	writel(0, mtx->regs + DISP_REG_MUTEX_EN(mutex->id));
-}
-
-void mtk_mutex_acquire(struct mtk_mutex *mutex)
-{
-	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
-						 mutex[mutex->id]);
-	u32 tmp;
-
-	writel(1, mtx->regs + DISP_REG_MUTEX_EN(mutex->id));
-	writel(1, mtx->regs + DISP_REG_MUTEX(mutex->id));
-	if (readl_poll_timeout_atomic(mtx->regs + DISP_REG_MUTEX(mutex->id),
-				      tmp, tmp & INT_MUTEX, 1, 10000))
-		pr_err("could not acquire mutex %d\n", mutex->id);
-}
-
-void mtk_mutex_release(struct mtk_mutex *mutex)
-{
-	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
-						 mutex[mutex->id]);
-
-	writel(0, mtx->regs + DISP_REG_MUTEX(mutex->id));
-}
-
-static int mtk_mutex_probe(struct platform_device *pdev)
-{
-	struct device *dev = &pdev->dev;
-	struct mtk_mutex_ctx *mtx;
-	struct resource *regs;
-	int i;
-
-	mtx = devm_kzalloc(dev, sizeof(*mtx), GFP_KERNEL);
-	if (!mtx)
-		return -ENOMEM;
-
-	for (i = 0; i < 10; i++)
-		mtx->mutex[i].id = i;
-
-	mtx->data = of_device_get_match_data(dev);
-
-	if (!mtx->data->no_clk) {
-		mtx->clk = devm_clk_get(dev, NULL);
-		if (IS_ERR(mtx->clk)) {
-			if (PTR_ERR(mtx->clk) != -EPROBE_DEFER)
-				dev_err(dev, "Failed to get clock\n");
-			return PTR_ERR(mtx->clk);
-		}
-	}
-
-	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	mtx->regs = devm_ioremap_resource(dev, regs);
-	if (IS_ERR(mtx->regs)) {
-		dev_err(dev, "Failed to map mutex registers\n");
-		return PTR_ERR(mtx->regs);
-	}
-
-	platform_set_drvdata(pdev, mtx);
-
-	return 0;
-}
-
-static int mtk_mutex_remove(struct platform_device *pdev)
-{
-	return 0;
-}
-
-static const struct of_device_id mutex_driver_dt_match[] = {
-	{ .compatible = "mediatek,mt2701-disp-mutex",
-	  .data = &mt2701_mutex_driver_data},
-	{ .compatible = "mediatek,mt2712-disp-mutex",
-	  .data = &mt2712_mutex_driver_data},
-	{ .compatible = "mediatek,mt8167-disp-mutex",
-	  .data = &mt8167_mutex_driver_data},
-	{ .compatible = "mediatek,mt8173-disp-mutex",
-	  .data = &mt8173_mutex_driver_data},
-	{},
-};
-MODULE_DEVICE_TABLE(of, mutex_driver_dt_match);
-
-struct platform_driver mtk_mutex_driver = {
-	.probe		= mtk_mutex_probe,
-	.remove		= mtk_mutex_remove,
-	.driver		= {
-		.name	= "mediatek-mutex",
-		.owner	= THIS_MODULE,
-		.of_match_table = mutex_driver_dt_match,
-	},
-};
diff --git a/drivers/gpu/drm/mediatek/mtk_mutex.h b/drivers/gpu/drm/mediatek/mtk_mutex.h
deleted file mode 100644
index 6fe4ffbde290..000000000000
--- a/drivers/gpu/drm/mediatek/mtk_mutex.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (c) 2015 MediaTek Inc.
- */
-
-#ifndef MTK_MUTEX_H
-#define MTK_MUTEX_H
-
-struct regmap;
-struct device;
-struct mtk_mutex;
-
-struct mtk_mutex *mtk_mutex_get(struct device *dev);
-int mtk_mutex_prepare(struct mtk_mutex *mutex);
-void mtk_mutex_add_comp(struct mtk_mutex *mutex,
-			enum mtk_ddp_comp_id id);
-void mtk_mutex_enable(struct mtk_mutex *mutex);
-void mtk_mutex_disable(struct mtk_mutex *mutex);
-void mtk_mutex_remove_comp(struct mtk_mutex *mutex,
-			   enum mtk_ddp_comp_id id);
-void mtk_mutex_unprepare(struct mtk_mutex *mutex);
-void mtk_mutex_put(struct mtk_mutex *mutex);
-void mtk_mutex_acquire(struct mtk_mutex *mutex);
-void mtk_mutex_release(struct mtk_mutex *mutex);
-
-#endif /* MTK_MUTEX_H */
diff --git a/drivers/soc/mediatek/Makefile b/drivers/soc/mediatek/Makefile
index b6908db534c2..90270f8114ed 100644
--- a/drivers/soc/mediatek/Makefile
+++ b/drivers/soc/mediatek/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_MTK_PMIC_WRAP) += mtk-pmic-wrap.o
 obj-$(CONFIG_MTK_SCPSYS) += mtk-scpsys.o
 obj-$(CONFIG_MTK_SCPSYS_PM_DOMAINS) += mtk-pm-domains.o
 obj-$(CONFIG_MTK_MMSYS) += mtk-mmsys.o
+obj-$(CONFIG_MTK_MMSYS) += mtk-mutex.o
diff --git a/drivers/soc/mediatek/mtk-mutex.c b/drivers/soc/mediatek/mtk-mutex.c
new file mode 100644
index 000000000000..f531b119da7a
--- /dev/null
+++ b/drivers/soc/mediatek/mtk-mutex.c
@@ -0,0 +1,474 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ */
+
+#include <linux/clk.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/soc/mediatek/mtk-mmsys.h>
+#include <linux/soc/mediatek/mtk-mutex.h>
+
+#define MT2701_MUTEX0_MOD0			0x2c
+#define MT2701_MUTEX0_SOF0			0x30
+
+#define DISP_REG_MUTEX_EN(n)			(0x20 + 0x20 * (n))
+#define DISP_REG_MUTEX(n)			(0x24 + 0x20 * (n))
+#define DISP_REG_MUTEX_RST(n)			(0x28 + 0x20 * (n))
+#define DISP_REG_MUTEX_MOD(mutex_mod_reg, n)	(mutex_mod_reg + 0x20 * (n))
+#define DISP_REG_MUTEX_SOF(mutex_sof_reg, n)	(mutex_sof_reg + 0x20 * (n))
+#define DISP_REG_MUTEX_MOD2(n)			(0x34 + 0x20 * (n))
+
+#define INT_MUTEX				BIT(1)
+
+#define MT8167_MUTEX_MOD_DISP_PWM		1
+#define MT8167_MUTEX_MOD_DISP_OVL0		6
+#define MT8167_MUTEX_MOD_DISP_OVL1		7
+#define MT8167_MUTEX_MOD_DISP_RDMA0		8
+#define MT8167_MUTEX_MOD_DISP_RDMA1		9
+#define MT8167_MUTEX_MOD_DISP_WDMA0		10
+#define MT8167_MUTEX_MOD_DISP_CCORR		11
+#define MT8167_MUTEX_MOD_DISP_COLOR		12
+#define MT8167_MUTEX_MOD_DISP_AAL		13
+#define MT8167_MUTEX_MOD_DISP_GAMMA		14
+#define MT8167_MUTEX_MOD_DISP_DITHER		15
+#define MT8167_MUTEX_MOD_DISP_UFOE		16
+
+#define MT8173_MUTEX_MOD_DISP_OVL0		11
+#define MT8173_MUTEX_MOD_DISP_OVL1		12
+#define MT8173_MUTEX_MOD_DISP_RDMA0		13
+#define MT8173_MUTEX_MOD_DISP_RDMA1		14
+#define MT8173_MUTEX_MOD_DISP_RDMA2		15
+#define MT8173_MUTEX_MOD_DISP_WDMA0		16
+#define MT8173_MUTEX_MOD_DISP_WDMA1		17
+#define MT8173_MUTEX_MOD_DISP_COLOR0		18
+#define MT8173_MUTEX_MOD_DISP_COLOR1		19
+#define MT8173_MUTEX_MOD_DISP_AAL		20
+#define MT8173_MUTEX_MOD_DISP_GAMMA		21
+#define MT8173_MUTEX_MOD_DISP_UFOE		22
+#define MT8173_MUTEX_MOD_DISP_PWM0		23
+#define MT8173_MUTEX_MOD_DISP_PWM1		24
+#define MT8173_MUTEX_MOD_DISP_OD		25
+
+#define MT2712_MUTEX_MOD_DISP_PWM2		10
+#define MT2712_MUTEX_MOD_DISP_OVL0		11
+#define MT2712_MUTEX_MOD_DISP_OVL1		12
+#define MT2712_MUTEX_MOD_DISP_RDMA0		13
+#define MT2712_MUTEX_MOD_DISP_RDMA1		14
+#define MT2712_MUTEX_MOD_DISP_RDMA2		15
+#define MT2712_MUTEX_MOD_DISP_WDMA0		16
+#define MT2712_MUTEX_MOD_DISP_WDMA1		17
+#define MT2712_MUTEX_MOD_DISP_COLOR0		18
+#define MT2712_MUTEX_MOD_DISP_COLOR1		19
+#define MT2712_MUTEX_MOD_DISP_AAL0		20
+#define MT2712_MUTEX_MOD_DISP_UFOE		22
+#define MT2712_MUTEX_MOD_DISP_PWM0		23
+#define MT2712_MUTEX_MOD_DISP_PWM1		24
+#define MT2712_MUTEX_MOD_DISP_OD0		25
+#define MT2712_MUTEX_MOD2_DISP_AAL1		33
+#define MT2712_MUTEX_MOD2_DISP_OD1		34
+
+#define MT2701_MUTEX_MOD_DISP_OVL		3
+#define MT2701_MUTEX_MOD_DISP_WDMA		6
+#define MT2701_MUTEX_MOD_DISP_COLOR		7
+#define MT2701_MUTEX_MOD_DISP_BLS		9
+#define MT2701_MUTEX_MOD_DISP_RDMA0		10
+#define MT2701_MUTEX_MOD_DISP_RDMA1		12
+
+#define MT2712_MUTEX_SOF_SINGLE_MODE		0
+#define MT2712_MUTEX_SOF_DSI0			1
+#define MT2712_MUTEX_SOF_DSI1			2
+#define MT2712_MUTEX_SOF_DPI0			3
+#define MT2712_MUTEX_SOF_DPI1			4
+#define MT2712_MUTEX_SOF_DSI2			5
+#define MT2712_MUTEX_SOF_DSI3			6
+#define MT8167_MUTEX_SOF_DPI0			2
+#define MT8167_MUTEX_SOF_DPI1			3
+
+struct mtk_mutex {
+	int id;
+	bool claimed;
+};
+
+enum mtk_mutex_sof_id {
+	MUTEX_SOF_SINGLE_MODE,
+	MUTEX_SOF_DSI0,
+	MUTEX_SOF_DSI1,
+	MUTEX_SOF_DPI0,
+	MUTEX_SOF_DPI1,
+	MUTEX_SOF_DSI2,
+	MUTEX_SOF_DSI3,
+};
+
+struct mtk_mutex_data {
+	const unsigned int *mutex_mod;
+	const unsigned int *mutex_sof;
+	const unsigned int mutex_mod_reg;
+	const unsigned int mutex_sof_reg;
+	const bool no_clk;
+};
+
+struct mtk_mutex_ctx {
+	struct device			*dev;
+	struct clk			*clk;
+	void __iomem			*regs;
+	struct mtk_mutex		mutex[10];
+	const struct mtk_mutex_data	*data;
+};
+
+static const unsigned int mt2701_mutex_mod[DDP_COMPONENT_ID_MAX] = {
+	[DDP_COMPONENT_BLS] = MT2701_MUTEX_MOD_DISP_BLS,
+	[DDP_COMPONENT_COLOR0] = MT2701_MUTEX_MOD_DISP_COLOR,
+	[DDP_COMPONENT_OVL0] = MT2701_MUTEX_MOD_DISP_OVL,
+	[DDP_COMPONENT_RDMA0] = MT2701_MUTEX_MOD_DISP_RDMA0,
+	[DDP_COMPONENT_RDMA1] = MT2701_MUTEX_MOD_DISP_RDMA1,
+	[DDP_COMPONENT_WDMA0] = MT2701_MUTEX_MOD_DISP_WDMA,
+};
+
+static const unsigned int mt2712_mutex_mod[DDP_COMPONENT_ID_MAX] = {
+	[DDP_COMPONENT_AAL0] = MT2712_MUTEX_MOD_DISP_AAL0,
+	[DDP_COMPONENT_AAL1] = MT2712_MUTEX_MOD2_DISP_AAL1,
+	[DDP_COMPONENT_COLOR0] = MT2712_MUTEX_MOD_DISP_COLOR0,
+	[DDP_COMPONENT_COLOR1] = MT2712_MUTEX_MOD_DISP_COLOR1,
+	[DDP_COMPONENT_OD0] = MT2712_MUTEX_MOD_DISP_OD0,
+	[DDP_COMPONENT_OD1] = MT2712_MUTEX_MOD2_DISP_OD1,
+	[DDP_COMPONENT_OVL0] = MT2712_MUTEX_MOD_DISP_OVL0,
+	[DDP_COMPONENT_OVL1] = MT2712_MUTEX_MOD_DISP_OVL1,
+	[DDP_COMPONENT_PWM0] = MT2712_MUTEX_MOD_DISP_PWM0,
+	[DDP_COMPONENT_PWM1] = MT2712_MUTEX_MOD_DISP_PWM1,
+	[DDP_COMPONENT_PWM2] = MT2712_MUTEX_MOD_DISP_PWM2,
+	[DDP_COMPONENT_RDMA0] = MT2712_MUTEX_MOD_DISP_RDMA0,
+	[DDP_COMPONENT_RDMA1] = MT2712_MUTEX_MOD_DISP_RDMA1,
+	[DDP_COMPONENT_RDMA2] = MT2712_MUTEX_MOD_DISP_RDMA2,
+	[DDP_COMPONENT_UFOE] = MT2712_MUTEX_MOD_DISP_UFOE,
+	[DDP_COMPONENT_WDMA0] = MT2712_MUTEX_MOD_DISP_WDMA0,
+	[DDP_COMPONENT_WDMA1] = MT2712_MUTEX_MOD_DISP_WDMA1,
+};
+
+static const unsigned int mt8167_mutex_mod[DDP_COMPONENT_ID_MAX] = {
+	[DDP_COMPONENT_AAL0] = MT8167_MUTEX_MOD_DISP_AAL,
+	[DDP_COMPONENT_CCORR] = MT8167_MUTEX_MOD_DISP_CCORR,
+	[DDP_COMPONENT_COLOR0] = MT8167_MUTEX_MOD_DISP_COLOR,
+	[DDP_COMPONENT_DITHER] = MT8167_MUTEX_MOD_DISP_DITHER,
+	[DDP_COMPONENT_GAMMA] = MT8167_MUTEX_MOD_DISP_GAMMA,
+	[DDP_COMPONENT_OVL0] = MT8167_MUTEX_MOD_DISP_OVL0,
+	[DDP_COMPONENT_OVL1] = MT8167_MUTEX_MOD_DISP_OVL1,
+	[DDP_COMPONENT_PWM0] = MT8167_MUTEX_MOD_DISP_PWM,
+	[DDP_COMPONENT_RDMA0] = MT8167_MUTEX_MOD_DISP_RDMA0,
+	[DDP_COMPONENT_RDMA1] = MT8167_MUTEX_MOD_DISP_RDMA1,
+	[DDP_COMPONENT_UFOE] = MT8167_MUTEX_MOD_DISP_UFOE,
+	[DDP_COMPONENT_WDMA0] = MT8167_MUTEX_MOD_DISP_WDMA0,
+};
+
+static const unsigned int mt8173_mutex_mod[DDP_COMPONENT_ID_MAX] = {
+	[DDP_COMPONENT_AAL0] = MT8173_MUTEX_MOD_DISP_AAL,
+	[DDP_COMPONENT_COLOR0] = MT8173_MUTEX_MOD_DISP_COLOR0,
+	[DDP_COMPONENT_COLOR1] = MT8173_MUTEX_MOD_DISP_COLOR1,
+	[DDP_COMPONENT_GAMMA] = MT8173_MUTEX_MOD_DISP_GAMMA,
+	[DDP_COMPONENT_OD0] = MT8173_MUTEX_MOD_DISP_OD,
+	[DDP_COMPONENT_OVL0] = MT8173_MUTEX_MOD_DISP_OVL0,
+	[DDP_COMPONENT_OVL1] = MT8173_MUTEX_MOD_DISP_OVL1,
+	[DDP_COMPONENT_PWM0] = MT8173_MUTEX_MOD_DISP_PWM0,
+	[DDP_COMPONENT_PWM1] = MT8173_MUTEX_MOD_DISP_PWM1,
+	[DDP_COMPONENT_RDMA0] = MT8173_MUTEX_MOD_DISP_RDMA0,
+	[DDP_COMPONENT_RDMA1] = MT8173_MUTEX_MOD_DISP_RDMA1,
+	[DDP_COMPONENT_RDMA2] = MT8173_MUTEX_MOD_DISP_RDMA2,
+	[DDP_COMPONENT_UFOE] = MT8173_MUTEX_MOD_DISP_UFOE,
+	[DDP_COMPONENT_WDMA0] = MT8173_MUTEX_MOD_DISP_WDMA0,
+	[DDP_COMPONENT_WDMA1] = MT8173_MUTEX_MOD_DISP_WDMA1,
+};
+
+static const unsigned int mt2712_mutex_sof[MUTEX_SOF_DSI3 + 1] = {
+	[MUTEX_SOF_SINGLE_MODE] = MUTEX_SOF_SINGLE_MODE,
+	[MUTEX_SOF_DSI0] = MUTEX_SOF_DSI0,
+	[MUTEX_SOF_DSI1] = MUTEX_SOF_DSI1,
+	[MUTEX_SOF_DPI0] = MUTEX_SOF_DPI0,
+	[MUTEX_SOF_DPI1] = MUTEX_SOF_DPI1,
+	[MUTEX_SOF_DSI2] = MUTEX_SOF_DSI2,
+	[MUTEX_SOF_DSI3] = MUTEX_SOF_DSI3,
+};
+
+static const unsigned int mt8167_mutex_sof[MUTEX_SOF_DSI3 + 1] = {
+	[MUTEX_SOF_SINGLE_MODE] = MUTEX_SOF_SINGLE_MODE,
+	[MUTEX_SOF_DSI0] = MUTEX_SOF_DSI0,
+	[MUTEX_SOF_DPI0] = MT8167_MUTEX_SOF_DPI0,
+	[MUTEX_SOF_DPI1] = MT8167_MUTEX_SOF_DPI1,
+};
+
+static const struct mtk_mutex_data mt2701_mutex_driver_data = {
+	.mutex_mod = mt2701_mutex_mod,
+	.mutex_sof = mt2712_mutex_sof,
+	.mutex_mod_reg = MT2701_MUTEX0_MOD0,
+	.mutex_sof_reg = MT2701_MUTEX0_SOF0,
+};
+
+static const struct mtk_mutex_data mt2712_mutex_driver_data = {
+	.mutex_mod = mt2712_mutex_mod,
+	.mutex_sof = mt2712_mutex_sof,
+	.mutex_mod_reg = MT2701_MUTEX0_MOD0,
+	.mutex_sof_reg = MT2701_MUTEX0_SOF0,
+};
+
+static const struct mtk_mutex_data mt8167_mutex_driver_data = {
+	.mutex_mod = mt8167_mutex_mod,
+	.mutex_sof = mt8167_mutex_sof,
+	.mutex_mod_reg = MT2701_MUTEX0_MOD0,
+	.mutex_sof_reg = MT2701_MUTEX0_SOF0,
+	.no_clk = true,
+};
+
+static const struct mtk_mutex_data mt8173_mutex_driver_data = {
+	.mutex_mod = mt8173_mutex_mod,
+	.mutex_sof = mt2712_mutex_sof,
+	.mutex_mod_reg = MT2701_MUTEX0_MOD0,
+	.mutex_sof_reg = MT2701_MUTEX0_SOF0,
+};
+
+struct mtk_mutex *mtk_mutex_get(struct device *dev)
+{
+	struct mtk_mutex_ctx *mtx = dev_get_drvdata(dev);
+	int i;
+
+	for (i = 0; i < 10; i++)
+		if (!mtx->mutex[i].claimed) {
+			mtx->mutex[i].claimed = true;
+			return &mtx->mutex[i];
+		}
+
+	return ERR_PTR(-EBUSY);
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_get);
+
+void mtk_mutex_put(struct mtk_mutex *mutex)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+
+	WARN_ON(&mtx->mutex[mutex->id] != mutex);
+
+	mutex->claimed = false;
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_put);
+
+int mtk_mutex_prepare(struct mtk_mutex *mutex)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+	return clk_prepare_enable(mtx->clk);
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_prepare);
+
+void mtk_mutex_unprepare(struct mtk_mutex *mutex)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+	clk_disable_unprepare(mtx->clk);
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_unprepare);
+
+void mtk_mutex_add_comp(struct mtk_mutex *mutex,
+			enum mtk_ddp_comp_id id)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+	unsigned int reg;
+	unsigned int sof_id;
+	unsigned int offset;
+
+	WARN_ON(&mtx->mutex[mutex->id] != mutex);
+
+	switch (id) {
+	case DDP_COMPONENT_DSI0:
+		sof_id = MUTEX_SOF_DSI0;
+		break;
+	case DDP_COMPONENT_DSI1:
+		sof_id = MUTEX_SOF_DSI0;
+		break;
+	case DDP_COMPONENT_DSI2:
+		sof_id = MUTEX_SOF_DSI2;
+		break;
+	case DDP_COMPONENT_DSI3:
+		sof_id = MUTEX_SOF_DSI3;
+		break;
+	case DDP_COMPONENT_DPI0:
+		sof_id = MUTEX_SOF_DPI0;
+		break;
+	case DDP_COMPONENT_DPI1:
+		sof_id = MUTEX_SOF_DPI1;
+		break;
+	default:
+		if (mtx->data->mutex_mod[id] < 32) {
+			offset = DISP_REG_MUTEX_MOD(mtx->data->mutex_mod_reg,
+						    mutex->id);
+			reg = readl_relaxed(mtx->regs + offset);
+			reg |= 1 << mtx->data->mutex_mod[id];
+			writel_relaxed(reg, mtx->regs + offset);
+		} else {
+			offset = DISP_REG_MUTEX_MOD2(mutex->id);
+			reg = readl_relaxed(mtx->regs + offset);
+			reg |= 1 << (mtx->data->mutex_mod[id] - 32);
+			writel_relaxed(reg, mtx->regs + offset);
+		}
+		return;
+	}
+
+	writel_relaxed(mtx->data->mutex_sof[sof_id],
+		       mtx->regs +
+		       DISP_REG_MUTEX_SOF(mtx->data->mutex_sof_reg, mutex->id));
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_add_comp);
+
+void mtk_mutex_remove_comp(struct mtk_mutex *mutex,
+			   enum mtk_ddp_comp_id id)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+	unsigned int reg;
+	unsigned int offset;
+
+	WARN_ON(&mtx->mutex[mutex->id] != mutex);
+
+	switch (id) {
+	case DDP_COMPONENT_DSI0:
+	case DDP_COMPONENT_DSI1:
+	case DDP_COMPONENT_DSI2:
+	case DDP_COMPONENT_DSI3:
+	case DDP_COMPONENT_DPI0:
+	case DDP_COMPONENT_DPI1:
+		writel_relaxed(MUTEX_SOF_SINGLE_MODE,
+			       mtx->regs +
+			       DISP_REG_MUTEX_SOF(mtx->data->mutex_sof_reg,
+						  mutex->id));
+		break;
+	default:
+		if (mtx->data->mutex_mod[id] < 32) {
+			offset = DISP_REG_MUTEX_MOD(mtx->data->mutex_mod_reg,
+						    mutex->id);
+			reg = readl_relaxed(mtx->regs + offset);
+			reg &= ~(1 << mtx->data->mutex_mod[id]);
+			writel_relaxed(reg, mtx->regs + offset);
+		} else {
+			offset = DISP_REG_MUTEX_MOD2(mutex->id);
+			reg = readl_relaxed(mtx->regs + offset);
+			reg &= ~(1 << (mtx->data->mutex_mod[id] - 32));
+			writel_relaxed(reg, mtx->regs + offset);
+		}
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_remove_comp);
+
+void mtk_mutex_enable(struct mtk_mutex *mutex)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+
+	WARN_ON(&mtx->mutex[mutex->id] != mutex);
+
+	writel(1, mtx->regs + DISP_REG_MUTEX_EN(mutex->id));
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_enable);
+
+void mtk_mutex_disable(struct mtk_mutex *mutex)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+
+	WARN_ON(&mtx->mutex[mutex->id] != mutex);
+
+	writel(0, mtx->regs + DISP_REG_MUTEX_EN(mutex->id));
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_disable);
+
+void mtk_mutex_acquire(struct mtk_mutex *mutex)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+	u32 tmp;
+
+	writel(1, mtx->regs + DISP_REG_MUTEX_EN(mutex->id));
+	writel(1, mtx->regs + DISP_REG_MUTEX(mutex->id));
+	if (readl_poll_timeout_atomic(mtx->regs + DISP_REG_MUTEX(mutex->id),
+				      tmp, tmp & INT_MUTEX, 1, 10000))
+		pr_err("could not acquire mutex %d\n", mutex->id);
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_acquire);
+
+void mtk_mutex_release(struct mtk_mutex *mutex)
+{
+	struct mtk_mutex_ctx *mtx = container_of(mutex, struct mtk_mutex_ctx,
+						 mutex[mutex->id]);
+
+	writel(0, mtx->regs + DISP_REG_MUTEX(mutex->id));
+}
+EXPORT_SYMBOL_GPL(mtk_mutex_release);
+
+static int mtk_mutex_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct mtk_mutex_ctx *mtx;
+	struct resource *regs;
+	int i;
+
+	mtx = devm_kzalloc(dev, sizeof(*mtx), GFP_KERNEL);
+	if (!mtx)
+		return -ENOMEM;
+
+	for (i = 0; i < 10; i++)
+		mtx->mutex[i].id = i;
+
+	mtx->data = of_device_get_match_data(dev);
+
+	if (!mtx->data->no_clk) {
+		mtx->clk = devm_clk_get(dev, NULL);
+		if (IS_ERR(mtx->clk)) {
+			if (PTR_ERR(mtx->clk) != -EPROBE_DEFER)
+				dev_err(dev, "Failed to get clock\n");
+			return PTR_ERR(mtx->clk);
+		}
+	}
+
+	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	mtx->regs = devm_ioremap_resource(dev, regs);
+	if (IS_ERR(mtx->regs)) {
+		dev_err(dev, "Failed to map mutex registers\n");
+		return PTR_ERR(mtx->regs);
+	}
+
+	platform_set_drvdata(pdev, mtx);
+
+	return 0;
+}
+
+static int mtk_mutex_remove(struct platform_device *pdev)
+{
+	return 0;
+}
+
+static const struct of_device_id mutex_driver_dt_match[] = {
+	{ .compatible = "mediatek,mt2701-disp-mutex",
+	  .data = &mt2701_mutex_driver_data},
+	{ .compatible = "mediatek,mt2712-disp-mutex",
+	  .data = &mt2712_mutex_driver_data},
+	{ .compatible = "mediatek,mt8167-disp-mutex",
+	  .data = &mt8167_mutex_driver_data},
+	{ .compatible = "mediatek,mt8173-disp-mutex",
+	  .data = &mt8173_mutex_driver_data},
+	{},
+};
+MODULE_DEVICE_TABLE(of, mutex_driver_dt_match);
+
+struct platform_driver mtk_mutex_driver = {
+	.probe		= mtk_mutex_probe,
+	.remove		= mtk_mutex_remove,
+	.driver		= {
+		.name	= "mediatek-mutex",
+		.owner	= THIS_MODULE,
+		.of_match_table = mutex_driver_dt_match,
+	},
+};
+
+builtin_platform_driver(mtk_mutex_driver);
diff --git a/include/linux/soc/mediatek/mtk-mutex.h b/include/linux/soc/mediatek/mtk-mutex.h
new file mode 100644
index 000000000000..6fe4ffbde290
--- /dev/null
+++ b/include/linux/soc/mediatek/mtk-mutex.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ */
+
+#ifndef MTK_MUTEX_H
+#define MTK_MUTEX_H
+
+struct regmap;
+struct device;
+struct mtk_mutex;
+
+struct mtk_mutex *mtk_mutex_get(struct device *dev);
+int mtk_mutex_prepare(struct mtk_mutex *mutex);
+void mtk_mutex_add_comp(struct mtk_mutex *mutex,
+			enum mtk_ddp_comp_id id);
+void mtk_mutex_enable(struct mtk_mutex *mutex);
+void mtk_mutex_disable(struct mtk_mutex *mutex);
+void mtk_mutex_remove_comp(struct mtk_mutex *mutex,
+			   enum mtk_ddp_comp_id id);
+void mtk_mutex_unprepare(struct mtk_mutex *mutex);
+void mtk_mutex_put(struct mtk_mutex *mutex);
+void mtk_mutex_acquire(struct mtk_mutex *mutex);
+void mtk_mutex_release(struct mtk_mutex *mutex);
+
+#endif /* MTK_MUTEX_H */
-- 
cgit v1.2.3


From 6e736c60a9fe905b3e4f4b07171a31ad602d56d2 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Mon, 1 Feb 2021 11:13:25 -0700
Subject: coresight: Introduce device access abstraction

We are about to introduce support for sysreg access to ETMv4.4+
component. Since there are generic routines that access the
registers (e.g, CS_LOCK/UNLOCK , claim/disclaim operations, timeout)
and in order to preserve the logic of these operations at a
single place we introduce an abstraction layer for the accesses
to a given device.

Link: https://lore.kernel.org/r/20210110224850.1880240-4-suzuki.poulose@arm.com
Cc: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210201181351.1475223-6-mathieu.poirier@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-catu.c       |   1 +
 drivers/hwtracing/coresight/coresight-core.c       |  43 +++++
 drivers/hwtracing/coresight/coresight-cti-core.c   |   1 +
 drivers/hwtracing/coresight/coresight-etb10.c      |   1 +
 drivers/hwtracing/coresight/coresight-etm3x-core.c |   1 +
 drivers/hwtracing/coresight/coresight-etm4x-core.c |   1 +
 drivers/hwtracing/coresight/coresight-funnel.c     |   1 +
 drivers/hwtracing/coresight/coresight-replicator.c |   1 +
 drivers/hwtracing/coresight/coresight-stm.c        |   1 +
 drivers/hwtracing/coresight/coresight-tmc-core.c   |   1 +
 drivers/hwtracing/coresight/coresight-tpiu.c       |   1 +
 include/linux/coresight.h                          | 191 ++++++++++++++++++++-
 12 files changed, 241 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index a61313f320bd..867c932c7b26 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -551,6 +551,7 @@ static int catu_probe(struct amba_device *adev, const struct amba_id *id)
 	dev->platform_data = pdata;
 
 	drvdata->base = base;
+	catu_desc.access = CSDEV_ACCESS_IOMEM(base);
 	catu_desc.pdata = pdata;
 	catu_desc.dev = dev;
 	catu_desc.groups = catu_groups;
diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index 4ba801dffcb7..a38af8f0831b 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -1458,6 +1458,48 @@ int coresight_timeout(void __iomem *addr, u32 offset, int position, int value)
 }
 EXPORT_SYMBOL_GPL(coresight_timeout);
 
+u32 coresight_relaxed_read32(struct coresight_device *csdev, u32 offset)
+{
+	return csdev_access_relaxed_read32(&csdev->access, offset);
+}
+
+u32 coresight_read32(struct coresight_device *csdev, u32 offset)
+{
+	return csdev_access_read32(&csdev->access, offset);
+}
+
+void coresight_relaxed_write32(struct coresight_device *csdev,
+			       u32 val, u32 offset)
+{
+	csdev_access_relaxed_write32(&csdev->access, val, offset);
+}
+
+void coresight_write32(struct coresight_device *csdev, u32 val, u32 offset)
+{
+	csdev_access_write32(&csdev->access, val, offset);
+}
+
+u64 coresight_relaxed_read64(struct coresight_device *csdev, u32 offset)
+{
+	return csdev_access_relaxed_read64(&csdev->access, offset);
+}
+
+u64 coresight_read64(struct coresight_device *csdev, u32 offset)
+{
+	return csdev_access_read64(&csdev->access, offset);
+}
+
+void coresight_relaxed_write64(struct coresight_device *csdev,
+			       u64 val, u32 offset)
+{
+	csdev_access_relaxed_write64(&csdev->access, val, offset);
+}
+
+void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset)
+{
+	csdev_access_write64(&csdev->access, val, offset);
+}
+
 /*
  * coresight_release_platform_data: Release references to the devices connected
  * to the output port of this device.
@@ -1522,6 +1564,7 @@ struct coresight_device *coresight_register(struct coresight_desc *desc)
 	csdev->type = desc->type;
 	csdev->subtype = desc->subtype;
 	csdev->ops = desc->ops;
+	csdev->access = desc->access;
 	csdev->orphan = false;
 
 	csdev->dev.type = &coresight_dev_type[desc->type];
diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c
index 61dbc1afd8da..b38a8db5f252 100644
--- a/drivers/hwtracing/coresight/coresight-cti-core.c
+++ b/drivers/hwtracing/coresight/coresight-cti-core.c
@@ -870,6 +870,7 @@ static int cti_probe(struct amba_device *adev, const struct amba_id *id)
 		return PTR_ERR(base);
 
 	drvdata->base = base;
+	cti_desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	dev_set_drvdata(dev, drvdata);
 
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index 0cf6f0b947b6..cc742561a986 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -757,6 +757,7 @@ static int etb_probe(struct amba_device *adev, const struct amba_id *id)
 		return PTR_ERR(base);
 
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	spin_lock_init(&drvdata->spinlock);
 
diff --git a/drivers/hwtracing/coresight/coresight-etm3x-core.c b/drivers/hwtracing/coresight/coresight-etm3x-core.c
index 5bf5a5a4ce6d..3b7837cbe376 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x-core.c
@@ -839,6 +839,7 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
 		return PTR_ERR(base);
 
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	spin_lock_init(&drvdata->spinlock);
 
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index 0924c376e35a..b9e01357ffad 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -1626,6 +1626,7 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
 		return PTR_ERR(base);
 
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	spin_lock_init(&drvdata->spinlock);
 
diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c
index 071c723227db..8f7c40d7d8d6 100644
--- a/drivers/hwtracing/coresight/coresight-funnel.c
+++ b/drivers/hwtracing/coresight/coresight-funnel.c
@@ -242,6 +242,7 @@ static int funnel_probe(struct device *dev, struct resource *res)
 		}
 		drvdata->base = base;
 		desc.groups = coresight_funnel_groups;
+		desc.access = CSDEV_ACCESS_IOMEM(base);
 	}
 
 	dev_set_drvdata(dev, drvdata);
diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c
index 7e2a2b7f503f..205756fab729 100644
--- a/drivers/hwtracing/coresight/coresight-replicator.c
+++ b/drivers/hwtracing/coresight/coresight-replicator.c
@@ -254,6 +254,7 @@ static int replicator_probe(struct device *dev, struct resource *res)
 		}
 		drvdata->base = base;
 		desc.groups = replicator_groups;
+		desc.access = CSDEV_ACCESS_IOMEM(base);
 	}
 
 	if (fwnode_property_present(dev_fwnode(dev),
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c
index 99791773f682..41d9a922c2d4 100644
--- a/drivers/hwtracing/coresight/coresight-stm.c
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -884,6 +884,7 @@ static int stm_probe(struct amba_device *adev, const struct amba_id *id)
 	if (IS_ERR(base))
 		return PTR_ERR(base);
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	ret = stm_get_stimulus_area(dev, &ch_res);
 	if (ret)
diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c
index 8169dff5a9f6..e61b75be66b6 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-core.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-core.c
@@ -456,6 +456,7 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
 	}
 
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	spin_lock_init(&drvdata->spinlock);
 
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c
index d5dfee9ee556..6ca396799883 100644
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -149,6 +149,7 @@ static int tpiu_probe(struct amba_device *adev, const struct amba_id *id)
 		return PTR_ERR(base);
 
 	drvdata->base = base;
+	desc.access = CSDEV_ACCESS_IOMEM(base);
 
 	/* Disable tpiu to support older devices */
 	tpiu_disable_hw(drvdata);
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 7d3c87e5b97c..6107cf4021d3 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -7,6 +7,7 @@
 #define _LINUX_CORESIGHT_H
 
 #include <linux/device.h>
+#include <linux/io.h>
 #include <linux/perf_event.h>
 #include <linux/sched.h>
 
@@ -114,6 +115,32 @@ struct coresight_platform_data {
 	struct coresight_connection *conns;
 };
 
+/**
+ * struct csdev_access - Abstraction of a CoreSight device access.
+ *
+ * @io_mem	: True if the device has memory mapped I/O
+ * @base	: When io_mem == true, base address of the component
+ * @read	: Read from the given "offset" of the given instance.
+ * @write	: Write "val" to the given "offset".
+ */
+struct csdev_access {
+	bool io_mem;
+	union {
+		void __iomem *base;
+		struct {
+			u64 (*read)(u32 offset, bool relaxed, bool _64bit);
+			void (*write)(u64 val, u32 offset, bool relaxed,
+				      bool _64bit);
+		};
+	};
+};
+
+#define CSDEV_ACCESS_IOMEM(_addr)		\
+	((struct csdev_access)	{		\
+		.io_mem		= true,		\
+		.base		= (_addr),	\
+	})
+
 /**
  * struct coresight_desc - description of a component required from drivers
  * @type:	as defined by @coresight_dev_type.
@@ -125,6 +152,7 @@ struct coresight_platform_data {
  * @groups:	operations specific to this component. These will end up
  *		in the component's sysfs sub-directory.
  * @name:	name for the coresight device, also shown under sysfs.
+ * @access:	Describe access to the device
  */
 struct coresight_desc {
 	enum coresight_dev_type type;
@@ -134,6 +162,7 @@ struct coresight_desc {
 	struct device *dev;
 	const struct attribute_group **groups;
 	const char *name;
+	struct csdev_access access;
 };
 
 /**
@@ -173,7 +202,8 @@ struct coresight_sysfs_link {
  * @type:	as defined by @coresight_dev_type.
  * @subtype:	as defined by @coresight_dev_subtype.
  * @ops:	generic operations for this component, as defined
-		by @coresight_ops.
+ *		by @coresight_ops.
+ * @access:	Device i/o access abstraction for this device.
  * @dev:	The device entity associated to this component.
  * @refcnt:	keep track of what is in use.
  * @orphan:	true if the component has connections that haven't been linked.
@@ -195,6 +225,7 @@ struct coresight_device {
 	enum coresight_dev_type type;
 	union coresight_dev_subtype subtype;
 	const struct coresight_ops *ops;
+	struct csdev_access access;
 	struct device dev;
 	atomic_t *refcnt;
 	bool orphan;
@@ -326,6 +357,104 @@ struct coresight_ops {
 };
 
 #if IS_ENABLED(CONFIG_CORESIGHT)
+
+static inline u32 csdev_access_relaxed_read32(struct csdev_access *csa,
+					      u32 offset)
+{
+	if (likely(csa->io_mem))
+		return readl_relaxed(csa->base + offset);
+
+	return csa->read(offset, true, false);
+}
+
+static inline u32 csdev_access_read32(struct csdev_access *csa, u32 offset)
+{
+	if (likely(csa->io_mem))
+		return readl(csa->base + offset);
+
+	return csa->read(offset, false, false);
+}
+
+static inline void csdev_access_relaxed_write32(struct csdev_access *csa,
+						u32 val, u32 offset)
+{
+	if (likely(csa->io_mem))
+		writel_relaxed(val, csa->base + offset);
+	else
+		csa->write(val, offset, true, false);
+}
+
+static inline void csdev_access_write32(struct csdev_access *csa, u32 val, u32 offset)
+{
+	if (likely(csa->io_mem))
+		writel(val, csa->base + offset);
+	else
+		csa->write(val, offset, false, false);
+}
+
+#ifdef CONFIG_64BIT
+
+static inline u64 csdev_access_relaxed_read64(struct csdev_access *csa,
+					      u32 offset)
+{
+	if (likely(csa->io_mem))
+		return readq_relaxed(csa->base + offset);
+
+	return csa->read(offset, true, true);
+}
+
+static inline u64 csdev_access_read64(struct csdev_access *csa, u32 offset)
+{
+	if (likely(csa->io_mem))
+		return readq(csa->base + offset);
+
+	return csa->read(offset, false, true);
+}
+
+static inline void csdev_access_relaxed_write64(struct csdev_access *csa,
+						u64 val, u32 offset)
+{
+	if (likely(csa->io_mem))
+		writeq_relaxed(val, csa->base + offset);
+	else
+		csa->write(val, offset, true, true);
+}
+
+static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 offset)
+{
+	if (likely(csa->io_mem))
+		writeq(val, csa->base + offset);
+	else
+		csa->write(val, offset, false, true);
+}
+
+#else	/* !CONFIG_64BIT */
+
+static inline u64 csdev_access_relaxed_read64(struct csdev_access *csa,
+					      u32 offset)
+{
+	WARN_ON(1);
+	return 0;
+}
+
+static inline u64 csdev_access_read64(struct csdev_access *csa, u32 offset)
+{
+	WARN_ON(1);
+	return 0;
+}
+
+static inline void csdev_access_relaxed_write64(struct csdev_access *csa,
+						u64 val, u32 offset)
+{
+	WARN_ON(1);
+}
+
+static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 offset)
+{
+	WARN_ON(1);
+}
+#endif	/* CONFIG_64BIT */
+
 extern struct coresight_device *
 coresight_register(struct coresight_desc *desc);
 extern void coresight_unregister(struct coresight_device *csdev);
@@ -343,6 +472,18 @@ extern char *coresight_alloc_device_name(struct coresight_dev_list *devs,
 					 struct device *dev);
 
 extern bool coresight_loses_context_with_cpu(struct device *dev);
+
+u32 coresight_relaxed_read32(struct coresight_device *csdev, u32 offset);
+u32 coresight_read32(struct coresight_device *csdev, u32 offset);
+void coresight_write32(struct coresight_device *csdev, u32 val, u32 offset);
+void coresight_relaxed_write32(struct coresight_device *csdev,
+			       u32 val, u32 offset);
+u64 coresight_relaxed_read64(struct coresight_device *csdev, u32 offset);
+u64 coresight_read64(struct coresight_device *csdev, u32 offset);
+void coresight_relaxed_write64(struct coresight_device *csdev,
+			       u64 val, u32 offset);
+void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset);
+
 #else
 static inline struct coresight_device *
 coresight_register(struct coresight_desc *desc) { return NULL; }
@@ -369,10 +510,54 @@ static inline bool coresight_loses_context_with_cpu(struct device *dev)
 {
 	return false;
 }
-#endif
+
+static inline u32 coresight_relaxed_read32(struct coresight_device *csdev, u32 offset)
+{
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static inline u32 coresight_read32(struct coresight_device *csdev, u32 offset)
+{
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static inline void coresight_write32(struct coresight_device *csdev, u32 val, u32 offset)
+{
+}
+
+static inline void coresight_relaxed_write32(struct coresight_device *csdev,
+					     u32 val, u32 offset)
+{
+}
+
+static inline u64 coresight_relaxed_read64(struct coresight_device *csdev,
+					   u32 offset)
+{
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static inline u64 coresight_read64(struct coresight_device *csdev, u32 offset)
+{
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static inline void coresight_relaxed_write64(struct coresight_device *csdev,
+					     u64 val, u32 offset)
+{
+}
+
+static inline void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset)
+{
+}
+
+#endif		/* IS_ENABLED(CONFIG_CORESIGHT) */
 
 extern int coresight_get_cpu(struct device *dev);
 
 struct coresight_platform_data *coresight_get_platform_data(struct device *dev);
 
-#endif
+#endif		/* _LINUX_COREISGHT_H */
-- 
cgit v1.2.3


From 020052825e49128d381d6444d1ce079e8ca82386 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Mon, 1 Feb 2021 11:13:27 -0700
Subject: coresight: Convert coresight_timeout to use access abstraction

Convert the generic routines to use the new access abstraction layer
gradually, starting with coresigth_timeout.

Link: https://lore.kernel.org/r/20210110224850.1880240-6-suzuki.poulose@arm.com
Cc: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210201181351.1475223-8-mathieu.poirier@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-catu.c       |  5 ++--
 drivers/hwtracing/coresight/coresight-core.c       | 13 +++++-----
 drivers/hwtracing/coresight/coresight-etb10.c      |  5 ++--
 drivers/hwtracing/coresight/coresight-etm4x-core.c | 30 ++++++++++++++--------
 drivers/hwtracing/coresight/coresight-stm.c        |  3 ++-
 drivers/hwtracing/coresight/coresight-tmc-core.c   | 15 ++++++-----
 drivers/hwtracing/coresight/coresight-tpiu.c       |  4 +--
 include/linux/coresight.h                          | 11 +++++---
 8 files changed, 54 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index 867c932c7b26..d6097454d399 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -401,8 +401,9 @@ static const struct attribute_group *catu_groups[] = {
 
 static inline int catu_wait_for_ready(struct catu_drvdata *drvdata)
 {
-	return coresight_timeout(drvdata->base,
-				 CATU_STATUS, CATU_STATUS_READY, 1);
+	struct csdev_access *csa = &drvdata->csdev->access;
+
+	return coresight_timeout(csa, CATU_STATUS, CATU_STATUS_READY, 1);
 }
 
 static int catu_enable_hw(struct catu_drvdata *drvdata, void *data)
diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index a38af8f0831b..74985068f325 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -1418,23 +1418,24 @@ static void coresight_remove_conns(struct coresight_device *csdev)
 }
 
 /**
- * coresight_timeout - loop until a bit has changed to a specific state.
- * @addr: base address of the area of interest.
- * @offset: address of a register, starting from @addr.
+ * coresight_timeout - loop until a bit has changed to a specific register
+ *			state.
+ * @csa: coresight device access for the device
+ * @offset: Offset of the register from the base of the device.
  * @position: the position of the bit of interest.
  * @value: the value the bit should have.
  *
  * Return: 0 as soon as the bit has taken the desired state or -EAGAIN if
  * TIMEOUT_US has elapsed, which ever happens first.
  */
-
-int coresight_timeout(void __iomem *addr, u32 offset, int position, int value)
+int coresight_timeout(struct csdev_access *csa, u32 offset,
+		      int position, int value)
 {
 	int i;
 	u32 val;
 
 	for (i = TIMEOUT_US; i > 0; i--) {
-		val = __raw_readl(addr + offset);
+		val = csdev_access_read32(csa, offset);
 		/* waiting on the bit to go from 0 to 1 */
 		if (value) {
 			if (val & BIT(position))
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index cc742561a986..0f664aeeda93 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -252,6 +252,7 @@ static void __etb_disable_hw(struct etb_drvdata *drvdata)
 {
 	u32 ffcr;
 	struct device *dev = &drvdata->csdev->dev;
+	struct csdev_access *csa = &drvdata->csdev->access;
 
 	CS_UNLOCK(drvdata->base);
 
@@ -263,7 +264,7 @@ static void __etb_disable_hw(struct etb_drvdata *drvdata)
 	ffcr |= ETB_FFCR_FON_MAN;
 	writel_relaxed(ffcr, drvdata->base + ETB_FFCR);
 
-	if (coresight_timeout(drvdata->base, ETB_FFCR, ETB_FFCR_BIT, 0)) {
+	if (coresight_timeout(csa, ETB_FFCR, ETB_FFCR_BIT, 0)) {
 		dev_err(dev,
 		"timeout while waiting for completion of Manual Flush\n");
 	}
@@ -271,7 +272,7 @@ static void __etb_disable_hw(struct etb_drvdata *drvdata)
 	/* disable trace capture */
 	writel_relaxed(0x0, drvdata->base + ETB_CTL_REG);
 
-	if (coresight_timeout(drvdata->base, ETB_FFSR, ETB_FFSR_BIT, 1)) {
+	if (coresight_timeout(csa, ETB_FFSR, ETB_FFSR_BIT, 1)) {
 		dev_err(dev,
 			"timeout while waiting for Formatter to Stop\n");
 	}
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index b9e01357ffad..180bb6ed9090 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -217,7 +217,9 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
 {
 	int i, rc;
 	struct etmv4_config *config = &drvdata->config;
-	struct device *etm_dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
+	struct device *etm_dev = &csdev->dev;
+	struct csdev_access *csa = &csdev->access;
 
 	CS_UNLOCK(drvdata->base);
 	etm4_enable_arch_specific(drvdata);
@@ -232,7 +234,7 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
 	writel_relaxed(0, drvdata->base + TRCPRGCTLR);
 
 	/* wait for TRCSTATR.IDLE to go up */
-	if (coresight_timeout(drvdata->base, TRCSTATR, TRCSTATR_IDLE_BIT, 1))
+	if (coresight_timeout(csa, TRCSTATR, TRCSTATR_IDLE_BIT, 1))
 		dev_err(etm_dev,
 			"timeout while waiting for Idle Trace Status\n");
 	if (drvdata->nr_pe)
@@ -323,7 +325,7 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
 	writel_relaxed(1, drvdata->base + TRCPRGCTLR);
 
 	/* wait for TRCSTATR.IDLE to go back down to '0' */
-	if (coresight_timeout(drvdata->base, TRCSTATR, TRCSTATR_IDLE_BIT, 0))
+	if (coresight_timeout(csa, TRCSTATR, TRCSTATR_IDLE_BIT, 0))
 		dev_err(etm_dev,
 			"timeout while waiting for Idle Trace Status\n");
 
@@ -587,7 +589,9 @@ static void etm4_disable_hw(void *info)
 	u32 control;
 	struct etmv4_drvdata *drvdata = info;
 	struct etmv4_config *config = &drvdata->config;
-	struct device *etm_dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
+	struct device *etm_dev = &csdev->dev;
+	struct csdev_access *csa = &csdev->access;
 	int i;
 
 	CS_UNLOCK(drvdata->base);
@@ -615,8 +619,7 @@ static void etm4_disable_hw(void *info)
 	writel_relaxed(control, drvdata->base + TRCPRGCTLR);
 
 	/* wait for TRCSTATR.PMSTABLE to go to '1' */
-	if (coresight_timeout(drvdata->base, TRCSTATR,
-			      TRCSTATR_PMSTABLE_BIT, 1))
+	if (coresight_timeout(csa, TRCSTATR, TRCSTATR_PMSTABLE_BIT, 1))
 		dev_err(etm_dev,
 			"timeout while waiting for PM stable Trace Status\n");
 
@@ -1272,7 +1275,15 @@ static int etm4_cpu_save(struct etmv4_drvdata *drvdata)
 {
 	int i, ret = 0;
 	struct etmv4_save_state *state;
-	struct device *etm_dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
+	struct csdev_access *csa;
+	struct device *etm_dev;
+
+	if (WARN_ON(!csdev))
+		return -ENODEV;
+
+	etm_dev = &csdev->dev;
+	csa = &csdev->access;
 
 	/*
 	 * As recommended by 3.4.1 ("The procedure when powering down the PE")
@@ -1287,8 +1298,7 @@ static int etm4_cpu_save(struct etmv4_drvdata *drvdata)
 	etm4_os_lock(drvdata);
 
 	/* wait for TRCSTATR.PMSTABLE to go up */
-	if (coresight_timeout(drvdata->base, TRCSTATR,
-			      TRCSTATR_PMSTABLE_BIT, 1)) {
+	if (coresight_timeout(csa, TRCSTATR, TRCSTATR_PMSTABLE_BIT, 1)) {
 		dev_err(etm_dev,
 			"timeout while waiting for PM Stable Status\n");
 		etm4_os_unlock(drvdata);
@@ -1377,7 +1387,7 @@ static int etm4_cpu_save(struct etmv4_drvdata *drvdata)
 		state->trcpdcr = readl(drvdata->base + TRCPDCR);
 
 	/* wait for TRCSTATR.IDLE to go up */
-	if (coresight_timeout(drvdata->base, TRCSTATR, TRCSTATR_IDLE_BIT, 1)) {
+	if (coresight_timeout(csa, TRCSTATR, TRCSTATR_IDLE_BIT, 1)) {
 		dev_err(etm_dev,
 			"timeout while waiting for Idle Trace Status\n");
 		etm4_os_unlock(drvdata);
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c
index 41d9a922c2d4..5927316d7a03 100644
--- a/drivers/hwtracing/coresight/coresight-stm.c
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -258,6 +258,7 @@ static void stm_disable(struct coresight_device *csdev,
 			struct perf_event *event)
 {
 	struct stm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+	struct csdev_access *csa = &csdev->access;
 
 	/*
 	 * For as long as the tracer isn't disabled another entity can't
@@ -270,7 +271,7 @@ static void stm_disable(struct coresight_device *csdev,
 		spin_unlock(&drvdata->spinlock);
 
 		/* Wait until the engine has completely stopped */
-		coresight_timeout(drvdata->base, STMTCSR, STMTCSR_BUSY_BIT, 0);
+		coresight_timeout(csa, STMTCSR, STMTCSR_BUSY_BIT, 0);
 
 		pm_runtime_put(csdev->dev.parent);
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c
index e61b75be66b6..4dc1ea2c19b5 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-core.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-core.c
@@ -33,16 +33,20 @@ DEFINE_CORESIGHT_DEVLIST(etr_devs, "tmc_etr");
 
 void tmc_wait_for_tmcready(struct tmc_drvdata *drvdata)
 {
+	struct coresight_device *csdev = drvdata->csdev;
+	struct csdev_access *csa = &csdev->access;
+
 	/* Ensure formatter, unformatter and hardware fifo are empty */
-	if (coresight_timeout(drvdata->base,
-			      TMC_STS, TMC_STS_TMCREADY_BIT, 1)) {
-		dev_err(&drvdata->csdev->dev,
+	if (coresight_timeout(csa, TMC_STS, TMC_STS_TMCREADY_BIT, 1)) {
+		dev_err(&csdev->dev,
 			"timeout while waiting for TMC to be Ready\n");
 	}
 }
 
 void tmc_flush_and_stop(struct tmc_drvdata *drvdata)
 {
+	struct coresight_device *csdev = drvdata->csdev;
+	struct csdev_access *csa = &csdev->access;
 	u32 ffcr;
 
 	ffcr = readl_relaxed(drvdata->base + TMC_FFCR);
@@ -51,9 +55,8 @@ void tmc_flush_and_stop(struct tmc_drvdata *drvdata)
 	ffcr |= BIT(TMC_FFCR_FLUSHMAN_BIT);
 	writel_relaxed(ffcr, drvdata->base + TMC_FFCR);
 	/* Ensure flush completes */
-	if (coresight_timeout(drvdata->base,
-			      TMC_FFCR, TMC_FFCR_FLUSHMAN_BIT, 0)) {
-		dev_err(&drvdata->csdev->dev,
+	if (coresight_timeout(csa, TMC_FFCR, TMC_FFCR_FLUSHMAN_BIT, 0)) {
+		dev_err(&csdev->dev,
 		"timeout while waiting for completion of Manual Flush\n");
 	}
 
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c
index a12b6ee0a576..2ec057892799 100644
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -86,9 +86,9 @@ static void tpiu_disable_hw(struct csdev_access *csa)
 	/* Generate manual flush */
 	csdev_access_relaxed_write32(csa, FFCR_STOP_FI | FFCR_FON_MAN, TPIU_FFCR);
 	/* Wait for flush to complete */
-	coresight_timeout(csa->base, TPIU_FFCR, FFCR_FON_MAN_BIT, 0);
+	coresight_timeout(csa, TPIU_FFCR, FFCR_FON_MAN_BIT, 0);
 	/* Wait for formatter to stop */
-	coresight_timeout(csa->base, TPIU_FFSR, FFSR_FT_STOPPED_BIT, 1);
+	coresight_timeout(csa, TPIU_FFSR, FFSR_FT_STOPPED_BIT, 1);
 
 	CS_LOCK(csa->base);
 }
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 6107cf4021d3..18bc7f9fb041 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -460,7 +460,7 @@ coresight_register(struct coresight_desc *desc);
 extern void coresight_unregister(struct coresight_device *csdev);
 extern int coresight_enable(struct coresight_device *csdev);
 extern void coresight_disable(struct coresight_device *csdev);
-extern int coresight_timeout(void __iomem *addr, u32 offset,
+extern int coresight_timeout(struct csdev_access *csa, u32 offset,
 			     int position, int value);
 
 extern int coresight_claim_device(void __iomem *base);
@@ -491,8 +491,13 @@ static inline void coresight_unregister(struct coresight_device *csdev) {}
 static inline int
 coresight_enable(struct coresight_device *csdev) { return -ENOSYS; }
 static inline void coresight_disable(struct coresight_device *csdev) {}
-static inline int coresight_timeout(void __iomem *addr, u32 offset,
-				     int position, int value) { return 1; }
+
+static inline int coresight_timeout(struct csdev_access *csa, u32 offset,
+				    int position, int value)
+{
+	return 1;
+}
+
 static inline int coresight_claim_device_unlocked(void __iomem *base)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 8ce0029658ba16c52670e869b9ccde02ae7dec6b Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Mon, 1 Feb 2021 11:13:28 -0700
Subject: coresight: Convert claim/disclaim operations to use access wrappers

Convert the generic CLAIM tag management APIs to use the
device access layer abstraction.

Link: https://lore.kernel.org/r/20210110224850.1880240-7-suzuki.poulose@arm.com
Cc: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210201181351.1475223-9-mathieu.poirier@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-catu.c       |  6 +-
 drivers/hwtracing/coresight/coresight-core.c       | 66 +++++++++++++---------
 drivers/hwtracing/coresight/coresight-cti-core.c   | 17 +++---
 drivers/hwtracing/coresight/coresight-etb10.c      |  4 +-
 drivers/hwtracing/coresight/coresight-etm3x-core.c |  8 ++-
 drivers/hwtracing/coresight/coresight-etm4x-core.c |  4 +-
 drivers/hwtracing/coresight/coresight-funnel.c     |  6 +-
 drivers/hwtracing/coresight/coresight-replicator.c | 12 ++--
 drivers/hwtracing/coresight/coresight-tmc-etf.c    | 10 ++--
 drivers/hwtracing/coresight/coresight-tmc-etr.c    |  4 +-
 include/linux/coresight.h                          | 16 +++---
 11 files changed, 91 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c
index d6097454d399..9a0b9ce4a7da 100644
--- a/drivers/hwtracing/coresight/coresight-catu.c
+++ b/drivers/hwtracing/coresight/coresight-catu.c
@@ -412,6 +412,7 @@ static int catu_enable_hw(struct catu_drvdata *drvdata, void *data)
 	u32 control, mode;
 	struct etr_buf *etr_buf = data;
 	struct device *dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	if (catu_wait_for_ready(drvdata))
 		dev_warn(dev, "Timeout while waiting for READY\n");
@@ -422,7 +423,7 @@ static int catu_enable_hw(struct catu_drvdata *drvdata, void *data)
 		return -EBUSY;
 	}
 
-	rc = coresight_claim_device_unlocked(drvdata->base);
+	rc = coresight_claim_device_unlocked(csdev);
 	if (rc)
 		return rc;
 
@@ -466,9 +467,10 @@ static int catu_disable_hw(struct catu_drvdata *drvdata)
 {
 	int rc = 0;
 	struct device *dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	catu_write_control(drvdata, 0);
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 	if (catu_wait_for_ready(drvdata)) {
 		dev_info(dev, "Timeout while waiting for READY\n");
 		rc = -EAGAIN;
diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index 74985068f325..0062c8935653 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -145,30 +145,32 @@ static int coresight_find_link_outport(struct coresight_device *csdev,
 	return -ENODEV;
 }
 
-static inline u32 coresight_read_claim_tags(void __iomem *base)
+static inline u32 coresight_read_claim_tags(struct coresight_device *csdev)
 {
-	return readl_relaxed(base + CORESIGHT_CLAIMCLR);
+	return csdev_access_relaxed_read32(&csdev->access, CORESIGHT_CLAIMCLR);
 }
 
-static inline bool coresight_is_claimed_self_hosted(void __iomem *base)
+static inline bool coresight_is_claimed_self_hosted(struct coresight_device *csdev)
 {
-	return coresight_read_claim_tags(base) == CORESIGHT_CLAIM_SELF_HOSTED;
+	return coresight_read_claim_tags(csdev) == CORESIGHT_CLAIM_SELF_HOSTED;
 }
 
-static inline bool coresight_is_claimed_any(void __iomem *base)
+static inline bool coresight_is_claimed_any(struct coresight_device *csdev)
 {
-	return coresight_read_claim_tags(base) != 0;
+	return coresight_read_claim_tags(csdev) != 0;
 }
 
-static inline void coresight_set_claim_tags(void __iomem *base)
+static inline void coresight_set_claim_tags(struct coresight_device *csdev)
 {
-	writel_relaxed(CORESIGHT_CLAIM_SELF_HOSTED, base + CORESIGHT_CLAIMSET);
+	csdev_access_relaxed_write32(&csdev->access, CORESIGHT_CLAIM_SELF_HOSTED,
+				     CORESIGHT_CLAIMSET);
 	isb();
 }
 
-static inline void coresight_clear_claim_tags(void __iomem *base)
+static inline void coresight_clear_claim_tags(struct coresight_device *csdev)
 {
-	writel_relaxed(CORESIGHT_CLAIM_SELF_HOSTED, base + CORESIGHT_CLAIMCLR);
+	csdev_access_relaxed_write32(&csdev->access, CORESIGHT_CLAIM_SELF_HOSTED,
+				     CORESIGHT_CLAIMCLR);
 	isb();
 }
 
@@ -182,27 +184,33 @@ static inline void coresight_clear_claim_tags(void __iomem *base)
  * Called with CS_UNLOCKed for the component.
  * Returns : 0 on success
  */
-int coresight_claim_device_unlocked(void __iomem *base)
+int coresight_claim_device_unlocked(struct coresight_device *csdev)
 {
-	if (coresight_is_claimed_any(base))
+	if (WARN_ON(!csdev))
+		return -EINVAL;
+
+	if (coresight_is_claimed_any(csdev))
 		return -EBUSY;
 
-	coresight_set_claim_tags(base);
-	if (coresight_is_claimed_self_hosted(base))
+	coresight_set_claim_tags(csdev);
+	if (coresight_is_claimed_self_hosted(csdev))
 		return 0;
 	/* There was a race setting the tags, clean up and fail */
-	coresight_clear_claim_tags(base);
+	coresight_clear_claim_tags(csdev);
 	return -EBUSY;
 }
 EXPORT_SYMBOL_GPL(coresight_claim_device_unlocked);
 
-int coresight_claim_device(void __iomem *base)
+int coresight_claim_device(struct coresight_device *csdev)
 {
 	int rc;
 
-	CS_UNLOCK(base);
-	rc = coresight_claim_device_unlocked(base);
-	CS_LOCK(base);
+	if (WARN_ON(!csdev))
+		return -EINVAL;
+
+	CS_UNLOCK(csdev->access.base);
+	rc = coresight_claim_device_unlocked(csdev);
+	CS_LOCK(csdev->access.base);
 
 	return rc;
 }
@@ -212,11 +220,14 @@ EXPORT_SYMBOL_GPL(coresight_claim_device);
  * coresight_disclaim_device_unlocked : Clear the claim tags for the device.
  * Called with CS_UNLOCKed for the component.
  */
-void coresight_disclaim_device_unlocked(void __iomem *base)
+void coresight_disclaim_device_unlocked(struct coresight_device *csdev)
 {
 
-	if (coresight_is_claimed_self_hosted(base))
-		coresight_clear_claim_tags(base);
+	if (WARN_ON(!csdev))
+		return;
+
+	if (coresight_is_claimed_self_hosted(csdev))
+		coresight_clear_claim_tags(csdev);
 	else
 		/*
 		 * The external agent may have not honoured our claim
@@ -227,11 +238,14 @@ void coresight_disclaim_device_unlocked(void __iomem *base)
 }
 EXPORT_SYMBOL_GPL(coresight_disclaim_device_unlocked);
 
-void coresight_disclaim_device(void __iomem *base)
+void coresight_disclaim_device(struct coresight_device *csdev)
 {
-	CS_UNLOCK(base);
-	coresight_disclaim_device_unlocked(base);
-	CS_LOCK(base);
+	if (WARN_ON(!csdev))
+		return;
+
+	CS_UNLOCK(csdev->access.base);
+	coresight_disclaim_device_unlocked(csdev);
+	CS_LOCK(csdev->access.base);
 }
 EXPORT_SYMBOL_GPL(coresight_disclaim_device);
 
diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c
index b38a8db5f252..0c81eb9603ae 100644
--- a/drivers/hwtracing/coresight/coresight-cti-core.c
+++ b/drivers/hwtracing/coresight/coresight-cti-core.c
@@ -102,7 +102,7 @@ static int cti_enable_hw(struct cti_drvdata *drvdata)
 		goto cti_state_unchanged;
 
 	/* claim the device */
-	rc = coresight_claim_device(drvdata->base);
+	rc = coresight_claim_device(drvdata->csdev);
 	if (rc)
 		goto cti_err_not_enabled;
 
@@ -136,7 +136,7 @@ static void cti_cpuhp_enable_hw(struct cti_drvdata *drvdata)
 		goto cti_hp_not_enabled;
 
 	/* try to claim the device */
-	if (coresight_claim_device(drvdata->base))
+	if (coresight_claim_device(drvdata->csdev))
 		goto cti_hp_not_enabled;
 
 	cti_write_all_hw_regs(drvdata);
@@ -154,6 +154,7 @@ static int cti_disable_hw(struct cti_drvdata *drvdata)
 {
 	struct cti_config *config = &drvdata->config;
 	struct device *dev = &drvdata->csdev->dev;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	spin_lock(&drvdata->spinlock);
 
@@ -171,7 +172,7 @@ static int cti_disable_hw(struct cti_drvdata *drvdata)
 	writel_relaxed(0, drvdata->base + CTICONTROL);
 	config->hw_enabled = false;
 
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 	CS_LOCK(drvdata->base);
 	spin_unlock(&drvdata->spinlock);
 	pm_runtime_put(dev);
@@ -655,6 +656,7 @@ static int cti_cpu_pm_notify(struct notifier_block *nb, unsigned long cmd,
 			     void *v)
 {
 	struct cti_drvdata *drvdata;
+	struct coresight_device *csdev;
 	unsigned int cpu = smp_processor_id();
 	int notify_res = NOTIFY_OK;
 
@@ -662,6 +664,7 @@ static int cti_cpu_pm_notify(struct notifier_block *nb, unsigned long cmd,
 		return NOTIFY_OK;
 
 	drvdata = cti_cpu_drvdata[cpu];
+	csdev = drvdata->csdev;
 
 	if (WARN_ON_ONCE(drvdata->ctidev.cpu != cpu))
 		return NOTIFY_BAD;
@@ -673,13 +676,13 @@ static int cti_cpu_pm_notify(struct notifier_block *nb, unsigned long cmd,
 		/* CTI regs all static - we have a copy & nothing to save */
 		drvdata->config.hw_powered = false;
 		if (drvdata->config.hw_enabled)
-			coresight_disclaim_device(drvdata->base);
+			coresight_disclaim_device(csdev);
 		break;
 
 	case CPU_PM_ENTER_FAILED:
 		drvdata->config.hw_powered = true;
 		if (drvdata->config.hw_enabled) {
-			if (coresight_claim_device(drvdata->base))
+			if (coresight_claim_device(csdev))
 				drvdata->config.hw_enabled = false;
 		}
 		break;
@@ -692,7 +695,7 @@ static int cti_cpu_pm_notify(struct notifier_block *nb, unsigned long cmd,
 		/* check enable reference count to enable HW */
 		if (atomic_read(&drvdata->config.enable_req_count)) {
 			/* check we can claim the device as we re-power */
-			if (coresight_claim_device(drvdata->base))
+			if (coresight_claim_device(csdev))
 				goto cti_notify_exit;
 
 			drvdata->config.hw_enabled = true;
@@ -736,7 +739,7 @@ static int cti_dying_cpu(unsigned int cpu)
 	spin_lock(&drvdata->spinlock);
 	drvdata->config.hw_powered = false;
 	if (drvdata->config.hw_enabled)
-		coresight_disclaim_device(drvdata->base);
+		coresight_disclaim_device(drvdata->csdev);
 	spin_unlock(&drvdata->spinlock);
 	return 0;
 }
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index 0f664aeeda93..74922c94f4b1 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -132,7 +132,7 @@ static void __etb_enable_hw(struct etb_drvdata *drvdata)
 
 static int etb_enable_hw(struct etb_drvdata *drvdata)
 {
-	int rc = coresight_claim_device(drvdata->base);
+	int rc = coresight_claim_device(drvdata->csdev);
 
 	if (rc)
 		return rc;
@@ -345,7 +345,7 @@ static void etb_disable_hw(struct etb_drvdata *drvdata)
 {
 	__etb_disable_hw(drvdata);
 	etb_dump_hw(drvdata);
-	coresight_disclaim_device(drvdata->base);
+	coresight_disclaim_device(drvdata->csdev);
 }
 
 static int etb_disable(struct coresight_device *csdev)
diff --git a/drivers/hwtracing/coresight/coresight-etm3x-core.c b/drivers/hwtracing/coresight/coresight-etm3x-core.c
index 3b7837cbe376..29d4dba4bee9 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x-core.c
@@ -358,10 +358,11 @@ static int etm_enable_hw(struct etm_drvdata *drvdata)
 	int i, rc;
 	u32 etmcr;
 	struct etm_config *config = &drvdata->config;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	CS_UNLOCK(drvdata->base);
 
-	rc = coresight_claim_device_unlocked(drvdata->base);
+	rc = coresight_claim_device_unlocked(csdev);
 	if (rc)
 		goto done;
 
@@ -566,6 +567,7 @@ static void etm_disable_hw(void *info)
 	int i;
 	struct etm_drvdata *drvdata = info;
 	struct etm_config *config = &drvdata->config;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	CS_UNLOCK(drvdata->base);
 	etm_set_prog(drvdata);
@@ -577,7 +579,7 @@ static void etm_disable_hw(void *info)
 		config->cntr_val[i] = etm_readl(drvdata, ETMCNTVRn(i));
 
 	etm_set_pwrdwn(drvdata);
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 
 	CS_LOCK(drvdata->base);
 
@@ -602,7 +604,7 @@ static void etm_disable_perf(struct coresight_device *csdev)
 	 * power down the tracer.
 	 */
 	etm_set_pwrdwn(drvdata);
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 
 	CS_LOCK(drvdata->base);
 }
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index 180bb6ed9090..a041ad52737f 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -226,7 +226,7 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
 
 	etm4_os_unlock(drvdata);
 
-	rc = coresight_claim_device_unlocked(drvdata->base);
+	rc = coresight_claim_device_unlocked(csdev);
 	if (rc)
 		goto done;
 
@@ -635,7 +635,7 @@ static void etm4_disable_hw(void *info)
 			readl_relaxed(drvdata->base + TRCCNTVRn(i));
 	}
 
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 
 	CS_LOCK(drvdata->base);
 
diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c
index 8f7c40d7d8d6..38bebdddcbff 100644
--- a/drivers/hwtracing/coresight/coresight-funnel.c
+++ b/drivers/hwtracing/coresight/coresight-funnel.c
@@ -52,13 +52,14 @@ static int dynamic_funnel_enable_hw(struct funnel_drvdata *drvdata, int port)
 {
 	u32 functl;
 	int rc = 0;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	CS_UNLOCK(drvdata->base);
 
 	functl = readl_relaxed(drvdata->base + FUNNEL_FUNCTL);
 	/* Claim the device only when we enable the first slave */
 	if (!(functl & FUNNEL_ENSx_MASK)) {
-		rc = coresight_claim_device_unlocked(drvdata->base);
+		rc = coresight_claim_device_unlocked(csdev);
 		if (rc)
 			goto done;
 	}
@@ -101,6 +102,7 @@ static void dynamic_funnel_disable_hw(struct funnel_drvdata *drvdata,
 				      int inport)
 {
 	u32 functl;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	CS_UNLOCK(drvdata->base);
 
@@ -110,7 +112,7 @@ static void dynamic_funnel_disable_hw(struct funnel_drvdata *drvdata,
 
 	/* Disclaim the device if none of the slaves are now active */
 	if (!(functl & FUNNEL_ENSx_MASK))
-		coresight_disclaim_device_unlocked(drvdata->base);
+		coresight_disclaim_device_unlocked(csdev);
 
 	CS_LOCK(drvdata->base);
 }
diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c
index 205756fab729..a73fea9185b6 100644
--- a/drivers/hwtracing/coresight/coresight-replicator.c
+++ b/drivers/hwtracing/coresight/coresight-replicator.c
@@ -45,12 +45,14 @@ struct replicator_drvdata {
 
 static void dynamic_replicator_reset(struct replicator_drvdata *drvdata)
 {
+	struct coresight_device *csdev = drvdata->csdev;
+
 	CS_UNLOCK(drvdata->base);
 
-	if (!coresight_claim_device_unlocked(drvdata->base)) {
+	if (!coresight_claim_device_unlocked(csdev)) {
 		writel_relaxed(0xff, drvdata->base + REPLICATOR_IDFILTER0);
 		writel_relaxed(0xff, drvdata->base + REPLICATOR_IDFILTER1);
-		coresight_disclaim_device_unlocked(drvdata->base);
+		coresight_disclaim_device_unlocked(csdev);
 	}
 
 	CS_LOCK(drvdata->base);
@@ -70,6 +72,7 @@ static int dynamic_replicator_enable(struct replicator_drvdata *drvdata,
 {
 	int rc = 0;
 	u32 id0val, id1val;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	CS_UNLOCK(drvdata->base);
 
@@ -84,7 +87,7 @@ static int dynamic_replicator_enable(struct replicator_drvdata *drvdata,
 		id0val = id1val = 0xff;
 
 	if (id0val == 0xff && id1val == 0xff)
-		rc = coresight_claim_device_unlocked(drvdata->base);
+		rc = coresight_claim_device_unlocked(csdev);
 
 	if (!rc) {
 		switch (outport) {
@@ -140,6 +143,7 @@ static void dynamic_replicator_disable(struct replicator_drvdata *drvdata,
 				       int inport, int outport)
 {
 	u32 reg;
+	struct coresight_device *csdev = drvdata->csdev;
 
 	switch (outport) {
 	case 0:
@@ -160,7 +164,7 @@ static void dynamic_replicator_disable(struct replicator_drvdata *drvdata,
 
 	if ((readl_relaxed(drvdata->base + REPLICATOR_IDFILTER0) == 0xff) &&
 	    (readl_relaxed(drvdata->base + REPLICATOR_IDFILTER1) == 0xff))
-		coresight_disclaim_device_unlocked(drvdata->base);
+		coresight_disclaim_device_unlocked(csdev);
 	CS_LOCK(drvdata->base);
 }
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index 989d965f3d90..45b85edfc690 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -37,7 +37,7 @@ static void __tmc_etb_enable_hw(struct tmc_drvdata *drvdata)
 
 static int tmc_etb_enable_hw(struct tmc_drvdata *drvdata)
 {
-	int rc = coresight_claim_device(drvdata->base);
+	int rc = coresight_claim_device(drvdata->csdev);
 
 	if (rc)
 		return rc;
@@ -88,7 +88,7 @@ static void __tmc_etb_disable_hw(struct tmc_drvdata *drvdata)
 static void tmc_etb_disable_hw(struct tmc_drvdata *drvdata)
 {
 	__tmc_etb_disable_hw(drvdata);
-	coresight_disclaim_device(drvdata->base);
+	coresight_disclaim_device(drvdata->csdev);
 }
 
 static void __tmc_etf_enable_hw(struct tmc_drvdata *drvdata)
@@ -109,7 +109,7 @@ static void __tmc_etf_enable_hw(struct tmc_drvdata *drvdata)
 
 static int tmc_etf_enable_hw(struct tmc_drvdata *drvdata)
 {
-	int rc = coresight_claim_device(drvdata->base);
+	int rc = coresight_claim_device(drvdata->csdev);
 
 	if (rc)
 		return rc;
@@ -120,11 +120,13 @@ static int tmc_etf_enable_hw(struct tmc_drvdata *drvdata)
 
 static void tmc_etf_disable_hw(struct tmc_drvdata *drvdata)
 {
+	struct coresight_device *csdev = drvdata->csdev;
+
 	CS_UNLOCK(drvdata->base);
 
 	tmc_flush_and_stop(drvdata);
 	tmc_disable_hw(drvdata);
-	coresight_disclaim_device_unlocked(drvdata->base);
+	coresight_disclaim_device_unlocked(csdev);
 	CS_LOCK(drvdata->base);
 }
 
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index bf5230e39c5b..acdb59e0e661 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1040,7 +1040,7 @@ static int tmc_etr_enable_hw(struct tmc_drvdata *drvdata,
 	rc = tmc_etr_enable_catu(drvdata, etr_buf);
 	if (rc)
 		return rc;
-	rc = coresight_claim_device(drvdata->base);
+	rc = coresight_claim_device(drvdata->csdev);
 	if (!rc) {
 		drvdata->etr_buf = etr_buf;
 		__tmc_etr_enable_hw(drvdata);
@@ -1134,7 +1134,7 @@ void tmc_etr_disable_hw(struct tmc_drvdata *drvdata)
 	__tmc_etr_disable_hw(drvdata);
 	/* Disable CATU device if this ETR is connected to one */
 	tmc_etr_disable_catu(drvdata);
-	coresight_disclaim_device(drvdata->base);
+	coresight_disclaim_device(drvdata->csdev);
 	/* Reset the ETR buf used by hardware */
 	drvdata->etr_buf = NULL;
 }
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 18bc7f9fb041..976ec2697610 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -463,11 +463,11 @@ extern void coresight_disable(struct coresight_device *csdev);
 extern int coresight_timeout(struct csdev_access *csa, u32 offset,
 			     int position, int value);
 
-extern int coresight_claim_device(void __iomem *base);
-extern int coresight_claim_device_unlocked(void __iomem *base);
+extern int coresight_claim_device(struct coresight_device *csdev);
+extern int coresight_claim_device_unlocked(struct coresight_device *csdev);
 
-extern void coresight_disclaim_device(void __iomem *base);
-extern void coresight_disclaim_device_unlocked(void __iomem *base);
+extern void coresight_disclaim_device(struct coresight_device *csdev);
+extern void coresight_disclaim_device_unlocked(struct coresight_device *csdev);
 extern char *coresight_alloc_device_name(struct coresight_dev_list *devs,
 					 struct device *dev);
 
@@ -498,18 +498,18 @@ static inline int coresight_timeout(struct csdev_access *csa, u32 offset,
 	return 1;
 }
 
-static inline int coresight_claim_device_unlocked(void __iomem *base)
+static inline int coresight_claim_device_unlocked(struct coresight_device *csdev)
 {
 	return -EINVAL;
 }
 
-static inline int coresight_claim_device(void __iomem *base)
+static inline int coresight_claim_device(struct coresight_device *csdev)
 {
 	return -EINVAL;
 }
 
-static inline void coresight_disclaim_device(void __iomem *base) {}
-static inline void coresight_disclaim_device_unlocked(void __iomem *base) {}
+static inline void coresight_disclaim_device(struct coresight_device *csdev) {}
+static inline void coresight_disclaim_device_unlocked(struct coresight_device *csdev) {}
 
 static inline bool coresight_loses_context_with_cpu(struct device *dev)
 {
-- 
cgit v1.2.3


From b64afd949ee3a61e180813859b50aced26023c55 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Tue, 2 Feb 2021 18:15:37 -0800
Subject: platform/chrome: cros_ec: Import Type C control command

This command is used to communicate with the Chrome Embedded Controller
(EC) regarding USB Type C events and state.

These header updates are included in the latest Chrome OS EC headers [1]

[1]
https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/main/include/ec_commands.h

Signed-off-by: Prashant Malani <pmalani@chromium.org>
Link: https://lore.kernel.org/r/20210203021539.745239-1-pmalani@chromium.org
Signed-off-by: Benson Leung <bleung@chromium.org>
---
 include/linux/platform_data/cros_ec_commands.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 9787715540c7..50e1caad81be 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -5577,6 +5577,32 @@ struct ec_response_typec_discovery {
 	struct svid_mode_info svids[0];
 } __ec_align1;
 
+/* USB Type-C commands for AP-controlled device policy. */
+#define EC_CMD_TYPEC_CONTROL 0x0132
+
+enum typec_control_command {
+	TYPEC_CONTROL_COMMAND_EXIT_MODES,
+	TYPEC_CONTROL_COMMAND_CLEAR_EVENTS,
+	TYPEC_CONTROL_COMMAND_ENTER_MODE,
+};
+
+struct ec_params_typec_control {
+	uint8_t port;
+	uint8_t command;	/* enum typec_control_command */
+	uint16_t reserved;
+
+	/*
+	 * This section will be interpreted based on |command|. Define a
+	 * placeholder structure to avoid having to increase the size and bump
+	 * the command version when adding new sub-commands.
+	 */
+	union {
+		uint32_t clear_events_mask;
+		uint8_t mode_to_enter;      /* enum typec_mode */
+		uint8_t placeholder[128];
+	};
+} __ec_align1;
+
 /*
  * Gather all status information for a port.
  *
-- 
cgit v1.2.3


From 5ae4a4b45d4396aa7f7c008c4ae9eca981d43f8c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 2 Feb 2021 10:25:11 +0530
Subject: cpufreq: Remove CPUFREQ_STICKY flag

During cpufreq driver's registration, if the ->init() callback for all
the CPUs fail then there is not much point in keeping the driver around
as it will only account for more of unnecessary noise, for example
cpufreq core will try to suspend/resume the driver which never got
registered properly.

The removal of such a driver is avoided if the driver carries the
CPUFREQ_STICKY flag. This was added way back [1] in 2004 and perhaps no
one should ever need it now. A lot of drivers do set this flag, probably
because they just copied it from other drivers.

This was added earlier for some platforms [2] because their cpufreq
drivers were getting registered before the CPUs were registered with
subsys framework. And hence they used to fail.

The same isn't true anymore though. The current code flow in the kernel
is:

start_kernel()
-> kernel_init()
   -> kernel_init_freeable()
      -> do_basic_setup()
         -> driver_init()
            -> cpu_dev_init()
               -> subsys_system_register() //For CPUs

         -> do_initcalls()
            -> cpufreq_register_driver()

Clearly, the CPUs will always get registered with subsys framework
before any cpufreq driver can get probed. Remove the flag and update the
relevant drivers.

Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/include/linux/cpufreq.h?id=7cc9f0d9a1ab04cedc60d64fd8dcf7df224a3b4d # [1]
Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/arch/arm/mach-sa1100/cpu-sa1100.c?id=f59d3bbe35f6268d729f51be82af8325d62f20f5 # [2]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt.c           |  2 +-
 drivers/cpufreq/cpufreq.c              |  3 +--
 drivers/cpufreq/davinci-cpufreq.c      |  2 +-
 drivers/cpufreq/loongson1-cpufreq.c    |  2 +-
 drivers/cpufreq/mediatek-cpufreq.c     |  2 +-
 drivers/cpufreq/omap-cpufreq.c         |  2 +-
 drivers/cpufreq/qcom-cpufreq-hw.c      |  2 +-
 drivers/cpufreq/s3c24xx-cpufreq.c      |  2 +-
 drivers/cpufreq/s5pv210-cpufreq.c      |  2 +-
 drivers/cpufreq/sa1100-cpufreq.c       |  2 +-
 drivers/cpufreq/sa1110-cpufreq.c       |  2 +-
 drivers/cpufreq/scmi-cpufreq.c         |  2 +-
 drivers/cpufreq/scpi-cpufreq.c         |  2 +-
 drivers/cpufreq/spear-cpufreq.c        |  2 +-
 drivers/cpufreq/tegra186-cpufreq.c     |  2 +-
 drivers/cpufreq/tegra194-cpufreq.c     |  3 +--
 drivers/cpufreq/vexpress-spc-cpufreq.c |  3 +--
 include/linux/cpufreq.h                | 17 +++++++----------
 18 files changed, 24 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index ad4234518ef6..b1e1bdc63b01 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -175,7 +175,7 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_driver dt_cpufreq_driver = {
-	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+	.flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK |
 		 CPUFREQ_IS_COOLING_DEV,
 	.verify = cpufreq_generic_frequency_table_verify,
 	.target_index = set_target,
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index d0a3525ce27f..7d0ae968def7 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2810,8 +2810,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	if (ret)
 		goto err_boost_unreg;
 
-	if (!(cpufreq_driver->flags & CPUFREQ_STICKY) &&
-	    list_empty(&cpufreq_policy_list)) {
+	if (unlikely(list_empty(&cpufreq_policy_list))) {
 		/* if all ->init() calls failed, unregister */
 		ret = -ENODEV;
 		pr_debug("%s: No CPU initialized for driver %s\n", __func__,
diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c
index 91f477a6cbc4..9e97f60f8199 100644
--- a/drivers/cpufreq/davinci-cpufreq.c
+++ b/drivers/cpufreq/davinci-cpufreq.c
@@ -95,7 +95,7 @@ static int davinci_cpu_init(struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_driver davinci_driver = {
-	.flags		= CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= davinci_target,
 	.get		= cpufreq_generic_get,
diff --git a/drivers/cpufreq/loongson1-cpufreq.c b/drivers/cpufreq/loongson1-cpufreq.c
index 86f612593e49..fb72d709db56 100644
--- a/drivers/cpufreq/loongson1-cpufreq.c
+++ b/drivers/cpufreq/loongson1-cpufreq.c
@@ -116,7 +116,7 @@ static int ls1x_cpufreq_exit(struct cpufreq_policy *policy)
 
 static struct cpufreq_driver ls1x_cpufreq_driver = {
 	.name		= "cpufreq-ls1x",
-	.flags		= CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= ls1x_cpufreq_target,
 	.get		= cpufreq_generic_get,
diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c
index 022e3e966e71..f2e491b25b07 100644
--- a/drivers/cpufreq/mediatek-cpufreq.c
+++ b/drivers/cpufreq/mediatek-cpufreq.c
@@ -463,7 +463,7 @@ static int mtk_cpufreq_exit(struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_driver mtk_cpufreq_driver = {
-	.flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+	.flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK |
 		 CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
 		 CPUFREQ_IS_COOLING_DEV,
 	.verify = cpufreq_generic_frequency_table_verify,
diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c
index 3694bb030df3..e035ee216b0f 100644
--- a/drivers/cpufreq/omap-cpufreq.c
+++ b/drivers/cpufreq/omap-cpufreq.c
@@ -144,7 +144,7 @@ static int omap_cpu_exit(struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_driver omap_driver = {
-	.flags		= CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= omap_target,
 	.get		= cpufreq_generic_get,
diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index 9ed5341dc515..2a3b4f44488b 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -374,7 +374,7 @@ static struct freq_attr *qcom_cpufreq_hw_attr[] = {
 };
 
 static struct cpufreq_driver cpufreq_qcom_hw_driver = {
-	.flags		= CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK |
 			  CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
 			  CPUFREQ_IS_COOLING_DEV,
 	.verify		= cpufreq_generic_frequency_table_verify,
diff --git a/drivers/cpufreq/s3c24xx-cpufreq.c b/drivers/cpufreq/s3c24xx-cpufreq.c
index 37efc0dc3f91..7380c32b238e 100644
--- a/drivers/cpufreq/s3c24xx-cpufreq.c
+++ b/drivers/cpufreq/s3c24xx-cpufreq.c
@@ -420,7 +420,7 @@ static int s3c_cpufreq_resume(struct cpufreq_policy *policy)
 #endif
 
 static struct cpufreq_driver s3c24xx_driver = {
-	.flags		= CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK,
 	.target		= s3c_cpufreq_target,
 	.get		= cpufreq_generic_get,
 	.init		= s3c_cpufreq_init,
diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c
index bed496cf8d24..69786e5bbf05 100644
--- a/drivers/cpufreq/s5pv210-cpufreq.c
+++ b/drivers/cpufreq/s5pv210-cpufreq.c
@@ -574,7 +574,7 @@ static int s5pv210_cpufreq_reboot_notifier_event(struct notifier_block *this,
 }
 
 static struct cpufreq_driver s5pv210_driver = {
-	.flags		= CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= s5pv210_target,
 	.get		= cpufreq_generic_get,
diff --git a/drivers/cpufreq/sa1100-cpufreq.c b/drivers/cpufreq/sa1100-cpufreq.c
index 5c075ef6adc0..252b9fc26124 100644
--- a/drivers/cpufreq/sa1100-cpufreq.c
+++ b/drivers/cpufreq/sa1100-cpufreq.c
@@ -186,7 +186,7 @@ static int __init sa1100_cpu_init(struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_driver sa1100_driver __refdata = {
-	.flags		= CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK |
 			  CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= sa1100_target,
diff --git a/drivers/cpufreq/sa1110-cpufreq.c b/drivers/cpufreq/sa1110-cpufreq.c
index d9d04d935b3a..1a83c8678a63 100644
--- a/drivers/cpufreq/sa1110-cpufreq.c
+++ b/drivers/cpufreq/sa1110-cpufreq.c
@@ -310,7 +310,7 @@ static int __init sa1110_cpu_init(struct cpufreq_policy *policy)
 /* sa1110_driver needs __refdata because it must remain after init registers
  * it with cpufreq_register_driver() */
 static struct cpufreq_driver sa1110_driver __refdata = {
-	.flags		= CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK |
+	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK |
 			  CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= sa1110_target,
diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 491a0a24fb1e..5bd03b59887f 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -217,7 +217,7 @@ static int scmi_cpufreq_exit(struct cpufreq_policy *policy)
 
 static struct cpufreq_driver scmi_cpufreq_driver = {
 	.name	= "scmi",
-	.flags	= CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
+	.flags	= CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
 		  CPUFREQ_NEED_INITIAL_FREQ_CHECK |
 		  CPUFREQ_IS_COOLING_DEV,
 	.verify	= cpufreq_generic_frequency_table_verify,
diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c
index e5140ad63db8..d6a698a1b5d1 100644
--- a/drivers/cpufreq/scpi-cpufreq.c
+++ b/drivers/cpufreq/scpi-cpufreq.c
@@ -191,7 +191,7 @@ static int scpi_cpufreq_exit(struct cpufreq_policy *policy)
 
 static struct cpufreq_driver scpi_cpufreq_driver = {
 	.name	= "scpi-cpufreq",
-	.flags	= CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
+	.flags	= CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
 		  CPUFREQ_NEED_INITIAL_FREQ_CHECK |
 		  CPUFREQ_IS_COOLING_DEV,
 	.verify	= cpufreq_generic_frequency_table_verify,
diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c
index 73bd8dc47074..7d0d62a06bf3 100644
--- a/drivers/cpufreq/spear-cpufreq.c
+++ b/drivers/cpufreq/spear-cpufreq.c
@@ -160,7 +160,7 @@ static int spear_cpufreq_init(struct cpufreq_policy *policy)
 
 static struct cpufreq_driver spear_cpufreq_driver = {
 	.name		= "cpufreq-spear",
-	.flags		= CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= spear_cpufreq_target,
 	.get		= cpufreq_generic_get,
diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c
index e566ea298b59..5d1943e787b0 100644
--- a/drivers/cpufreq/tegra186-cpufreq.c
+++ b/drivers/cpufreq/tegra186-cpufreq.c
@@ -117,7 +117,7 @@ static unsigned int tegra186_cpufreq_get(unsigned int cpu)
 
 static struct cpufreq_driver tegra186_cpufreq_driver = {
 	.name = "tegra186",
-	.flags = CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
+	.flags = CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
 			CPUFREQ_NEED_INITIAL_FREQ_CHECK,
 	.get = tegra186_cpufreq_get,
 	.verify = cpufreq_generic_frequency_table_verify,
diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c
index 6a67f36f3b80..a9620e4489ae 100644
--- a/drivers/cpufreq/tegra194-cpufreq.c
+++ b/drivers/cpufreq/tegra194-cpufreq.c
@@ -272,8 +272,7 @@ static int tegra194_cpufreq_set_target(struct cpufreq_policy *policy,
 
 static struct cpufreq_driver tegra194_cpufreq_driver = {
 	.name = "tegra194",
-	.flags = CPUFREQ_STICKY | CPUFREQ_CONST_LOOPS |
-		CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
 	.verify = cpufreq_generic_frequency_table_verify,
 	.target_index = tegra194_cpufreq_set_target,
 	.get = tegra194_get_speed,
diff --git a/drivers/cpufreq/vexpress-spc-cpufreq.c b/drivers/cpufreq/vexpress-spc-cpufreq.c
index f711d8eaea6a..51dfa9ae6cf5 100644
--- a/drivers/cpufreq/vexpress-spc-cpufreq.c
+++ b/drivers/cpufreq/vexpress-spc-cpufreq.c
@@ -486,8 +486,7 @@ static void ve_spc_cpufreq_ready(struct cpufreq_policy *policy)
 
 static struct cpufreq_driver ve_spc_cpufreq_driver = {
 	.name			= "vexpress-spc",
-	.flags			= CPUFREQ_STICKY |
-					CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
+	.flags			= CPUFREQ_HAVE_GOVERNOR_PER_POLICY |
 					CPUFREQ_NEED_INITIAL_FREQ_CHECK,
 	.verify			= cpufreq_generic_frequency_table_verify,
 	.target_index		= ve_spc_cpufreq_set_target,
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 9c8b7437b6cd..c8e40e91fe9b 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -387,8 +387,13 @@ struct cpufreq_driver {
 
 /* flags */
 
-/* driver isn't removed even if all ->init() calls failed */
-#define CPUFREQ_STICKY				BIT(0)
+/*
+ * Set by drivers that need to update internale upper and lower boundaries along
+ * with the target frequency and so the core and governors should also invoke
+ * the diver if the target frequency does not change, but the policy min or max
+ * may have changed.
+ */
+#define CPUFREQ_NEED_UPDATE_LIMITS		BIT(0)
 
 /* loops_per_jiffy or other kernel "constants" aren't affected by frequency transitions */
 #define CPUFREQ_CONST_LOOPS			BIT(1)
@@ -432,14 +437,6 @@ struct cpufreq_driver {
  */
 #define CPUFREQ_IS_COOLING_DEV			BIT(7)
 
-/*
- * Set by drivers that need to update internale upper and lower boundaries along
- * with the target frequency and so the core and governors should also invoke
- * the diver if the target frequency does not change, but the policy min or max
- * may have changed.
- */
-#define CPUFREQ_NEED_UPDATE_LIMITS		BIT(8)
-
 int cpufreq_register_driver(struct cpufreq_driver *driver_data);
 int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
 
-- 
cgit v1.2.3


From 2f0531869fd22182e769b10dd6cf151861ede791 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 2 Feb 2021 11:11:55 +0530
Subject: cpufreq: Remove unused flag CPUFREQ_PM_NO_WARN

This flag is set by one of the drivers but it isn't used in the code
otherwise. Remove the unused flag and update the driver.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/pmac32-cpufreq.c |  3 +--
 include/linux/cpufreq.h          | 13 +++++--------
 2 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c
index 73621bc11976..4f20c6a9108d 100644
--- a/drivers/cpufreq/pmac32-cpufreq.c
+++ b/drivers/cpufreq/pmac32-cpufreq.c
@@ -439,8 +439,7 @@ static struct cpufreq_driver pmac_cpufreq_driver = {
 	.init		= pmac_cpufreq_cpu_init,
 	.suspend	= pmac_cpufreq_suspend,
 	.resume		= pmac_cpufreq_resume,
-	.flags		= CPUFREQ_PM_NO_WARN |
-			  CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING,
+	.flags		= CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING,
 	.attr		= cpufreq_generic_attr,
 	.name		= "powermac",
 };
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index c8e40e91fe9b..353969c7acd3 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -398,8 +398,11 @@ struct cpufreq_driver {
 /* loops_per_jiffy or other kernel "constants" aren't affected by frequency transitions */
 #define CPUFREQ_CONST_LOOPS			BIT(1)
 
-/* don't warn on suspend/resume speed mismatches */
-#define CPUFREQ_PM_NO_WARN			BIT(2)
+/*
+ * Set by drivers that want the core to automatically register the cpufreq
+ * driver as a thermal cooling device.
+ */
+#define CPUFREQ_IS_COOLING_DEV			BIT(2)
 
 /*
  * This should be set by platforms having multiple clock-domains, i.e.
@@ -431,12 +434,6 @@ struct cpufreq_driver {
  */
 #define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING	BIT(6)
 
-/*
- * Set by drivers that want the core to automatically register the cpufreq
- * driver as a thermal cooling device.
- */
-#define CPUFREQ_IS_COOLING_DEV			BIT(7)
-
 int cpufreq_register_driver(struct cpufreq_driver *driver_data);
 int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
 
-- 
cgit v1.2.3


From 7e3ce05e7f650371061d0b9eec1e1cf74ed6fca0 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Wed, 3 Feb 2021 22:48:16 -0300
Subject: netlink: add tracepoint at NL_SET_ERR_MSG

Often userspace won't request the extack information, or they don't log it
because of log level or so, and even when they do, sometimes it's not
enough to know exactly what caused the error.

Netlink extack is the standard way of reporting erros with descriptive
error messages. With a trace point on it, we then can know exactly where
the error happened, regardless of userspace app. Also, we can even see if
the err msg was overwritten.

The wrapper do_trace_netlink_extack() is because trace points shouldn't be
called from .h files, as trace points are not that small, and the function
call to do_trace_netlink_extack() on the macros is not protected by
tracepoint_enabled() because the macros are called from modules, and this
would require exporting some trace structs. As this is error path, it's
better to export just the wrapper instead.

v2: removed leftover tracepoint declaration

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/4546b63e67b2989789d146498b13cc09e1fdc543.1612403190.git.marcelo.leitner@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netlink.h        |  6 ++++++
 include/trace/events/netlink.h | 29 +++++++++++++++++++++++++++++
 net/netlink/af_netlink.c       |  8 ++++++++
 3 files changed, 43 insertions(+)
 create mode 100644 include/trace/events/netlink.h

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 9f118771e248..0bcf98098c5a 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -11,6 +11,8 @@
 
 struct net;
 
+void do_trace_netlink_extack(const char *msg);
+
 static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
 {
 	return (struct nlmsghdr *)skb->data;
@@ -90,6 +92,8 @@ struct netlink_ext_ack {
 	static const char __msg[] = msg;		\
 	struct netlink_ext_ack *__extack = (extack);	\
 							\
+	do_trace_netlink_extack(__msg);			\
+							\
 	if (__extack)					\
 		__extack->_msg = __msg;			\
 } while (0)
@@ -110,6 +114,8 @@ struct netlink_ext_ack {
 	static const char __msg[] = msg;			\
 	struct netlink_ext_ack *__extack = (extack);		\
 								\
+	do_trace_netlink_extack(__msg);				\
+								\
 	if (__extack) {						\
 		__extack->_msg = __msg;				\
 		__extack->bad_attr = (attr);			\
diff --git a/include/trace/events/netlink.h b/include/trace/events/netlink.h
new file mode 100644
index 000000000000..3b7be3b386a4
--- /dev/null
+++ b/include/trace/events/netlink.h
@@ -0,0 +1,29 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM netlink
+
+#if !defined(_TRACE_NETLINK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NETLINK_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(netlink_extack,
+
+	TP_PROTO(const char *msg),
+
+	TP_ARGS(msg),
+
+	TP_STRUCT__entry(
+		__string(	msg,	msg	)
+	),
+
+	TP_fast_assign(
+		__assign_str(msg, msg);
+	),
+
+	TP_printk("msg=%s", __get_str(msg))
+);
+
+#endif /* _TRACE_NETLINK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index daca50d6bb12..dd488938447f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -67,6 +67,8 @@
 #include <net/sock.h>
 #include <net/scm.h>
 #include <net/netlink.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/netlink.h>
 
 #include "af_netlink.h"
 
@@ -147,6 +149,12 @@ static BLOCKING_NOTIFIER_HEAD(netlink_chain);
 
 static const struct rhashtable_params netlink_rhashtable_params;
 
+void do_trace_netlink_extack(const char *msg)
+{
+	trace_netlink_extack(msg);
+}
+EXPORT_SYMBOL(do_trace_netlink_extack);
+
 static inline u32 netlink_group_mask(u32 group)
 {
 	return group ? 1 << (group - 1) : 0;
-- 
cgit v1.2.3


From 0053859496baa17c7675526936677c9213bf5a0d Mon Sep 17 00:00:00 2001
From: Brian Vazquez <brianvv@google.com>
Date: Thu, 4 Feb 2021 18:18:38 +0000
Subject: net: add EXPORT_INDIRECT_CALLABLE wrapper

When a static function is annotated with INDIRECT_CALLABLE_SCOPE and
CONFIG_RETPOLINE is set, the static keyword is removed. Sometimes the
function needs to be exported but EXPORT_SYMBOL can't be used because if
CONFIG_RETPOLINE is not set, we will attempt to export a static symbol.

This patch introduces a new indirect call wrapper:
EXPORT_INDIRECT_CALLABLE. This basically does EXPORT_SYMBOL when
CONFIG_RETPOLINE is set, but does nothing when it's not.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Brian Vazquez <brianvv@google.com>
Link: https://lore.kernel.org/r/20210204181839.558951-1-brianvv@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/indirect_call_wrapper.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h
index 54c02c84906a..a8345c8a613d 100644
--- a/include/linux/indirect_call_wrapper.h
+++ b/include/linux/indirect_call_wrapper.h
@@ -36,6 +36,7 @@
 
 #define INDIRECT_CALLABLE_DECLARE(f)	f
 #define INDIRECT_CALLABLE_SCOPE
+#define EXPORT_INDIRECT_CALLABLE(f)	EXPORT_SYMBOL(f)
 
 #else
 #define INDIRECT_CALL_1(f, f1, ...) f(__VA_ARGS__)
@@ -44,6 +45,7 @@
 #define INDIRECT_CALL_4(f, f4, f3, f2, f1, ...) f(__VA_ARGS__)
 #define INDIRECT_CALLABLE_DECLARE(f)
 #define INDIRECT_CALLABLE_SCOPE		static
+#define EXPORT_INDIRECT_CALLABLE(f)
 #endif
 
 /*
-- 
cgit v1.2.3


From 1d7bab6a94458e959f3f55788fd50ddc7d97403b Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Tue, 2 Feb 2021 13:30:54 +0000
Subject: mm: constify page_is_pfmemalloc() argument

The function only tests for page->index, so its argument should be
const.

Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecdf8a8cd6ae..078633d43af9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1584,7 +1584,7 @@ struct address_space *page_mapping_file(struct page *page);
  * ALLOC_NO_WATERMARKS and the low watermark was not
  * met implying that the system is under some pressure.
  */
-static inline bool page_is_pfmemalloc(struct page *page)
+static inline bool page_is_pfmemalloc(const struct page *page)
 {
 	/*
 	 * Page index cannot be this large so this must be
-- 
cgit v1.2.3


From 48f971c9c80a728646fc03367a28df747f20d0f4 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Tue, 2 Feb 2021 13:31:05 +0000
Subject: skbuff: constify skb_propagate_pfmemalloc() "page" argument

The function doesn't write anything to the page struct itself,
so this argument can be const.

Misc: align second argument to the brace while at it.

Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9313b5aaf45b..b027526da4f9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2943,8 +2943,8 @@ static inline struct page *dev_alloc_page(void)
  *	@page: The page that was allocated from skb_alloc_page
  *	@skb: The skb that may need pfmemalloc set
  */
-static inline void skb_propagate_pfmemalloc(struct page *page,
-					     struct sk_buff *skb)
+static inline void skb_propagate_pfmemalloc(const struct page *page,
+					    struct sk_buff *skb)
 {
 	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc = true;
-- 
cgit v1.2.3


From bc38f30f8dbce0afb8af05d917bee084b1329418 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Tue, 2 Feb 2021 13:31:21 +0000
Subject: net: introduce common dev_page_is_reusable()

A bunch of drivers test the page before reusing/recycling for two
common conditions:
 - if a page was allocated under memory pressure (pfmemalloc page);
 - if a page was allocated at a distant memory node (to exclude
   slowdowns).

Introduce a new common inline for doing this, with likely() already
folded inside to make driver code a bit simpler.

Suggested-by: David Rientjes <rientjes@google.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b027526da4f9..0e42c53b8ca9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2938,6 +2938,22 @@ static inline struct page *dev_alloc_page(void)
 	return dev_alloc_pages(0);
 }
 
+/**
+ * dev_page_is_reusable - check whether a page can be reused for network Rx
+ * @page: the page to test
+ *
+ * A page shouldn't be considered for reusing/recycling if it was allocated
+ * under memory pressure or at a distant memory node.
+ *
+ * Returns false if this page should be returned to page allocator, true
+ * otherwise.
+ */
+static inline bool dev_page_is_reusable(const struct page *page)
+{
+	return likely(page_to_nid(page) == numa_mem_id() &&
+		      !page_is_pfmemalloc(page));
+}
+
 /**
  *	skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
  *	@page: The page that was allocated from skb_alloc_page
-- 
cgit v1.2.3


From 1faba27f11c8da244e793546a1b35a9b1da8208e Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 3 Feb 2021 15:51:09 +0200
Subject: ipv6: silence compilation warning for non-IPV6 builds

The W=1 compilation of allmodconfig generates the following warning:

net/ipv6/icmp.c:448:6: warning: no previous prototype for 'icmp6_send' [-Wmissing-prototypes]
  448 | void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
      |      ^~~~~~~~~~

Fix it by providing function declaration for builds with ipv6 as a module.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/icmpv6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index 1b3371ae8193..452d8978ffc7 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -16,9 +16,9 @@ static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb)
 
 typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 			     const struct in6_addr *force_saddr);
-#if IS_BUILTIN(CONFIG_IPV6)
 void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 		const struct in6_addr *force_saddr);
+#if IS_BUILTIN(CONFIG_IPV6)
 static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
 {
 	icmp6_send(skb, type, code, info, NULL);
-- 
cgit v1.2.3


From 52cbd23a119c6ebf40a527e53f3402d2ea38eccb Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 3 Feb 2021 14:29:52 -0500
Subject: udp: fix skb_copy_and_csum_datagram with odd segment sizes

When iteratively computing a checksum with csum_block_add, track the
offset "pos" to correctly rotate in csum_block_add when offset is odd.

The open coded implementation of skb_copy_and_csum_datagram did this.
With the switch to __skb_datagram_iter calling csum_and_copy_to_iter,
pos was reinitialized to 0 on each call.

Bring back the pos by passing it along with the csum to the callback.

Changes v1->v2
  - pass csum value, instead of csump pointer (Alexander Duyck)

Link: https://lore.kernel.org/netdev/20210128152353.GB27281@optiplex/
Fixes: 950fcaecd5cc ("datagram: consolidate datagram copy to iter helpers")
Reported-by: Oliver Graute <oliver.graute@gmail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20210203192952.1849843-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/uio.h |  8 +++++++-
 lib/iov_iter.c      | 24 ++++++++++++++----------
 net/core/datagram.c | 12 ++++++++++--
 3 files changed, 31 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 72d88566694e..27ff8eb786dc 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -260,7 +260,13 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
 {
 	i->count = count;
 }
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i);
+
+struct csum_state {
+	__wsum csum;
+	size_t off;
+};
+
+size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i);
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index a21e6a5792c5..f0b2ccb1bb01 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -592,14 +592,15 @@ static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
 }
 
 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
-				__wsum *csum, struct iov_iter *i)
+					 struct csum_state *csstate,
+					 struct iov_iter *i)
 {
 	struct pipe_inode_info *pipe = i->pipe;
 	unsigned int p_mask = pipe->ring_size - 1;
+	__wsum sum = csstate->csum;
+	size_t off = csstate->off;
 	unsigned int i_head;
 	size_t n, r;
-	size_t off = 0;
-	__wsum sum = *csum;
 
 	if (!sanity(i))
 		return 0;
@@ -621,7 +622,8 @@ static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
 		i_head++;
 	} while (n);
 	i->count -= bytes;
-	*csum = sum;
+	csstate->csum = sum;
+	csstate->off = off;
 	return bytes;
 }
 
@@ -1522,18 +1524,19 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
 }
 EXPORT_SYMBOL(csum_and_copy_from_iter_full);
 
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump,
+size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
 			     struct iov_iter *i)
 {
+	struct csum_state *csstate = _csstate;
 	const char *from = addr;
-	__wsum *csum = csump;
 	__wsum sum, next;
-	size_t off = 0;
+	size_t off;
 
 	if (unlikely(iov_iter_is_pipe(i)))
-		return csum_and_copy_to_pipe_iter(addr, bytes, csum, i);
+		return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);
 
-	sum = *csum;
+	sum = csstate->csum;
+	off = csstate->off;
 	if (unlikely(iov_iter_is_discard(i))) {
 		WARN_ON(1);	/* for now */
 		return 0;
@@ -1561,7 +1564,8 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump,
 		off += v.iov_len;
 	})
 	)
-	*csum = sum;
+	csstate->csum = sum;
+	csstate->off = off;
 	return bytes;
 }
 EXPORT_SYMBOL(csum_and_copy_to_iter);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 81809fa735a7..15ab9ffb27fe 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -721,8 +721,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 				      struct iov_iter *to, int len,
 				      __wsum *csump)
 {
-	return __skb_datagram_iter(skb, offset, to, len, true,
-			csum_and_copy_to_iter, csump);
+	struct csum_state csdata = { .csum = *csump };
+	int ret;
+
+	ret = __skb_datagram_iter(skb, offset, to, len, true,
+				  csum_and_copy_to_iter, &csdata);
+	if (ret)
+		return ret;
+
+	*csump = csdata.csum;
+	return 0;
 }
 
 /**
-- 
cgit v1.2.3


From 151f6ff78cdf1d6de76e90556cfc43f1e48abe18 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Thu, 4 Feb 2021 17:17:06 +0300
Subject: software node: Provide replacement for device_add_properties()

At the moment the function device_del() is calling
device_remove_properties() unconditionally. That will result into the
reference count of the software node attached to the device being
decremented, and in most cases it will hit 0 at that point. So in
practice device_del() will unregister the software node attached to
the device, even if that was not the intention of the caller. Right
now software nodes can not be reused or shared because of that.

So device_del() can not unregister the software nodes unconditionally
like that. Unfortunately some of the users of device_add_properties()
are now relying on this behaviour. Because of that, and also in
general, we do need a function that can offer similar behaviour where
the lifetime of the software node is bound to the lifetime of the
device. But it just has to be a separate function so the behaviour is
optional. We can not remove the device_remove_properties() call from
device_del() before we have that new function, and before we have
replaced device_add_properties() calls with it in all the places that
require that behaviour.

This adds function device_create_managed_software_node() that can be
used for exactly that purpose. Software nodes created with it are
declared "managed", and separate handling for those nodes is added to
the software node code. The reference count of the "managed" nodes is
decremented when the device they are attached to is removed. This will
not affect the other nodes that are not declared "managed".

The function device_create_managed_software_node() has also one
additional feature that device_add_properties() does not have. It
allows the software nodes created with it to be part of a node
hierarchy by taking also an optional parent node as parameter.

Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20210204141711.53775-2-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/swnode.c    | 43 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/property.h |  4 ++++
 2 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c
index 4842dd678200..45cf8c879d71 100644
--- a/drivers/base/swnode.c
+++ b/drivers/base/swnode.c
@@ -24,6 +24,7 @@ struct swnode {
 	struct swnode *parent;
 
 	unsigned int allocated:1;
+	unsigned int managed:1;
 };
 
 static DEFINE_IDA(swnode_root_ids);
@@ -903,6 +904,43 @@ void device_remove_software_node(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(device_remove_software_node);
 
+/**
+ * device_create_managed_software_node - Create a software node for a device
+ * @dev: The device the software node is assigned to.
+ * @properties: Device properties for the software node.
+ * @parent: Parent of the software node.
+ *
+ * Creates a software node as a managed resource for @dev, which means the
+ * lifetime of the newly created software node is tied to the lifetime of @dev.
+ * Software nodes created with this function should not be reused or shared
+ * because of that. The function takes a deep copy of @properties for the
+ * software node.
+ *
+ * Since the new software node is assigned directly to @dev, and since it should
+ * not be shared, it is not returned to the caller. The function returns 0 on
+ * success, and errno in case of an error.
+ */
+int device_create_managed_software_node(struct device *dev,
+					const struct property_entry *properties,
+					const struct software_node *parent)
+{
+	struct fwnode_handle *p = software_node_fwnode(parent);
+	struct fwnode_handle *fwnode;
+
+	if (parent && !p)
+		return -EINVAL;
+
+	fwnode = fwnode_create_software_node(properties, p);
+	if (IS_ERR(fwnode))
+		return PTR_ERR(fwnode);
+
+	to_swnode(fwnode)->managed = true;
+	set_secondary_fwnode(dev, fwnode);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(device_create_managed_software_node);
+
 int software_node_notify(struct device *dev, unsigned long action)
 {
 	struct swnode *swnode;
@@ -931,6 +969,11 @@ int software_node_notify(struct device *dev, unsigned long action)
 		sysfs_remove_link(&swnode->kobj, dev_name(dev));
 		sysfs_remove_link(&dev->kobj, "software_node");
 		kobject_put(&swnode->kobj);
+
+		if (swnode->managed) {
+			set_secondary_fwnode(dev, NULL);
+			kobject_put(&swnode->kobj);
+		}
 		break;
 	default:
 		break;
diff --git a/include/linux/property.h b/include/linux/property.h
index b0e413dc5927..dafccfce0262 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -491,4 +491,8 @@ void fwnode_remove_software_node(struct fwnode_handle *fwnode);
 int device_add_software_node(struct device *dev, const struct software_node *swnode);
 void device_remove_software_node(struct device *dev);
 
+int device_create_managed_software_node(struct device *dev,
+					const struct property_entry *properties,
+					const struct software_node *parent);
+
 #endif /* _LINUX_PROPERTY_H_ */
-- 
cgit v1.2.3


From a8c3209998afb5c4941b49e35b513cea9050cb4a Mon Sep 17 00:00:00 2001
From: Andres Beltran <lkmlabelt@gmail.com>
Date: Tue, 8 Dec 2020 05:53:11 +0100
Subject: Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the ring
 buffer

Pointers to ring-buffer packets sent by Hyper-V are used within the
guest VM. Hyper-V can send packets with erroneous values or modify
packet fields after they are processed by the guest. To defend
against these scenarios, return a copy of the incoming VMBus packet
after validating its length and offset fields in hv_pkt_iter_first().
In this way, the packet can no longer be modified by the host.

Signed-off-by: Andres Beltran <lkmlabelt@gmail.com>
Co-developed-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: netdev@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20201208045311.10244-1-parri.andrea@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/channel.c              |  9 +++--
 drivers/hv/hv_fcopy.c             |  1 +
 drivers/hv/hv_kvp.c               |  1 +
 drivers/hv/hyperv_vmbus.h         |  2 +-
 drivers/hv/ring_buffer.c          | 82 +++++++++++++++++++++++++++++++++------
 drivers/net/hyperv/hyperv_net.h   |  3 ++
 drivers/net/hyperv/netvsc.c       |  2 +
 drivers/net/hyperv/rndis_filter.c |  2 +
 drivers/scsi/storvsc_drv.c        | 10 +++++
 include/linux/hyperv.h            | 48 +++++++++++++++++++----
 net/vmw_vsock/hyperv_transport.c  |  4 +-
 11 files changed, 139 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 6fb0c76bfbf8..0d63862d6551 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -597,12 +597,15 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
 	newchannel->onchannel_callback = onchannelcallback;
 	newchannel->channel_callback_context = context;
 
-	err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages);
+	if (!newchannel->max_pkt_size)
+		newchannel->max_pkt_size = VMBUS_DEFAULT_MAX_PKT_SIZE;
+
+	err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages, 0);
 	if (err)
 		goto error_clean_ring;
 
-	err = hv_ringbuffer_init(&newchannel->inbound,
-				 &page[send_pages], recv_pages);
+	err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages],
+				 recv_pages, newchannel->max_pkt_size);
 	if (err)
 		goto error_clean_ring;
 
diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c
index 5040d7e0cd9e..85784a9e6a36 100644
--- a/drivers/hv/hv_fcopy.c
+++ b/drivers/hv/hv_fcopy.c
@@ -329,6 +329,7 @@ int hv_fcopy_init(struct hv_util_service *srv)
 {
 	recv_buffer = srv->recv_buffer;
 	fcopy_transaction.recv_channel = srv->channel;
+	fcopy_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2;
 
 	/*
 	 * When this driver loads, the user level daemon that
diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 754d35a25a1c..7d2a7b918847 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -741,6 +741,7 @@ hv_kvp_init(struct hv_util_service *srv)
 {
 	recv_buffer = srv->recv_buffer;
 	kvp_transaction.recv_channel = srv->channel;
+	kvp_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 4;
 
 	/*
 	 * When this driver loads, the user level daemon that
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 9416e09ebd58..42f3d9d123a1 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -174,7 +174,7 @@ extern int hv_synic_cleanup(unsigned int cpu);
 void hv_ringbuffer_pre_init(struct vmbus_channel *channel);
 
 int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
-		       struct page *pages, u32 pagecnt);
+		       struct page *pages, u32 pagecnt, u32 max_pkt_size);
 
 void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info);
 
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 35833d4d1a1d..29e90477363a 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -190,7 +190,7 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
 
 /* Initialize the ring buffer. */
 int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
-		       struct page *pages, u32 page_cnt)
+		       struct page *pages, u32 page_cnt, u32 max_pkt_size)
 {
 	int i;
 	struct page **pages_wraparound;
@@ -232,6 +232,14 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
 		sizeof(struct hv_ring_buffer);
 	ring_info->priv_read_index = 0;
 
+	/* Initialize buffer that holds copies of incoming packets */
+	if (max_pkt_size) {
+		ring_info->pkt_buffer = kzalloc(max_pkt_size, GFP_KERNEL);
+		if (!ring_info->pkt_buffer)
+			return -ENOMEM;
+		ring_info->pkt_buffer_size = max_pkt_size;
+	}
+
 	spin_lock_init(&ring_info->ring_lock);
 
 	return 0;
@@ -244,6 +252,9 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
 	vunmap(ring_info->ring_buffer);
 	ring_info->ring_buffer = NULL;
 	mutex_unlock(&ring_info->ring_buffer_mutex);
+
+	kfree(ring_info->pkt_buffer);
+	ring_info->pkt_buffer_size = 0;
 }
 
 /* Write to the ring buffer. */
@@ -385,7 +396,7 @@ int hv_ringbuffer_read(struct vmbus_channel *channel,
 	memcpy(buffer, (const char *)desc + offset, packetlen);
 
 	/* Advance ring index to next packet descriptor */
-	__hv_pkt_iter_next(channel, desc);
+	__hv_pkt_iter_next(channel, desc, true);
 
 	/* Notify host of update */
 	hv_pkt_iter_close(channel);
@@ -411,6 +422,22 @@ static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
 		return (rbi->ring_datasize - priv_read_loc) + write_loc;
 }
 
+/*
+ * Get first vmbus packet without copying it out of the ring buffer
+ */
+struct vmpacket_descriptor *hv_pkt_iter_first_raw(struct vmbus_channel *channel)
+{
+	struct hv_ring_buffer_info *rbi = &channel->inbound;
+
+	hv_debug_delay_test(channel, MESSAGE_DELAY);
+
+	if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
+		return NULL;
+
+	return (struct vmpacket_descriptor *)(hv_get_ring_buffer(rbi) + rbi->priv_read_index);
+}
+EXPORT_SYMBOL_GPL(hv_pkt_iter_first_raw);
+
 /*
  * Get first vmbus packet from ring buffer after read_index
  *
@@ -419,17 +446,49 @@ static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
 struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel)
 {
 	struct hv_ring_buffer_info *rbi = &channel->inbound;
-	struct vmpacket_descriptor *desc;
+	struct vmpacket_descriptor *desc, *desc_copy;
+	u32 bytes_avail, pkt_len, pkt_offset;
 
-	hv_debug_delay_test(channel, MESSAGE_DELAY);
-	if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
+	desc = hv_pkt_iter_first_raw(channel);
+	if (!desc)
 		return NULL;
 
-	desc = hv_get_ring_buffer(rbi) + rbi->priv_read_index;
-	if (desc)
-		prefetch((char *)desc + (desc->len8 << 3));
+	bytes_avail = min(rbi->pkt_buffer_size, hv_pkt_iter_avail(rbi));
+
+	/*
+	 * Ensure the compiler does not use references to incoming Hyper-V values (which
+	 * could change at any moment) when reading local variables later in the code
+	 */
+	pkt_len = READ_ONCE(desc->len8) << 3;
+	pkt_offset = READ_ONCE(desc->offset8) << 3;
+
+	/*
+	 * If pkt_len is invalid, set it to the smaller of hv_pkt_iter_avail() and
+	 * rbi->pkt_buffer_size
+	 */
+	if (pkt_len < sizeof(struct vmpacket_descriptor) || pkt_len > bytes_avail)
+		pkt_len = bytes_avail;
+
+	/*
+	 * If pkt_offset is invalid, arbitrarily set it to
+	 * the size of vmpacket_descriptor
+	 */
+	if (pkt_offset < sizeof(struct vmpacket_descriptor) || pkt_offset > pkt_len)
+		pkt_offset = sizeof(struct vmpacket_descriptor);
+
+	/* Copy the Hyper-V packet out of the ring buffer */
+	desc_copy = (struct vmpacket_descriptor *)rbi->pkt_buffer;
+	memcpy(desc_copy, desc, pkt_len);
+
+	/*
+	 * Hyper-V could still change len8 and offset8 after the earlier read.
+	 * Ensure that desc_copy has legal values for len8 and offset8 that
+	 * are consistent with the copy we just made
+	 */
+	desc_copy->len8 = pkt_len >> 3;
+	desc_copy->offset8 = pkt_offset >> 3;
 
-	return desc;
+	return desc_copy;
 }
 EXPORT_SYMBOL_GPL(hv_pkt_iter_first);
 
@@ -441,7 +500,8 @@ EXPORT_SYMBOL_GPL(hv_pkt_iter_first);
  */
 struct vmpacket_descriptor *
 __hv_pkt_iter_next(struct vmbus_channel *channel,
-		   const struct vmpacket_descriptor *desc)
+		   const struct vmpacket_descriptor *desc,
+		   bool copy)
 {
 	struct hv_ring_buffer_info *rbi = &channel->inbound;
 	u32 packetlen = desc->len8 << 3;
@@ -454,7 +514,7 @@ __hv_pkt_iter_next(struct vmbus_channel *channel,
 		rbi->priv_read_index -= dsize;
 
 	/* more data? */
-	return hv_pkt_iter_first(channel);
+	return copy ? hv_pkt_iter_first(channel) : hv_pkt_iter_first_raw(channel);
 }
 EXPORT_SYMBOL_GPL(__hv_pkt_iter_next);
 
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 2a87cfa27ac0..7ea6936f86ef 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -860,9 +860,12 @@ static inline u32 netvsc_rqstor_size(unsigned long ringbytes)
 		ringbytes / NETVSC_MIN_IN_MSG_SIZE;
 }
 
+#define NETVSC_MAX_XFER_PAGE_RANGES 375
 #define NETVSC_XFER_HEADER_SIZE(rng_cnt) \
 		(offsetof(struct vmtransfer_page_packet_header, ranges) + \
 		(rng_cnt) * sizeof(struct vmtransfer_page_range))
+#define NETVSC_MAX_PKT_SIZE (NETVSC_XFER_HEADER_SIZE(NETVSC_MAX_XFER_PAGE_RANGES) + \
+		sizeof(struct nvsp_message) + (sizeof(u32) * VRSS_SEND_TAB_SIZE))
 
 struct multi_send_data {
 	struct sk_buff *skb; /* skb containing the pkt */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 2350342b961f..1510a236aa34 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1530,6 +1530,8 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
 
 	/* Open the channel */
 	device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
+	device->channel->max_pkt_size = NETVSC_MAX_PKT_SIZE;
+
 	ret = vmbus_open(device->channel, netvsc_ring_bytes,
 			 netvsc_ring_bytes,  NULL, 0,
 			 netvsc_channel_cb, net_device->chan_table);
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 598713c0d5a8..7e6dee2f02a4 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1174,6 +1174,8 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 	nvchan->channel = new_sc;
 
 	new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
+	new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE;
+
 	ret = vmbus_open(new_sc, netvsc_ring_bytes,
 			 netvsc_ring_bytes, NULL, 0,
 			 netvsc_channel_cb, nvchan);
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 2e4fa77445fd..7e59284dbf5b 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -414,6 +414,14 @@ static void storvsc_on_channel_callback(void *context);
 #define STORVSC_IDE_MAX_TARGETS				1
 #define STORVSC_IDE_MAX_CHANNELS			1
 
+/*
+ * Upper bound on the size of a storvsc packet. vmscsi_size_delta is not
+ * included in the calculation because it is set after STORVSC_MAX_PKT_SIZE
+ * is used in storvsc_connect_to_vsp
+ */
+#define STORVSC_MAX_PKT_SIZE (sizeof(struct vmpacket_descriptor) +\
+			      sizeof(struct vstor_packet))
+
 struct storvsc_cmd_request {
 	struct scsi_cmnd *cmd;
 
@@ -698,6 +706,7 @@ static void handle_sc_creation(struct vmbus_channel *new_sc)
 		return;
 
 	memset(&props, 0, sizeof(struct vmstorage_channel_properties));
+	new_sc->max_pkt_size = STORVSC_MAX_PKT_SIZE;
 
 	/*
 	 * The size of vmbus_requestor is an upper bound on the number of requests
@@ -1280,6 +1289,7 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size,
 
 	memset(&props, 0, sizeof(struct vmstorage_channel_properties));
 
+	device->channel->max_pkt_size = STORVSC_MAX_PKT_SIZE;
 	/*
 	 * The size of vmbus_requestor is an upper bound on the number of requests
 	 * that can be in-progress at any one time across all channels.
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 5ddb479c4d4c..fbae8406d5d4 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -181,6 +181,10 @@ struct hv_ring_buffer_info {
 	 * being freed while the ring buffer is being accessed.
 	 */
 	struct mutex ring_buffer_mutex;
+
+	/* Buffer that holds a copy of an incoming host packet */
+	void *pkt_buffer;
+	u32 pkt_buffer_size;
 };
 
 
@@ -787,6 +791,8 @@ struct vmbus_device {
 	bool perf_device;
 };
 
+#define VMBUS_DEFAULT_MAX_PKT_SIZE 4096
+
 struct vmbus_channel {
 	struct list_head listentry;
 
@@ -1008,6 +1014,9 @@ struct vmbus_channel {
 	/* request/transaction ids for VMBus */
 	struct vmbus_requestor requestor;
 	u32 rqstor_size;
+
+	/* The max size of a packet on this channel */
+	u32 max_pkt_size;
 };
 
 u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr);
@@ -1642,32 +1651,55 @@ static inline u32 hv_pkt_datalen(const struct vmpacket_descriptor *desc)
 }
 
 
+struct vmpacket_descriptor *
+hv_pkt_iter_first_raw(struct vmbus_channel *channel);
+
 struct vmpacket_descriptor *
 hv_pkt_iter_first(struct vmbus_channel *channel);
 
 struct vmpacket_descriptor *
 __hv_pkt_iter_next(struct vmbus_channel *channel,
-		   const struct vmpacket_descriptor *pkt);
+		   const struct vmpacket_descriptor *pkt,
+		   bool copy);
 
 void hv_pkt_iter_close(struct vmbus_channel *channel);
 
-/*
- * Get next packet descriptor from iterator
- * If at end of list, return NULL and update host.
- */
 static inline struct vmpacket_descriptor *
-hv_pkt_iter_next(struct vmbus_channel *channel,
-		 const struct vmpacket_descriptor *pkt)
+hv_pkt_iter_next_pkt(struct vmbus_channel *channel,
+		     const struct vmpacket_descriptor *pkt,
+		     bool copy)
 {
 	struct vmpacket_descriptor *nxt;
 
-	nxt = __hv_pkt_iter_next(channel, pkt);
+	nxt = __hv_pkt_iter_next(channel, pkt, copy);
 	if (!nxt)
 		hv_pkt_iter_close(channel);
 
 	return nxt;
 }
 
+/*
+ * Get next packet descriptor without copying it out of the ring buffer
+ * If at end of list, return NULL and update host.
+ */
+static inline struct vmpacket_descriptor *
+hv_pkt_iter_next_raw(struct vmbus_channel *channel,
+		     const struct vmpacket_descriptor *pkt)
+{
+	return hv_pkt_iter_next_pkt(channel, pkt, false);
+}
+
+/*
+ * Get next packet descriptor from iterator
+ * If at end of list, return NULL and update host.
+ */
+static inline struct vmpacket_descriptor *
+hv_pkt_iter_next(struct vmbus_channel *channel,
+		 const struct vmpacket_descriptor *pkt)
+{
+	return hv_pkt_iter_next_pkt(channel, pkt, true);
+}
+
 #define foreach_vmbus_pkt(pkt, channel) \
 	for (pkt = hv_pkt_iter_first(channel); pkt; \
 	    pkt = hv_pkt_iter_next(channel, pkt))
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 630b851f8150..cd8b7c1ca9f1 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -600,7 +600,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
 		return -EOPNOTSUPP;
 
 	if (need_refill) {
-		hvs->recv_desc = hv_pkt_iter_first(hvs->chan);
+		hvs->recv_desc = hv_pkt_iter_first_raw(hvs->chan);
 		ret = hvs_update_recv_data(hvs);
 		if (ret)
 			return ret;
@@ -614,7 +614,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
 
 	hvs->recv_data_len -= to_read;
 	if (hvs->recv_data_len == 0) {
-		hvs->recv_desc = hv_pkt_iter_next(hvs->chan, hvs->recv_desc);
+		hvs->recv_desc = hv_pkt_iter_next_raw(hvs->chan, hvs->recv_desc);
 		if (hvs->recv_desc) {
 			ret = hvs_update_recv_data(hvs);
 			if (ret)
-- 
cgit v1.2.3


From 06caa778d8b2fbcb4ac3878751e39d116424ba9b Mon Sep 17 00:00:00 2001
From: Andres Beltran <lkmlabelt@gmail.com>
Date: Mon, 9 Nov 2020 11:07:04 +0100
Subject: hv_utils: Add validation for untrusted Hyper-V values

For additional robustness in the face of Hyper-V errors or malicious
behavior, validate all values that originate from packets that Hyper-V
has sent to the guest in the host-to-guest ring buffer. Ensure that
invalid values cannot cause indexing off the end of the icversion_data
array in vmbus_prep_negotiate_resp().

Signed-off-by: Andres Beltran <lkmlabelt@gmail.com>
Co-developed-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Reviewed-by:  Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20201109100704.9152-1-parri.andrea@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/channel_mgmt.c |  24 +++--
 drivers/hv/hv_fcopy.c     |  36 ++++++--
 drivers/hv/hv_kvp.c       | 122 ++++++++++++++-----------
 drivers/hv/hv_snapshot.c  |  89 +++++++++++--------
 drivers/hv/hv_util.c      | 222 ++++++++++++++++++++++++++++------------------
 include/linux/hyperv.h    |   9 +-
 6 files changed, 314 insertions(+), 188 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 1d44bb635bb8..5bc5eef5da15 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -190,6 +190,7 @@ static u16 hv_get_dev_type(const struct vmbus_channel *channel)
  * vmbus_prep_negotiate_resp() - Create default response for Negotiate message
  * @icmsghdrp: Pointer to msg header structure
  * @buf: Raw buffer channel data
+ * @buflen: Length of the raw buffer channel data.
  * @fw_version: The framework versions we can support.
  * @fw_vercnt: The size of @fw_version.
  * @srv_version: The service versions we can support.
@@ -202,8 +203,8 @@ static u16 hv_get_dev_type(const struct vmbus_channel *channel)
  * Set up and fill in default negotiate response message.
  * Mainly used by Hyper-V drivers.
  */
-bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
-				u8 *buf, const int *fw_version, int fw_vercnt,
+bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf,
+				u32 buflen, const int *fw_version, int fw_vercnt,
 				const int *srv_version, int srv_vercnt,
 				int *nego_fw_version, int *nego_srv_version)
 {
@@ -215,10 +216,14 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
 	bool found_match = false;
 	struct icmsg_negotiate *negop;
 
+	/* Check that there's enough space for icframe_vercnt, icmsg_vercnt */
+	if (buflen < ICMSG_HDR + offsetof(struct icmsg_negotiate, reserved)) {
+		pr_err_ratelimited("Invalid icmsg negotiate\n");
+		return false;
+	}
+
 	icmsghdrp->icmsgsize = 0x10;
-	negop = (struct icmsg_negotiate *)&buf[
-		sizeof(struct vmbuspipe_hdr) +
-		sizeof(struct icmsg_hdr)];
+	negop = (struct icmsg_negotiate *)&buf[ICMSG_HDR];
 
 	icframe_major = negop->icframe_vercnt;
 	icframe_minor = 0;
@@ -226,6 +231,15 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
 	icmsg_major = negop->icmsg_vercnt;
 	icmsg_minor = 0;
 
+	/* Validate negop packet */
+	if (icframe_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT ||
+	    icmsg_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT ||
+	    ICMSG_NEGOTIATE_PKT_SIZE(icframe_major, icmsg_major) > buflen) {
+		pr_err_ratelimited("Invalid icmsg negotiate - icframe_major: %u, icmsg_major: %u\n",
+				   icframe_major, icmsg_major);
+		goto fw_error;
+	}
+
 	/*
 	 * Select the framework version number we will
 	 * support.
diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c
index 85784a9e6a36..660036da7449 100644
--- a/drivers/hv/hv_fcopy.c
+++ b/drivers/hv/hv_fcopy.c
@@ -235,15 +235,27 @@ void hv_fcopy_onchannelcallback(void *context)
 	if (fcopy_transaction.state > HVUTIL_READY)
 		return;
 
-	vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen,
-			 &requestid);
-	if (recvlen <= 0)
+	if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) {
+		pr_err_ratelimited("Fcopy request received. Could not read into recv buf\n");
 		return;
+	}
+
+	if (!recvlen)
+		return;
+
+	/* Ensure recvlen is big enough to read header data */
+	if (recvlen < ICMSG_HDR) {
+		pr_err_ratelimited("Fcopy request received. Packet length too small: %d\n",
+				   recvlen);
+		return;
+	}
 
 	icmsghdr = (struct icmsg_hdr *)&recv_buffer[
 			sizeof(struct vmbuspipe_hdr)];
+
 	if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-		if (vmbus_prep_negotiate_resp(icmsghdr, recv_buffer,
+		if (vmbus_prep_negotiate_resp(icmsghdr,
+				recv_buffer, recvlen,
 				fw_versions, FW_VER_COUNT,
 				fcopy_versions, FCOPY_VER_COUNT,
 				NULL, &fcopy_srv_version)) {
@@ -252,10 +264,14 @@ void hv_fcopy_onchannelcallback(void *context)
 				fcopy_srv_version >> 16,
 				fcopy_srv_version & 0xFFFF);
 		}
-	} else {
-		fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[
-				sizeof(struct vmbuspipe_hdr) +
-				sizeof(struct icmsg_hdr)];
+	} else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) {
+		/* Ensure recvlen is big enough to contain hv_fcopy_hdr */
+		if (recvlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) {
+			pr_err_ratelimited("Invalid Fcopy hdr. Packet length too small: %u\n",
+					   recvlen);
+			return;
+		}
+		fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ICMSG_HDR];
 
 		/*
 		 * Stash away this global state for completing the
@@ -280,6 +296,10 @@ void hv_fcopy_onchannelcallback(void *context)
 		schedule_delayed_work(&fcopy_timeout_work,
 				      HV_UTIL_TIMEOUT * HZ);
 		return;
+	} else {
+		pr_err_ratelimited("Fcopy request received. Invalid msg type: %d\n",
+				   icmsghdr->icmsgtype);
+		return;
 	}
 	icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE;
 	vmbus_sendpacket(channel, recv_buffer, recvlen, requestid,
diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 7d2a7b918847..c698592b83e4 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -662,71 +662,87 @@ void hv_kvp_onchannelcallback(void *context)
 	if (kvp_transaction.state > HVUTIL_READY)
 		return;
 
-	vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 4, &recvlen,
-			 &requestid);
-
-	if (recvlen > 0) {
-		icmsghdrp = (struct icmsg_hdr *)&recv_buffer[
-			sizeof(struct vmbuspipe_hdr)];
-
-		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			if (vmbus_prep_negotiate_resp(icmsghdrp,
-				 recv_buffer, fw_versions, FW_VER_COUNT,
-				 kvp_versions, KVP_VER_COUNT,
-				 NULL, &kvp_srv_version)) {
-				pr_info("KVP IC version %d.%d\n",
-					kvp_srv_version >> 16,
-					kvp_srv_version & 0xFFFF);
-			}
-		} else {
-			kvp_msg = (struct hv_kvp_msg *)&recv_buffer[
-				sizeof(struct vmbuspipe_hdr) +
-				sizeof(struct icmsg_hdr)];
+	if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 4, &recvlen, &requestid)) {
+		pr_err_ratelimited("KVP request received. Could not read into recv buf\n");
+		return;
+	}
 
-			/*
-			 * Stash away this global state for completing the
-			 * transaction; note transactions are serialized.
-			 */
+	if (!recvlen)
+		return;
 
-			kvp_transaction.recv_len = recvlen;
-			kvp_transaction.recv_req_id = requestid;
-			kvp_transaction.kvp_msg = kvp_msg;
+	/* Ensure recvlen is big enough to read header data */
+	if (recvlen < ICMSG_HDR) {
+		pr_err_ratelimited("KVP request received. Packet length too small: %d\n",
+				   recvlen);
+		return;
+	}
 
-			if (kvp_transaction.state < HVUTIL_READY) {
-				/* Userspace is not registered yet */
-				kvp_respond_to_host(NULL, HV_E_FAIL);
-				return;
-			}
-			kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
+	icmsghdrp = (struct icmsg_hdr *)&recv_buffer[sizeof(struct vmbuspipe_hdr)];
+
+	if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
+		if (vmbus_prep_negotiate_resp(icmsghdrp,
+				recv_buffer, recvlen,
+				fw_versions, FW_VER_COUNT,
+				kvp_versions, KVP_VER_COUNT,
+				NULL, &kvp_srv_version)) {
+			pr_info("KVP IC version %d.%d\n",
+				kvp_srv_version >> 16,
+				kvp_srv_version & 0xFFFF);
+		}
+	} else if (icmsghdrp->icmsgtype == ICMSGTYPE_KVPEXCHANGE) {
+		/*
+		 * recvlen is not checked against sizeof(struct kvp_msg) because kvp_msg contains
+		 * a union of structs and the msg type received is not known. Code using this
+		 * struct should provide validation when accessing its fields.
+		 */
+		kvp_msg = (struct hv_kvp_msg *)&recv_buffer[ICMSG_HDR];
 
-			/*
-			 * Get the information from the
-			 * user-mode component.
-			 * component. This transaction will be
-			 * completed when we get the value from
-			 * the user-mode component.
-			 * Set a timeout to deal with
-			 * user-mode not responding.
-			 */
-			schedule_work(&kvp_sendkey_work);
-			schedule_delayed_work(&kvp_timeout_work,
-					      HV_UTIL_TIMEOUT * HZ);
+		/*
+		 * Stash away this global state for completing the
+		 * transaction; note transactions are serialized.
+		 */
 
-			return;
+		kvp_transaction.recv_len = recvlen;
+		kvp_transaction.recv_req_id = requestid;
+		kvp_transaction.kvp_msg = kvp_msg;
 
+		if (kvp_transaction.state < HVUTIL_READY) {
+			/* Userspace is not registered yet */
+			kvp_respond_to_host(NULL, HV_E_FAIL);
+			return;
 		}
+		kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
 
-		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
-			| ICMSGHDRFLAG_RESPONSE;
+		/*
+		 * Get the information from the
+		 * user-mode component.
+		 * component. This transaction will be
+		 * completed when we get the value from
+		 * the user-mode component.
+		 * Set a timeout to deal with
+		 * user-mode not responding.
+		 */
+		schedule_work(&kvp_sendkey_work);
+		schedule_delayed_work(&kvp_timeout_work,
+					HV_UTIL_TIMEOUT * HZ);
 
-		vmbus_sendpacket(channel, recv_buffer,
-				       recvlen, requestid,
-				       VM_PKT_DATA_INBAND, 0);
+		return;
 
-		host_negotiatied = NEGO_FINISHED;
-		hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper);
+	} else {
+		pr_err_ratelimited("KVP request received. Invalid msg type: %d\n",
+				   icmsghdrp->icmsgtype);
+		return;
 	}
 
+	icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
+		| ICMSGHDRFLAG_RESPONSE;
+
+	vmbus_sendpacket(channel, recv_buffer,
+			 recvlen, requestid,
+			 VM_PKT_DATA_INBAND, 0);
+
+	host_negotiatied = NEGO_FINISHED;
+	hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper);
 }
 
 static void kvp_on_reset(void)
diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c
index 783779e4cc1a..2267bd4c3472 100644
--- a/drivers/hv/hv_snapshot.c
+++ b/drivers/hv/hv_snapshot.c
@@ -298,49 +298,64 @@ void hv_vss_onchannelcallback(void *context)
 	if (vss_transaction.state > HVUTIL_READY)
 		return;
 
-	vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen,
-			 &requestid);
-
-	if (recvlen > 0) {
-		icmsghdrp = (struct icmsg_hdr *)&recv_buffer[
-			sizeof(struct vmbuspipe_hdr)];
-
-		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			if (vmbus_prep_negotiate_resp(icmsghdrp,
-				 recv_buffer, fw_versions, FW_VER_COUNT,
-				 vss_versions, VSS_VER_COUNT,
-				 NULL, &vss_srv_version)) {
-
-				pr_info("VSS IC version %d.%d\n",
-					vss_srv_version >> 16,
-					vss_srv_version & 0xFFFF);
-			}
-		} else {
-			vss_msg = (struct hv_vss_msg *)&recv_buffer[
-				sizeof(struct vmbuspipe_hdr) +
-				sizeof(struct icmsg_hdr)];
-
-			/*
-			 * Stash away this global state for completing the
-			 * transaction; note transactions are serialized.
-			 */
-
-			vss_transaction.recv_len = recvlen;
-			vss_transaction.recv_req_id = requestid;
-			vss_transaction.msg = (struct hv_vss_msg *)vss_msg;
-
-			schedule_work(&vss_handle_request_work);
+	if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) {
+		pr_err_ratelimited("VSS request received. Could not read into recv buf\n");
+		return;
+	}
+
+	if (!recvlen)
+		return;
+
+	/* Ensure recvlen is big enough to read header data */
+	if (recvlen < ICMSG_HDR) {
+		pr_err_ratelimited("VSS request received. Packet length too small: %d\n",
+				   recvlen);
+		return;
+	}
+
+	icmsghdrp = (struct icmsg_hdr *)&recv_buffer[sizeof(struct vmbuspipe_hdr)];
+
+	if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
+		if (vmbus_prep_negotiate_resp(icmsghdrp,
+				recv_buffer, recvlen,
+				fw_versions, FW_VER_COUNT,
+				vss_versions, VSS_VER_COUNT,
+				NULL, &vss_srv_version)) {
+
+			pr_info("VSS IC version %d.%d\n",
+				vss_srv_version >> 16,
+				vss_srv_version & 0xFFFF);
+		}
+	} else if (icmsghdrp->icmsgtype == ICMSGTYPE_VSS) {
+		/* Ensure recvlen is big enough to contain hv_vss_msg */
+		if (recvlen < ICMSG_HDR + sizeof(struct hv_vss_msg)) {
+			pr_err_ratelimited("Invalid VSS msg. Packet length too small: %u\n",
+					   recvlen);
 			return;
 		}
+		vss_msg = (struct hv_vss_msg *)&recv_buffer[ICMSG_HDR];
+
+		/*
+		 * Stash away this global state for completing the
+		 * transaction; note transactions are serialized.
+		 */
 
-		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
-			| ICMSGHDRFLAG_RESPONSE;
+		vss_transaction.recv_len = recvlen;
+		vss_transaction.recv_req_id = requestid;
+		vss_transaction.msg = (struct hv_vss_msg *)vss_msg;
 
-		vmbus_sendpacket(channel, recv_buffer,
-				       recvlen, requestid,
-				       VM_PKT_DATA_INBAND, 0);
+		schedule_work(&vss_handle_request_work);
+		return;
+	} else {
+		pr_err_ratelimited("VSS request received. Invalid msg type: %d\n",
+				   icmsghdrp->icmsgtype);
+		return;
 	}
 
+	icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION |
+		ICMSGHDRFLAG_RESPONSE;
+	vmbus_sendpacket(channel, recv_buffer, recvlen, requestid,
+			 VM_PKT_DATA_INBAND, 0);
 }
 
 static void vss_on_reset(void)
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 05566ecdbe4b..34f3e789cc9a 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -195,73 +195,88 @@ static void shutdown_onchannelcallback(void *context)
 
 	struct icmsg_hdr *icmsghdrp;
 
-	vmbus_recvpacket(channel, shut_txf_buf,
-			 HV_HYP_PAGE_SIZE, &recvlen, &requestid);
+	if (vmbus_recvpacket(channel, shut_txf_buf, HV_HYP_PAGE_SIZE, &recvlen, &requestid)) {
+		pr_err_ratelimited("Shutdown request received. Could not read into shut txf buf\n");
+		return;
+	}
 
-	if (recvlen > 0) {
-		icmsghdrp = (struct icmsg_hdr *)&shut_txf_buf[
-			sizeof(struct vmbuspipe_hdr)];
+	if (!recvlen)
+		return;
 
-		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			if (vmbus_prep_negotiate_resp(icmsghdrp, shut_txf_buf,
-					fw_versions, FW_VER_COUNT,
-					sd_versions, SD_VER_COUNT,
-					NULL, &sd_srv_version)) {
-				pr_info("Shutdown IC version %d.%d\n",
-					sd_srv_version >> 16,
-					sd_srv_version & 0xFFFF);
-			}
-		} else {
-			shutdown_msg =
-				(struct shutdown_msg_data *)&shut_txf_buf[
-					sizeof(struct vmbuspipe_hdr) +
-					sizeof(struct icmsg_hdr)];
+	/* Ensure recvlen is big enough to read header data */
+	if (recvlen < ICMSG_HDR) {
+		pr_err_ratelimited("Shutdown request received. Packet length too small: %d\n",
+				   recvlen);
+		return;
+	}
 
-			/*
-			 * shutdown_msg->flags can be 0(shut down), 2(reboot),
-			 * or 4(hibernate). It may bitwise-OR 1, which means
-			 * performing the request by force. Linux always tries
-			 * to perform the request by force.
-			 */
-			switch (shutdown_msg->flags) {
-			case 0:
-			case 1:
-				icmsghdrp->status = HV_S_OK;
-				work = &shutdown_work;
-				pr_info("Shutdown request received -"
-					    " graceful shutdown initiated\n");
-				break;
-			case 2:
-			case 3:
-				icmsghdrp->status = HV_S_OK;
-				work = &restart_work;
-				pr_info("Restart request received -"
-					    " graceful restart initiated\n");
-				break;
-			case 4:
-			case 5:
-				pr_info("Hibernation request received\n");
-				icmsghdrp->status = hibernation_supported ?
-					HV_S_OK : HV_E_FAIL;
-				if (hibernation_supported)
-					work = &hibernate_context.work;
-				break;
-			default:
-				icmsghdrp->status = HV_E_FAIL;
-				pr_info("Shutdown request received -"
-					    " Invalid request\n");
-				break;
-			}
+	icmsghdrp = (struct icmsg_hdr *)&shut_txf_buf[sizeof(struct vmbuspipe_hdr)];
+
+	if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
+		if (vmbus_prep_negotiate_resp(icmsghdrp,
+				shut_txf_buf, recvlen,
+				fw_versions, FW_VER_COUNT,
+				sd_versions, SD_VER_COUNT,
+				NULL, &sd_srv_version)) {
+			pr_info("Shutdown IC version %d.%d\n",
+				sd_srv_version >> 16,
+				sd_srv_version & 0xFFFF);
+		}
+	} else if (icmsghdrp->icmsgtype == ICMSGTYPE_SHUTDOWN) {
+		/* Ensure recvlen is big enough to contain shutdown_msg_data struct */
+		if (recvlen < ICMSG_HDR + sizeof(struct shutdown_msg_data)) {
+			pr_err_ratelimited("Invalid shutdown msg data. Packet length too small: %u\n",
+					   recvlen);
+			return;
 		}
 
-		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
-			| ICMSGHDRFLAG_RESPONSE;
-
-		vmbus_sendpacket(channel, shut_txf_buf,
-				       recvlen, requestid,
-				       VM_PKT_DATA_INBAND, 0);
+		shutdown_msg = (struct shutdown_msg_data *)&shut_txf_buf[ICMSG_HDR];
+
+		/*
+		 * shutdown_msg->flags can be 0(shut down), 2(reboot),
+		 * or 4(hibernate). It may bitwise-OR 1, which means
+		 * performing the request by force. Linux always tries
+		 * to perform the request by force.
+		 */
+		switch (shutdown_msg->flags) {
+		case 0:
+		case 1:
+			icmsghdrp->status = HV_S_OK;
+			work = &shutdown_work;
+			pr_info("Shutdown request received - graceful shutdown initiated\n");
+			break;
+		case 2:
+		case 3:
+			icmsghdrp->status = HV_S_OK;
+			work = &restart_work;
+			pr_info("Restart request received - graceful restart initiated\n");
+			break;
+		case 4:
+		case 5:
+			pr_info("Hibernation request received\n");
+			icmsghdrp->status = hibernation_supported ?
+				HV_S_OK : HV_E_FAIL;
+			if (hibernation_supported)
+				work = &hibernate_context.work;
+			break;
+		default:
+			icmsghdrp->status = HV_E_FAIL;
+			pr_info("Shutdown request received - Invalid request\n");
+			break;
+		}
+	} else {
+		icmsghdrp->status = HV_E_FAIL;
+		pr_err_ratelimited("Shutdown request received. Invalid msg type: %d\n",
+				   icmsghdrp->icmsgtype);
 	}
 
+	icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
+		| ICMSGHDRFLAG_RESPONSE;
+
+	vmbus_sendpacket(channel, shut_txf_buf,
+			 recvlen, requestid,
+			 VM_PKT_DATA_INBAND, 0);
+
 	if (work)
 		schedule_work(work);
 }
@@ -396,19 +411,27 @@ static void timesync_onchannelcallback(void *context)
 					   HV_HYP_PAGE_SIZE, &recvlen,
 					   &requestid);
 		if (ret) {
-			pr_warn_once("TimeSync IC pkt recv failed (Err: %d)\n",
-				     ret);
+			pr_err_ratelimited("TimeSync IC pkt recv failed (Err: %d)\n",
+					   ret);
 			break;
 		}
 
 		if (!recvlen)
 			break;
 
+		/* Ensure recvlen is big enough to read header data */
+		if (recvlen < ICMSG_HDR) {
+			pr_err_ratelimited("Timesync request received. Packet length too small: %d\n",
+					   recvlen);
+			break;
+		}
+
 		icmsghdrp = (struct icmsg_hdr *)&time_txf_buf[
 				sizeof(struct vmbuspipe_hdr)];
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			if (vmbus_prep_negotiate_resp(icmsghdrp, time_txf_buf,
+			if (vmbus_prep_negotiate_resp(icmsghdrp,
+						time_txf_buf, recvlen,
 						fw_versions, FW_VER_COUNT,
 						ts_versions, TS_VER_COUNT,
 						NULL, &ts_srv_version)) {
@@ -416,33 +439,44 @@ static void timesync_onchannelcallback(void *context)
 					ts_srv_version >> 16,
 					ts_srv_version & 0xFFFF);
 			}
-		} else {
+		} else if (icmsghdrp->icmsgtype == ICMSGTYPE_TIMESYNC) {
 			if (ts_srv_version > TS_VERSION_3) {
-				refdata = (struct ictimesync_ref_data *)
-					&time_txf_buf[
-					sizeof(struct vmbuspipe_hdr) +
-					sizeof(struct icmsg_hdr)];
+				/* Ensure recvlen is big enough to read ictimesync_ref_data */
+				if (recvlen < ICMSG_HDR + sizeof(struct ictimesync_ref_data)) {
+					pr_err_ratelimited("Invalid ictimesync ref data. Length too small: %u\n",
+							   recvlen);
+					break;
+				}
+				refdata = (struct ictimesync_ref_data *)&time_txf_buf[ICMSG_HDR];
 
 				adj_guesttime(refdata->parenttime,
 						refdata->vmreferencetime,
 						refdata->flags);
 			} else {
-				timedatap = (struct ictimesync_data *)
-					&time_txf_buf[
-					sizeof(struct vmbuspipe_hdr) +
-					sizeof(struct icmsg_hdr)];
+				/* Ensure recvlen is big enough to read ictimesync_data */
+				if (recvlen < ICMSG_HDR + sizeof(struct ictimesync_data)) {
+					pr_err_ratelimited("Invalid ictimesync data. Length too small: %u\n",
+							   recvlen);
+					break;
+				}
+				timedatap = (struct ictimesync_data *)&time_txf_buf[ICMSG_HDR];
+
 				adj_guesttime(timedatap->parenttime,
 					      hv_read_reference_counter(),
 					      timedatap->flags);
 			}
+		} else {
+			icmsghdrp->status = HV_E_FAIL;
+			pr_err_ratelimited("Timesync request received. Invalid msg type: %d\n",
+					   icmsghdrp->icmsgtype);
 		}
 
 		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
 			| ICMSGHDRFLAG_RESPONSE;
 
 		vmbus_sendpacket(channel, time_txf_buf,
-				recvlen, requestid,
-				VM_PKT_DATA_INBAND, 0);
+				 recvlen, requestid,
+				 VM_PKT_DATA_INBAND, 0);
 	}
 }
 
@@ -462,18 +496,28 @@ static void heartbeat_onchannelcallback(void *context)
 
 	while (1) {
 
-		vmbus_recvpacket(channel, hbeat_txf_buf,
-				 HV_HYP_PAGE_SIZE, &recvlen, &requestid);
+		if (vmbus_recvpacket(channel, hbeat_txf_buf, HV_HYP_PAGE_SIZE,
+				     &recvlen, &requestid)) {
+			pr_err_ratelimited("Heartbeat request received. Could not read into hbeat txf buf\n");
+			return;
+		}
 
 		if (!recvlen)
 			break;
 
+		/* Ensure recvlen is big enough to read header data */
+		if (recvlen < ICMSG_HDR) {
+			pr_err_ratelimited("Hearbeat request received. Packet length too small: %d\n",
+					   recvlen);
+			break;
+		}
+
 		icmsghdrp = (struct icmsg_hdr *)&hbeat_txf_buf[
 				sizeof(struct vmbuspipe_hdr)];
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
 			if (vmbus_prep_negotiate_resp(icmsghdrp,
-					hbeat_txf_buf,
+					hbeat_txf_buf, recvlen,
 					fw_versions, FW_VER_COUNT,
 					hb_versions, HB_VER_COUNT,
 					NULL, &hb_srv_version)) {
@@ -482,21 +526,31 @@ static void heartbeat_onchannelcallback(void *context)
 					hb_srv_version >> 16,
 					hb_srv_version & 0xFFFF);
 			}
-		} else {
-			heartbeat_msg =
-				(struct heartbeat_msg_data *)&hbeat_txf_buf[
-					sizeof(struct vmbuspipe_hdr) +
-					sizeof(struct icmsg_hdr)];
+		} else if (icmsghdrp->icmsgtype == ICMSGTYPE_HEARTBEAT) {
+			/*
+			 * Ensure recvlen is big enough to read seq_num. Reserved area is not
+			 * included in the check as the host may not fill it up entirely
+			 */
+			if (recvlen < ICMSG_HDR + sizeof(u64)) {
+				pr_err_ratelimited("Invalid heartbeat msg data. Length too small: %u\n",
+						   recvlen);
+				break;
+			}
+			heartbeat_msg = (struct heartbeat_msg_data *)&hbeat_txf_buf[ICMSG_HDR];
 
 			heartbeat_msg->seq_num += 1;
+		} else {
+			icmsghdrp->status = HV_E_FAIL;
+			pr_err_ratelimited("Heartbeat request received. Invalid msg type: %d\n",
+					   icmsghdrp->icmsgtype);
 		}
 
 		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
 			| ICMSGHDRFLAG_RESPONSE;
 
 		vmbus_sendpacket(channel, hbeat_txf_buf,
-				       recvlen, requestid,
-				       VM_PKT_DATA_INBAND, 0);
+				 recvlen, requestid,
+				 VM_PKT_DATA_INBAND, 0);
 	}
 }
 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index fbae8406d5d4..2ea967bc17ad 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1480,6 +1480,7 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size);
 #define ICMSGTYPE_SHUTDOWN		3
 #define ICMSGTYPE_TIMESYNC		4
 #define ICMSGTYPE_VSS			5
+#define ICMSGTYPE_FCOPY			7
 
 #define ICMSGHDRFLAG_TRANSACTION	1
 #define ICMSGHDRFLAG_REQUEST		2
@@ -1523,6 +1524,12 @@ struct icmsg_hdr {
 	u8 reserved[2];
 } __packed;
 
+#define IC_VERSION_NEGOTIATION_MAX_VER_COUNT 100
+#define ICMSG_HDR (sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr))
+#define ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt) \
+	(ICMSG_HDR + offsetof(struct icmsg_negotiate, icversion_data) + \
+	 (((icframe_vercnt) + (icmsg_vercnt)) * sizeof(struct ic_version)))
+
 struct icmsg_negotiate {
 	u16 icframe_vercnt;
 	u16 icmsg_vercnt;
@@ -1578,7 +1585,7 @@ struct hyperv_service_callback {
 };
 
 #define MAX_SRV_VER	0x7ffffff
-extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf,
+extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen,
 				const int *fw_version, int fw_vercnt,
 				const int *srv_version, int srv_vercnt,
 				int *nego_fw_version, int *nego_srv_version);
-- 
cgit v1.2.3


From e4d221b42354b2e2ddb9187a806afb651eee2cda Mon Sep 17 00:00:00 2001
From: "Andrea Parri (Microsoft)" <parri.andrea@gmail.com>
Date: Wed, 9 Dec 2020 08:08:26 +0100
Subject: Drivers: hv: vmbus: Resolve race condition in vmbus_onoffer_rescind()

An erroneous or malicious host could send multiple rescind messages for
a same channel.  In vmbus_onoffer_rescind(), the guest maps the channel
ID to obtain a pointer to the channel object and it eventually releases
such object and associated data.  The host could time rescind messages
and lead to an use-after-free.  Add a new flag to the channel structure
to make sure that only one instance of vmbus_onoffer_rescind() can get
the reference to the channel object.

Reported-by: Juan Vazquez <juvazq@microsoft.com>
Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20201209070827.29335-6-parri.andrea@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/channel_mgmt.c | 12 ++++++++++++
 include/linux/hyperv.h    |  1 +
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 4072fd1f2214..68950a1e4b63 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -1063,6 +1063,18 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 
 	mutex_lock(&vmbus_connection.channel_mutex);
 	channel = relid2channel(rescind->child_relid);
+	if (channel != NULL) {
+		/*
+		 * Guarantee that no other instance of vmbus_onoffer_rescind()
+		 * has got a reference to the channel object.  Synchronize on
+		 * &vmbus_connection.channel_mutex.
+		 */
+		if (channel->rescind_ref) {
+			mutex_unlock(&vmbus_connection.channel_mutex);
+			return;
+		}
+		channel->rescind_ref = true;
+	}
 	mutex_unlock(&vmbus_connection.channel_mutex);
 
 	if (channel == NULL) {
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 2ea967bc17ad..f0d48a368f13 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -809,6 +809,7 @@ struct vmbus_channel {
 	u8 monitor_bit;
 
 	bool rescind; /* got rescind msg */
+	bool rescind_ref; /* got rescind msg, got channel reference */
 	struct completion rescind_event;
 
 	u32 ringbuffer_gpadlhandle;
-- 
cgit v1.2.3


From a7a5acba0e06b8f9923faa1a726f0ac1380b719a Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@microchip.com>
Date: Thu, 21 Jan 2021 13:05:45 +0200
Subject: mtd: spi-nor: Add Global Block Unlock command

The Global Block Unlock command has different names depending
on the manufacturer, but always the same command value: 0x98.
Macronix's MX25U12835F names it Gang Block Unlock, Winbond's
W25Q128FV names it Global Block Unlock and Microchip's
SST26VF064B names it Global Block Protection Unlock.

Used in the Individual Block Protection mode, which is mutually
exclusive with the Block Protection mode (BP0-3).

Signed-off-by: Tudor Ambarus <tudor.ambarus@microchip.com>
Reviewed-by: Pratyush Yadav <p.yadav@ti.com>
Reviewed-by: Michael Walle <michael@walle.cc>
Link: https://lore.kernel.org/r/20210121110546.382633-1-tudor.ambarus@microchip.com
---
 drivers/mtd/spi-nor/core.c  | 37 +++++++++++++++++++++++++++++++++++++
 drivers/mtd/spi-nor/core.h  |  1 +
 include/linux/mtd/spi-nor.h |  1 +
 3 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index b17faccc95c4..47ded0977056 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -853,6 +853,43 @@ int spi_nor_wait_till_ready(struct spi_nor *nor)
 						    DEFAULT_READY_WAIT_JIFFIES);
 }
 
+/**
+ * spi_nor_global_block_unlock() - Unlock Global Block Protection.
+ * @nor:	pointer to 'struct spi_nor'.
+ *
+ * Return: 0 on success, -errno otherwise.
+ */
+int spi_nor_global_block_unlock(struct spi_nor *nor)
+{
+	int ret;
+
+	ret = spi_nor_write_enable(nor);
+	if (ret)
+		return ret;
+
+	if (nor->spimem) {
+		struct spi_mem_op op =
+			SPI_MEM_OP(SPI_MEM_OP_CMD(SPINOR_OP_GBULK, 0),
+				   SPI_MEM_OP_NO_ADDR,
+				   SPI_MEM_OP_NO_DUMMY,
+				   SPI_MEM_OP_NO_DATA);
+
+		spi_nor_spimem_setup_op(nor, &op, nor->reg_proto);
+
+		ret = spi_mem_exec_op(nor->spimem, &op);
+	} else {
+		ret = spi_nor_controller_ops_write_reg(nor, SPINOR_OP_GBULK,
+						       NULL, 0);
+	}
+
+	if (ret) {
+		dev_dbg(nor->dev, "error %d on Global Block Unlock\n", ret);
+		return ret;
+	}
+
+	return spi_nor_wait_till_ready(nor);
+}
+
 /**
  * spi_nor_write_sr() - Write the Status Register.
  * @nor:	pointer to 'struct spi_nor'.
diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h
index d631ee299de3..eb26796db026 100644
--- a/drivers/mtd/spi-nor/core.h
+++ b/drivers/mtd/spi-nor/core.h
@@ -434,6 +434,7 @@ int spi_nor_write_disable(struct spi_nor *nor);
 int spi_nor_set_4byte_addr_mode(struct spi_nor *nor, bool enable);
 int spi_nor_write_ear(struct spi_nor *nor, u8 ear);
 int spi_nor_wait_till_ready(struct spi_nor *nor);
+int spi_nor_global_block_unlock(struct spi_nor *nor);
 int spi_nor_lock_and_prep(struct spi_nor *nor);
 void spi_nor_unlock_and_unprep(struct spi_nor *nor);
 int spi_nor_sr1_bit6_quad_enable(struct spi_nor *nor);
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index d13958de6d8a..a0d572855444 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -53,6 +53,7 @@
 #define SPINOR_OP_WREAR		0xc5	/* Write Extended Address Register */
 #define SPINOR_OP_SRSTEN	0x66	/* Software Reset Enable */
 #define SPINOR_OP_SRST		0x99	/* Software Reset */
+#define SPINOR_OP_GBULK		0x98    /* Global Block Unlock */
 
 /* 4-byte address opcodes - used on Spansion and some Macronix flashes. */
 #define SPINOR_OP_READ_4B	0x13	/* Read data bytes (low frequency) */
-- 
cgit v1.2.3


From 3ce60f443b143e649aa26cd3f668d645434647ac Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Wed, 3 Feb 2021 15:01:29 +0200
Subject: IB/mlx5: Move mlx5_port_caps from mlx5_core_dev to mlx5_ib_dev

mlx5_port_caps are RDMA specific capabilities. These are not used by the
mlx5_core_device at all. Move them to mlx5_ib_dev where it is used and
reduce the scope of it to multiple drivers.

Link: https://lore.kernel.org/r/20210203130133.4057329-2-leon@kernel.org
Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mad.c     |  8 ++++----
 drivers/infiniband/hw/mlx5/main.c    | 14 ++++++--------
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  8 ++++++++
 drivers/infiniband/hw/mlx5/qp.c      |  6 +++---
 drivers/infiniband/hw/mlx5/wr.c      |  2 +-
 include/linux/mlx5/driver.h          |  8 --------
 6 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 53dec6063245..e9d0a5269582 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -48,7 +48,7 @@ static bool can_do_mad_ifc(struct mlx5_ib_dev *dev, u8 port_num,
 	if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED &&
 	    in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
 		return true;
-	return dev->mdev->port_caps[port_num - 1].has_smi;
+	return dev->port_caps[port_num - 1].has_smi;
 }
 
 static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey,
@@ -299,7 +299,7 @@ int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port)
 
 	packet_error = be16_to_cpu(out_mad->status);
 
-	dev->mdev->port_caps[port - 1].ext_port_cap = (!err && !packet_error) ?
+	dev->port_caps[port - 1].ext_port_cap = (!err && !packet_error) ?
 		MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0;
 
 out:
@@ -549,7 +549,7 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port,
 	props->port_cap_flags	= be32_to_cpup((__be32 *)(out_mad->data + 20));
 	props->gid_tbl_len	= out_mad->data[50];
 	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
-	props->pkey_tbl_len	= mdev->port_caps[port - 1].pkey_table_len;
+	props->pkey_tbl_len	= dev->port_caps[port - 1].pkey_table_len;
 	props->bad_pkey_cntr	= be16_to_cpup((__be16 *)(out_mad->data + 46));
 	props->qkey_viol_cntr	= be16_to_cpup((__be16 *)(out_mad->data + 48));
 	props->active_width	= out_mad->data[31] & 0xf;
@@ -589,7 +589,7 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port,
 
 	/* If reported active speed is QDR, check if is FDR-10 */
 	if (props->active_speed == 4) {
-		if (mdev->port_caps[port - 1].ext_port_cap &
+		if (dev->port_caps[port - 1].ext_port_cap &
 		    MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) {
 			init_query_mad(in_mad);
 			in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 40fb86db3376..d2f185ae2b9c 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2946,8 +2946,8 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev)
 	int err;
 	int port;
 
-	for (port = 1; port <= ARRAY_SIZE(dev->mdev->port_caps); port++) {
-		dev->mdev->port_caps[port - 1].has_smi = false;
+	for (port = 1; port <= ARRAY_SIZE(dev->port_caps); port++) {
+		dev->port_caps[port - 1].has_smi = false;
 		if (MLX5_CAP_GEN(dev->mdev, port_type) ==
 		    MLX5_CAP_PORT_TYPE_IB) {
 			if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
@@ -2959,10 +2959,10 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev)
 						    port, err);
 					return err;
 				}
-				dev->mdev->port_caps[port - 1].has_smi =
+				dev->port_caps[port - 1].has_smi =
 					vport_ctx.has_smi;
 			} else {
-				dev->mdev->port_caps[port - 1].has_smi = true;
+				dev->port_caps[port - 1].has_smi = true;
 			}
 		}
 	}
@@ -3004,10 +3004,8 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
 		goto out;
 	}
 
-	dev->mdev->port_caps[port - 1].pkey_table_len =
-					dprops->max_pkeys;
-	dev->mdev->port_caps[port - 1].gid_table_len =
-					pprops->gid_tbl_len;
+	dev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys;
+	dev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len;
 	mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n",
 		    port, dprops->max_pkeys, pprops->gid_tbl_len);
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 2fd2927abe45..c2f91c15b973 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1036,6 +1036,13 @@ struct mlx5_var_table {
 	u64 num_var_hw_entries;
 };
 
+struct mlx5_port_caps {
+	int gid_table_len;
+	int pkey_table_len;
+	bool has_smi;
+	u8 ext_port_cap;
+};
+
 struct mlx5_ib_dev {
 	struct ib_device		ib_dev;
 	struct mlx5_core_dev		*mdev;
@@ -1096,6 +1103,7 @@ struct mlx5_ib_dev {
 	struct mlx5_var_table var_table;
 
 	struct xarray sig_mrs;
+	struct mlx5_port_caps port_caps[MLX5_MAX_PORTS];
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 88be94215708..5274349dd998 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -3177,10 +3177,10 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 
 	if (ah_flags & IB_AH_GRH) {
 		if (grh->sgid_index >=
-		    dev->mdev->port_caps[port - 1].gid_table_len) {
+		    dev->port_caps[port - 1].gid_table_len) {
 			pr_err("sgid_index (%u) too large. max is %d\n",
 			       grh->sgid_index,
-			       dev->mdev->port_caps[port - 1].gid_table_len);
+			       dev->port_caps[port - 1].gid_table_len);
 			return -EINVAL;
 		}
 	}
@@ -4311,7 +4311,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	if (attr_mask & IB_QP_PKEY_INDEX) {
 		port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
 		if (attr->pkey_index >=
-		    dev->mdev->port_caps[port - 1].pkey_table_len) {
+		    dev->port_caps[port - 1].pkey_table_len) {
 			mlx5_ib_dbg(dev, "invalid pkey index %d\n",
 				    attr->pkey_index);
 			goto out;
diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c
index d6038fb6c50c..cf2852cba45c 100644
--- a/drivers/infiniband/hw/mlx5/wr.c
+++ b/drivers/infiniband/hw/mlx5/wr.c
@@ -1369,7 +1369,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 			handle_qpt_uc(wr, &seg, &size);
 			break;
 		case IB_QPT_SMI:
-			if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) {
+			if (unlikely(!dev->port_caps[qp->port - 1].has_smi)) {
 				mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n");
 				err = -EPERM;
 				*bad_wr = wr;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f93bfe7473aa..11558c2e99f0 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -305,13 +305,6 @@ struct mlx5_cmd {
 	struct mlx5_cmd_stats *stats;
 };
 
-struct mlx5_port_caps {
-	int	gid_table_len;
-	int	pkey_table_len;
-	u8	ext_port_cap;
-	bool	has_smi;
-};
-
 struct mlx5_cmd_mailbox {
 	void	       *buf;
 	dma_addr_t	dma;
@@ -694,7 +687,6 @@ struct mlx5_core_dev {
 	u8			rev_id;
 	char			board_id[MLX5_BOARD_ID_LEN];
 	struct mlx5_cmd		cmd;
-	struct mlx5_port_caps	port_caps[MLX5_MAX_PORTS];
 	struct {
 		u32 hca_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 		u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
-- 
cgit v1.2.3


From 585fc0d2871c9318c949fbf45b1f081edd489e96 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 4 Feb 2021 18:32:03 -0800
Subject: mm: hugetlbfs: fix cannot migrate the fallocated HugeTLB page

If a new hugetlb page is allocated during fallocate it will not be
marked as active (set_page_huge_active) which will result in a later
isolate_huge_page failure when the page migration code would like to
move that page.  Such a failure would be unexpected and wrong.

Only export set_page_huge_active, just leave clear_page_huge_active as
static.  Because there are no external users.

Link: https://lkml.kernel.org/r/20210115124942.46403-3-songmuchun@bytedance.com
Fixes: 70c3547e36f5 (hugetlbfs: add hugetlbfs_fallocate())
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 3 ++-
 include/linux/hugetlb.h | 2 ++
 mm/hugetlb.c            | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b5c109703daa..21c20fd5f9ee 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -735,9 +735,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 
+		set_page_huge_active(page);
 		/*
 		 * unlock_page because locked by add_to_page_cache()
-		 * page_put due to reference from alloc_huge_page()
+		 * put_page() due to reference from alloc_huge_page()
 		 */
 		unlock_page(page);
 		put_page(page);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ebca2ef02212..b5807f23caf8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -770,6 +770,8 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 }
 #endif
 
+void set_page_huge_active(struct page *page);
+
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 18f6ee317900..6f0e242d38ca 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1349,7 +1349,7 @@ bool page_huge_active(struct page *page)
 }
 
 /* never called for tail page */
-static void set_page_huge_active(struct page *page)
+void set_page_huge_active(struct page *page)
 {
 	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
 	SetPagePrivate(&page[1]);
-- 
cgit v1.2.3


From 4f6ec8602341e97b364e4e0d41a1ed08148f5e98 Mon Sep 17 00:00:00 2001
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
Date: Thu, 4 Feb 2021 18:32:24 -0800
Subject: mm/vmalloc: separate put pages and flush VM flags

When VM_MAP_PUT_PAGES was added, it was defined with the same value as
VM_FLUSH_RESET_PERMS.  This doesn't seem like it will cause any big
functional problems other than some excess flushing for VM_MAP_PUT_PAGES
allocations.

Redefine VM_MAP_PUT_PAGES to have its own value.  Also, rearrange things
so flags are less likely to be missed in the future.

Link: https://lkml.kernel.org/r/20210122233706.9304-1-rick.p.edgecombe@intel.com
Fixes: b944afc9d64d ("mm: add a VM_MAP_PUT_PAGES flag for vmap")
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Daniel Axtens <dja@axtens.net>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 80c0181c411d..cedcda6593f6 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -24,7 +24,8 @@ struct notifier_block;		/* in notifier.h */
 #define VM_UNINITIALIZED	0x00000020	/* vm_struct is not fully initialized */
 #define VM_NO_GUARD		0x00000040      /* don't add guard page */
 #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
-#define VM_MAP_PUT_PAGES	0x00000100	/* put pages and free array in vfree */
+#define VM_FLUSH_RESET_PERMS	0x00000100	/* reset direct map and flush TLB on unmap, can't be freed in atomic context */
+#define VM_MAP_PUT_PAGES	0x00000200	/* put pages and free array in vfree */
 
 /*
  * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
@@ -37,12 +38,6 @@ struct notifier_block;		/* in notifier.h */
  * determine which allocations need the module shadow freed.
  */
 
-/*
- * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with
- * vfree_atomic().
- */
-#define VM_FLUSH_RESET_PERMS	0x00000100      /* Reset direct map and flush TLB on unmap */
-
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
-- 
cgit v1.2.3


From 49c6631d3b4f61a7b5bb0453a885a12bfa06ffd8 Mon Sep 17 00:00:00 2001
From: Vincenzo Frascino <vincenzo.frascino@arm.com>
Date: Thu, 4 Feb 2021 18:32:49 -0800
Subject: kasan: add explicit preconditions to kasan_report()

Patch series "kasan: Fix metadata detection for KASAN_HW_TAGS", v5.

With the introduction of KASAN_HW_TAGS, kasan_report() currently assumes
that every location in memory has valid metadata associated.  This is
due to the fact that addr_has_metadata() returns always true.

As a consequence of this, an invalid address (e.g.  NULL pointer
address) passed to kasan_report() when KASAN_HW_TAGS is enabled, leads
to a kernel panic.

Example below, based on arm64:

   BUG: KASAN: invalid-access in 0x0
   Read at addr 0000000000000000 by task swapper/0/1
   Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
   Mem abort info:
     ESR = 0x96000004
     EC = 0x25: DABT (current EL), IL = 32 bits
     SET = 0, FnV = 0
     EA = 0, S1PTW = 0
   Data abort info:
     ISV = 0, ISS = 0x00000004
     CM = 0, WnR = 0

  ...

   Call trace:
    mte_get_mem_tag+0x24/0x40
    kasan_report+0x1a4/0x410
    alsa_sound_last_init+0x8c/0xa4
    do_one_initcall+0x50/0x1b0
    kernel_init_freeable+0x1d4/0x23c
    kernel_init+0x14/0x118
    ret_from_fork+0x10/0x34
   Code: d65f03c0 9000f021 f9428021 b6cfff61 (d9600000)
   ---[ end trace 377c8bb45bdd3a1a ]---
   hrtimer: interrupt took 48694256 ns
   note: swapper/0[1] exited with preempt_count 1
   Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
   SMP: stopping secondary CPUs
   Kernel Offset: 0x35abaf140000 from 0xffff800010000000
   PHYS_OFFSET: 0x40000000
   CPU features: 0x0a7e0152,61c0a030
   Memory Limit: none
   ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---

This series fixes the behavior of addr_has_metadata() that now returns
true only when the address is valid.

This patch (of 2):

With the introduction of KASAN_HW_TAGS, kasan_report() accesses the
metadata only when addr_has_metadata() succeeds.

Add a comment to make sure that the preconditions to the function are
explicitly clarified.

Link: https://lkml.kernel.org/r/20210126134409.47894-1-vincenzo.frascino@arm.com
Link: https://lkml.kernel.org/r/20210126134409.47894-2-vincenzo.frascino@arm.com
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Leon Romanovsky <leonro@mellanox.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index fe1ae73ff8b5..0aea9e2a2a01 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -333,6 +333,13 @@ static inline void *kasan_reset_tag(const void *addr)
 	return (void *)arch_kasan_reset_tag(addr);
 }
 
+/**
+ * kasan_report - print a report about a bad memory access detected by KASAN
+ * @addr: address of the bad access
+ * @size: size of the bad access
+ * @is_write: whether the bad access is a write or a read
+ * @ip: instruction pointer for the accessibility check or the bad access itself
+ */
 bool kasan_report(unsigned long addr, size_t size,
 		bool is_write, unsigned long ip);
 
-- 
cgit v1.2.3


From 6342adcaa683c2b705c24ed201dc11b35854c88d Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@collabora.com>
Date: Wed, 3 Feb 2021 13:00:48 -0500
Subject: entry: Ensure trap after single-step on system call return

Commit 299155244770 ("entry: Drop usage of TIF flags in the generic syscall
code") introduced a bug on architectures using the generic syscall entry
code, in which processes stopped by PTRACE_SYSCALL do not trap on syscall
return after receiving a TIF_SINGLESTEP.

The reason is that the meaning of TIF_SINGLESTEP flag is overloaded to
cause the trap after a system call is executed, but since the above commit,
the syscall call handler only checks for the SYSCALL_WORK flags on the exit
work.

Split the meaning of TIF_SINGLESTEP such that it only means single-step
mode, and create a new type of SYSCALL_WORK to request a trap immediately
after a syscall in single-step mode.  In the current implementation, the
SYSCALL_WORK flag shadows the TIF_SINGLESTEP flag for simplicity.

Update x86 to flip this bit when a tracer enables single stepping.

Fixes: 299155244770 ("entry: Drop usage of TIF flags in the generic syscall code")
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Kyle Huey <me@kylehuey.com>
Link: https://lore.kernel.org/r/87h7mtc9pr.fsf_-_@collabora.com
---
 arch/x86/include/asm/entry-common.h |  2 --
 arch/x86/kernel/step.c              | 10 ++++++++--
 include/linux/entry-common.h        |  1 +
 include/linux/thread_info.h         |  2 ++
 kernel/entry/common.c               | 12 ++----------
 5 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 6fe54b2813c1..2b87b191b3b8 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -43,8 +43,6 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs)
 }
 #define arch_check_user_regs arch_check_user_regs
 
-#define ARCH_SYSCALL_EXIT_WORK		(_TIF_SINGLESTEP)
-
 static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
 						  unsigned long ti_work)
 {
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 60d2c3798ba2..0f3c307b37b3 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -127,12 +127,17 @@ static int enable_single_step(struct task_struct *child)
 		regs->flags |= X86_EFLAGS_TF;
 
 	/*
-	 * Always set TIF_SINGLESTEP - this guarantees that
-	 * we single-step system calls etc..  This will also
+	 * Always set TIF_SINGLESTEP.  This will also
 	 * cause us to set TF when returning to user mode.
 	 */
 	set_tsk_thread_flag(child, TIF_SINGLESTEP);
 
+	/*
+	 * Ensure that a trap is triggered once stepping out of a system
+	 * call prior to executing any user instruction.
+	 */
+	set_task_syscall_work(child, SYSCALL_EXIT_TRAP);
+
 	oflags = regs->flags;
 
 	/* Set TF on the kernel stack.. */
@@ -230,6 +235,7 @@ void user_disable_single_step(struct task_struct *child)
 
 	/* Always clear TIF_SINGLESTEP... */
 	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+	clear_task_syscall_work(child, SYSCALL_EXIT_TRAP);
 
 	/* But touch TF only if it was set by us.. */
 	if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index ca86a00abe86..a104b298019a 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -46,6 +46,7 @@
 				 SYSCALL_WORK_SYSCALL_TRACE |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
 				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
+				 SYSCALL_WORK_SYSCALL_EXIT_TRAP	|	\
 				 ARCH_SYSCALL_WORK_EXIT)
 
 /*
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index c8a974cead73..9b2158c69275 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -43,6 +43,7 @@ enum syscall_work_bit {
 	SYSCALL_WORK_BIT_SYSCALL_EMU,
 	SYSCALL_WORK_BIT_SYSCALL_AUDIT,
 	SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
+	SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
 };
 
 #define SYSCALL_WORK_SECCOMP		BIT(SYSCALL_WORK_BIT_SECCOMP)
@@ -51,6 +52,7 @@ enum syscall_work_bit {
 #define SYSCALL_WORK_SYSCALL_EMU	BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
 #define SYSCALL_WORK_SYSCALL_AUDIT	BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
 #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
+#define SYSCALL_WORK_SYSCALL_EXIT_TRAP	BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
 #endif
 
 #include <asm/thread_info.h>
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 6dd82be60df8..f9d491b17b78 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -209,15 +209,9 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
 	lockdep_sys_exit();
 }
 
-#ifndef _TIF_SINGLESTEP
-static inline bool report_single_step(unsigned long work)
-{
-	return false;
-}
-#else
 /*
  * If SYSCALL_EMU is set, then the only reason to report is when
- * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
  * instruction has been already reported in syscall_enter_from_user_mode().
  */
 static inline bool report_single_step(unsigned long work)
@@ -225,10 +219,8 @@ static inline bool report_single_step(unsigned long work)
 	if (work & SYSCALL_WORK_SYSCALL_EMU)
 		return false;
 
-	return !!(current_thread_info()->flags & _TIF_SINGLESTEP);
+	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
 }
-#endif
-
 
 static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
 {
-- 
cgit v1.2.3


From 10742efc20a429b2040658af685d6bb2aa674a73 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Thu, 21 Jan 2021 19:41:52 +0200
Subject: net/mlx5e: VF tunnel TX traffic offloading

When tunnel endpoint is on VF, driver still assumes that endpoint is on
uplink and incorrectly configures encap rule offload according to that
assumption. As a result, traffic is sent directly to the uplink and rules
installed on representor of tunnel endpoint VF are ignored.

Implement following changes to allow offloading tx traffic with tunnel
endpoint on VF:

- For tunneling flows perform route lookup on route and out devices pair.
If out device is uplink and route device is VF of same physical port, then
modify packet reg_c_0 metadata register (source port) with the value of VF
vport. Use eswitch vhca_id->vport mapping introduced in one of previous
patches in the series to obtain vport from route netdevice.

- Recirculate encapsulated packets to VF vport in order to apply any flow
rules installed on VF representor that match on encapsulated traffic.

Only enable support for this functionality when all following conditions
are true:

- Hardware advertises capability to preserve reg_c_0 value on packet
recirculation.

- Vport metadata matching is enabled.

- Termination tables are to be used by the flow.

Example TC rules for VF tunnel traffic:

1. Rule that redirects packets from UL to VF rep that has the tunnel
endpoint IP address:

$ tc -s filter show dev enp8s0f0 ingress
filter protocol ip pref 4 flower chain 0
filter protocol ip pref 4 flower chain 0 handle 0x1
  dst_mac 16:c9:a0:2d:69:2c
  src_mac 0c:42:a1:58:ab:e4
  eth_type ipv4
  ip_flags nofrag
  in_hw in_hw_count 1
        action order 1: mirred (Egress Redirect to device enp8s0f0_0) stolen
        index 3 ref 1 bind 1 installed 377 sec used 0 sec
        Action statistics:
        Sent 114096 bytes 952 pkt (dropped 0, overlimits 0 requeues 0)
        Sent software 0 bytes 0 pkt
        Sent hardware 114096 bytes 952 pkt
        backlog 0b 0p requeues 0
        cookie 878fa48d8c423fc08c3b6ca599b50a97
        no_percpu
        used_hw_stats delayed

2. Rule that decapsulates the tunneled flow and redirects to destination VF
representor:

$ tc -s filter show dev vxlan_sys_4789 ingress
filter protocol ip pref 4 flower chain 0
filter protocol ip pref 4 flower chain 0 handle 0x1
  dst_mac ca:2e:a7:3f:f5:0f
  src_mac 0a:40:bd:30:89:99
  eth_type ipv4
  enc_dst_ip 7.7.7.5
  enc_src_ip 7.7.7.1
  enc_key_id 98
  enc_dst_port 4789
  enc_tos 0
  ip_flags nofrag
  in_hw in_hw_count 1
        action order 1: tunnel_key  unset pipe
         index 2 ref 1 bind 1 installed 434 sec used 434 sec
        Action statistics:
        Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0
        used_hw_stats delayed

        action order 2: mirred (Egress Redirect to device enp8s0f0_1) stolen
        index 4 ref 1 bind 1 installed 434 sec used 0 sec
        Action statistics:
        Sent 129936 bytes 1082 pkt (dropped 0, overlimits 0 requeues 0)
        Sent software 0 bytes 0 pkt
        Sent hardware 129936 bytes 1082 pkt
        backlog 0b 0p requeues 0
        cookie ac17cf398c4c69e4a5b2f7aabd1b88ff
        no_percpu
        used_hw_stats delayed

Co-developed-by: Dmytro Linkin <dlinkin@nvidia.com>
Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com>
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  87 +++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h    |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |   2 +
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 120 +++++++++++++++++++--
 include/linux/mlx5/eswitch.h                       |   2 +
 5 files changed, 201 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 280ea1e1e039..43f1508a05b5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -165,6 +165,11 @@ struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[] = {
 		.moffset = 0,
 		.mlen = 2,
 	},
+	[VPORT_TO_REG] = {
+		.mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_0,
+		.moffset = 2,
+		.mlen = 2,
+	},
 	[TUNNEL_TO_REG] = {
 		.mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1,
 		.moffset = 1,
@@ -1315,6 +1320,44 @@ static void remove_unready_flow(struct mlx5e_tc_flow *flow)
 	mutex_unlock(&uplink_priv->unready_flows_lock);
 }
 
+static bool same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv);
+
+static bool mlx5e_tc_is_vf_tunnel(struct net_device *out_dev, struct net_device *route_dev)
+{
+	struct mlx5_core_dev *out_mdev, *route_mdev;
+	struct mlx5e_priv *out_priv, *route_priv;
+
+	out_priv = netdev_priv(out_dev);
+	out_mdev = out_priv->mdev;
+	route_priv = netdev_priv(route_dev);
+	route_mdev = route_priv->mdev;
+
+	if (out_mdev->coredev_type != MLX5_COREDEV_PF ||
+	    route_mdev->coredev_type != MLX5_COREDEV_VF)
+		return false;
+
+	return same_hw_devs(out_priv, route_priv);
+}
+
+static int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *route_dev,
+				      u16 *vport)
+{
+	struct mlx5e_priv *out_priv, *route_priv;
+	struct mlx5_core_dev *route_mdev;
+	struct mlx5_eswitch *esw;
+	u16 vhca_id;
+	int err;
+
+	out_priv = netdev_priv(out_dev);
+	esw = out_priv->mdev->priv.eswitch;
+	route_priv = netdev_priv(route_dev);
+	route_mdev = route_priv->mdev;
+
+	vhca_id = MLX5_CAP_GEN(route_mdev, vhca_id);
+	err = mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport);
+	return err;
+}
+
 static int
 mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 		      struct mlx5e_tc_flow *flow,
@@ -3700,6 +3743,45 @@ static bool is_duplicated_encap_entry(struct mlx5e_priv *priv,
 	return false;
 }
 
+static int mlx5e_set_vf_tunnel(struct mlx5_eswitch *esw,
+			       struct mlx5_flow_attr *attr,
+			       struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts,
+			       struct net_device *out_dev,
+			       int route_dev_ifindex,
+			       int out_index)
+{
+	struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
+	struct net_device *route_dev;
+	u16 vport_num;
+	int err = 0;
+	u32 data;
+
+	route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex);
+
+	if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops ||
+	    !mlx5e_tc_is_vf_tunnel(out_dev, route_dev))
+		goto out;
+
+	err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num);
+	if (err)
+		goto out;
+
+	attr->dest_chain = 0;
+	attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+	esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE;
+	data = mlx5_eswitch_get_vport_metadata_for_set(esw_attr->in_mdev->priv.eswitch,
+						       vport_num);
+	err = mlx5e_tc_match_to_reg_set(esw->dev, mod_hdr_acts,
+					MLX5_FLOW_NAMESPACE_FDB, VPORT_TO_REG, data);
+	if (err)
+		goto out;
+
+out:
+	if (route_dev)
+		dev_put(route_dev);
+	return err;
+}
+
 static int mlx5e_attach_encap(struct mlx5e_priv *priv,
 			      struct mlx5e_tc_flow *flow,
 			      struct net_device *mirred_dev,
@@ -3791,6 +3873,11 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
 	e->compl_result = 1;
 
 attach_flow:
+	err = mlx5e_set_vf_tunnel(esw, attr, &parse_attr->mod_hdr_acts, e->out_dev,
+				  e->route_dev_ifindex, out_index);
+	if (err)
+		goto out_err;
+
 	flow->encaps[out_index].e = e;
 	list_add(&flow->encaps[out_index].list, &e->flows);
 	flow->encaps[out_index].index = out_index;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
index 4a2ce241522e..56d809904ea7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
@@ -167,6 +167,7 @@ void mlx5e_tc_reoffload_flows_work(struct work_struct *work);
 
 enum mlx5e_tc_attr_to_reg {
 	CHAIN_TO_REG,
+	VPORT_TO_REG,
 	TUNNEL_TO_REG,
 	CTSTATE_TO_REG,
 	ZONE_TO_REG,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 1a045e95bc68..1ab34751329e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -389,12 +389,14 @@ enum mlx5_flow_match_level {
 enum {
 	MLX5_ESW_DEST_ENCAP         = BIT(0),
 	MLX5_ESW_DEST_ENCAP_VALID   = BIT(1),
+	MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE  = BIT(2),
 };
 
 enum {
 	MLX5_ESW_ATTR_FLAG_VLAN_HANDLED  = BIT(0),
 	MLX5_ESW_ATTR_FLAG_SLOW_PATH     = BIT(1),
 	MLX5_ESW_ATTR_FLAG_NO_IN_PORT    = BIT(2),
+	MLX5_ESW_ATTR_FLAG_SRC_REWRITE   = BIT(3),
 };
 
 struct mlx5_esw_flow_attr {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 335dc83d1bb9..1b18f624e04a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -337,6 +337,65 @@ esw_setup_chain_dest(struct mlx5_flow_destination *dest,
 	return  0;
 }
 
+static void esw_put_dest_tables_loop(struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr,
+				     int from, int to)
+{
+	struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
+	struct mlx5_fs_chains *chains = esw_chains(esw);
+	int i;
+
+	for (i = from; i < to; i++)
+		if (esw_attr->dests[i].flags & MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE)
+			mlx5_chains_put_table(chains, 0, 1, 0);
+}
+
+static bool
+esw_is_chain_src_port_rewrite(struct mlx5_eswitch *esw, struct mlx5_esw_flow_attr *esw_attr)
+{
+	int i;
+
+	for (i = esw_attr->split_count; i < esw_attr->out_count; i++)
+		if (esw_attr->dests[i].flags & MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE)
+			return true;
+	return false;
+}
+
+static int
+esw_setup_chain_src_port_rewrite(struct mlx5_flow_destination *dest,
+				 struct mlx5_flow_act *flow_act,
+				 struct mlx5_eswitch *esw,
+				 struct mlx5_fs_chains *chains,
+				 struct mlx5_flow_attr *attr,
+				 int *i)
+{
+	struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
+	int j, err;
+
+	if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SRC_REWRITE))
+		return -EOPNOTSUPP;
+
+	for (j = esw_attr->split_count; j < esw_attr->out_count; j++, (*i)++) {
+		err = esw_setup_chain_dest(dest, flow_act, chains, attr->dest_chain, 1, 0, *i);
+		if (err)
+			goto err_setup_chain;
+		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+		flow_act->pkt_reformat = esw_attr->dests[j].pkt_reformat;
+	}
+	return 0;
+
+err_setup_chain:
+	esw_put_dest_tables_loop(esw, attr, esw_attr->split_count, j);
+	return err;
+}
+
+static void esw_cleanup_chain_src_port_rewrite(struct mlx5_eswitch *esw,
+					       struct mlx5_flow_attr *attr)
+{
+	struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
+
+	esw_put_dest_tables_loop(esw, attr, esw_attr->split_count, esw_attr->out_count);
+}
+
 static void
 esw_cleanup_chain_dest(struct mlx5_fs_chains *chains, u32 chain, u32 prio, u32 level)
 {
@@ -381,12 +440,18 @@ esw_setup_dests(struct mlx5_flow_destination *dest,
 		struct mlx5_flow_act *flow_act,
 		struct mlx5_eswitch *esw,
 		struct mlx5_flow_attr *attr,
+		struct mlx5_flow_spec *spec,
 		int *i)
 {
 	struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
 	struct mlx5_fs_chains *chains = esw_chains(esw);
 	int err = 0;
 
+	if (!mlx5_eswitch_termtbl_required(esw, attr, flow_act, spec) &&
+	    MLX5_CAP_GEN(esw_attr->in_mdev, reg_c_preserve) &&
+	    mlx5_eswitch_vport_match_metadata_enabled(esw))
+		attr->flags |= MLX5_ESW_ATTR_FLAG_SRC_REWRITE;
+
 	if (attr->dest_ft) {
 		esw_setup_ft_dest(dest, flow_act, attr, *i);
 		(*i)++;
@@ -397,6 +462,8 @@ esw_setup_dests(struct mlx5_flow_destination *dest,
 		err = esw_setup_chain_dest(dest, flow_act, chains, attr->dest_chain,
 					   1, 0, *i);
 		(*i)++;
+	} else if (esw_is_chain_src_port_rewrite(esw, esw_attr)) {
+		err = esw_setup_chain_src_port_rewrite(dest, flow_act, esw, chains, attr, i);
 	} else {
 		*i = esw_setup_vport_dests(dest, flow_act, esw, esw_attr, *i);
 	}
@@ -408,10 +475,15 @@ static void
 esw_cleanup_dests(struct mlx5_eswitch *esw,
 		  struct mlx5_flow_attr *attr)
 {
+	struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr;
 	struct mlx5_fs_chains *chains = esw_chains(esw);
 
-	if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH) && attr->dest_chain)
-		esw_cleanup_chain_dest(chains, attr->dest_chain, 1, 0);
+	if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH)) {
+		if (attr->dest_chain)
+			esw_cleanup_chain_dest(chains, attr->dest_chain, 1, 0);
+		else if (esw_is_chain_src_port_rewrite(esw, esw_attr))
+			esw_cleanup_chain_src_port_rewrite(esw, attr);
+	}
 }
 
 struct mlx5_flow_handle *
@@ -448,10 +520,12 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 		}
 	}
 
+	mlx5_eswitch_set_rule_flow_source(esw, spec, esw_attr);
+
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
 		int err;
 
-		err = esw_setup_dests(dest, &flow_act, esw, attr, &i);
+		err = esw_setup_dests(dest, &flow_act, esw, attr, spec, &i);
 		if (err) {
 			rule = ERR_PTR(err);
 			goto err_create_goto_table;
@@ -498,8 +572,6 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 		goto err_esw_get;
 	}
 
-	mlx5_eswitch_set_rule_flow_source(esw, spec, esw_attr);
-
 	if (mlx5_eswitch_termtbl_required(esw, attr, &flow_act, spec))
 		rule = mlx5_eswitch_add_termtbl_rule(esw, fdb, spec, esw_attr,
 						     &flow_act, dest, i);
@@ -536,7 +608,7 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 	struct mlx5_flow_table *fast_fdb;
 	struct mlx5_flow_table *fwd_fdb;
 	struct mlx5_flow_handle *rule;
-	int i;
+	int i, err = 0;
 
 	fast_fdb = mlx5_chains_get_table(chains, attr->chain, attr->prio, 0);
 	if (IS_ERR(fast_fdb)) {
@@ -554,8 +626,18 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 	}
 
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
-	for (i = 0; i < esw_attr->split_count; i++)
-		esw_setup_vport_dest(dest, &flow_act, esw, esw_attr, i, i, false);
+	for (i = 0; i < esw_attr->split_count; i++) {
+		if (esw_is_chain_src_port_rewrite(esw, esw_attr))
+			err = esw_setup_chain_src_port_rewrite(dest, &flow_act, esw, chains, attr,
+							       &i);
+		else
+			esw_setup_vport_dest(dest, &flow_act, esw, esw_attr, i, i, false);
+
+		if (err) {
+			rule = ERR_PTR(err);
+			goto err_chain_src_rewrite;
+		}
+	}
 	dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
 	dest[i].ft = fwd_fdb;
 	i++;
@@ -570,13 +652,16 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 	flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL;
 	rule = mlx5_add_flow_rules(fast_fdb, spec, &flow_act, dest, i);
 
-	if (IS_ERR(rule))
-		goto add_err;
+	if (IS_ERR(rule)) {
+		i = esw_attr->split_count;
+		goto err_chain_src_rewrite;
+	}
 
 	atomic64_inc(&esw->offloads.num_flows);
 
 	return rule;
-add_err:
+err_chain_src_rewrite:
+	esw_put_dest_tables_loop(esw, attr, 0, i);
 	esw_vport_tbl_put(esw, &fwd_attr);
 err_get_fwd:
 	mlx5_chains_put_table(chains, attr->chain, attr->prio, 0);
@@ -617,6 +702,7 @@ __mlx5_eswitch_del_rule(struct mlx5_eswitch *esw,
 	if (fwd_rule)  {
 		esw_vport_tbl_put(esw, &fwd_attr);
 		mlx5_chains_put_table(chains, attr->chain, attr->prio, 0);
+		esw_put_dest_tables_loop(esw, attr, 0, esw_attr->split_count);
 	} else {
 		if (split)
 			esw_vport_tbl_put(esw, &fwd_attr);
@@ -3020,3 +3106,15 @@ int mlx5_eswitch_vhca_id_to_vport(struct mlx5_eswitch *esw, u16 vhca_id, u16 *vp
 	*vport_num = *res;
 	return 0;
 }
+
+u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
+					    u16 vport_num)
+{
+	struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num);
+
+	if (WARN_ON_ONCE(IS_ERR(vport)))
+		return 0;
+
+	return vport->metadata;
+}
+EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_set);
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 29fd832950e0..67e341274a22 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -96,6 +96,8 @@ static inline u32 mlx5_eswitch_get_vport_metadata_mask(void)
 
 u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw,
 					      u16 vport_num);
+u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
+					    u16 vport_num);
 u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev);
 #else  /* CONFIG_MLX5_ESWITCH */
 
-- 
cgit v1.2.3


From 48d216e5596a58e3cfa6d4548343f982c5921b79 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Mon, 31 Aug 2020 16:18:19 +0300
Subject: net/mlx5e: Refactor reg_c1 usage

Following patch in series uses reg_c1 in eswitch code. To use reg_c1
helpers in both TC and eswitch code, refactor existing helpers according to
similar use case of reg_c0 and move the functionality into eswitch.h.
Calculate reg mappings length from new defines to ensure that they are
always in sync and only need to be changed in single place.

Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h  |  6 ++----
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c     |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h     |  2 +-
 include/linux/mlx5/eswitch.h                        | 19 +++++++++++++++++++
 5 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
index 76177f7c5ec2..14bcebd4a0b6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
@@ -651,7 +651,7 @@ bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
 
 		tc_skb_ext->chain = chain;
 
-		zone_restore_id = reg_c1 & ZONE_RESTORE_MAX;
+		zone_restore_id = reg_c1 & ESW_ZONE_ID_MASK;
 
 		uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
 		uplink_priv = &uplink_rpriv->uplink_priv;
@@ -660,7 +660,7 @@ bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
 			return false;
 	}
 
-	tunnel_id = reg_c1 >> REG_MAPPING_SHIFT(TUNNEL_TO_REG);
+	tunnel_id = reg_c1 >> ESW_TUN_OFFSET;
 	return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id);
 #endif /* CONFIG_NET_TC_SKB_EXT */
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h
index 6503b614337c..69e618d17071 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h
@@ -73,7 +73,7 @@ struct mlx5_ct_attr {
 #define zone_restore_to_reg_ct {\
 	.mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1,\
 	.moffset = 0,\
-	.mlen = 1,\
+	.mlen = (ESW_ZONE_ID_BITS / 8),\
 	.soffset = MLX5_BYTE_OFF(fte_match_param,\
 				 misc_parameters_2.metadata_reg_c_1) + 3,\
 }
@@ -81,14 +81,12 @@ struct mlx5_ct_attr {
 #define nic_zone_restore_to_reg_ct {\
 	.mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_B,\
 	.moffset = 2,\
-	.mlen = 1,\
+	.mlen = (ESW_ZONE_ID_BITS / 8),\
 }
 
 #define REG_MAPPING_MLEN(reg) (mlx5e_tc_attr_to_reg_mappings[reg].mlen)
 #define REG_MAPPING_MOFFSET(reg) (mlx5e_tc_attr_to_reg_mappings[reg].moffset)
 #define REG_MAPPING_SHIFT(reg) (REG_MAPPING_MOFFSET(reg) * 8)
-#define ZONE_RESTORE_BITS (REG_MAPPING_MLEN(ZONE_RESTORE_TO_REG) * 8)
-#define ZONE_RESTORE_MAX GENMASK(ZONE_RESTORE_BITS - 1, 0)
 
 #if IS_ENABLED(CONFIG_MLX5_TC_CT)
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 098f3efa5d4d..90db5a99879d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -173,7 +173,7 @@ struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[] = {
 	[TUNNEL_TO_REG] = {
 		.mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1,
 		.moffset = 1,
-		.mlen = 3,
+		.mlen = ((ESW_TUN_OPTS_BITS + ESW_TUN_ID_BITS) / 8),
 		.soffset = MLX5_BYTE_OFF(fte_match_param,
 					 misc_parameters_2.metadata_reg_c_1),
 	},
@@ -5649,7 +5649,7 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe,
 		tc_skb_ext->chain = chain;
 
 		zone_restore_id = (reg_b >> REG_MAPPING_SHIFT(NIC_ZONE_RESTORE_TO_REG)) &
-				  ZONE_RESTORE_MAX;
+			ESW_ZONE_ID_MASK;
 
 		if (!mlx5e_tc_ct_restore_flow(tc->ct, skb,
 					      zone_restore_id))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
index ee0029192504..1e4ee02bfb1c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
@@ -302,7 +302,7 @@ static inline bool mlx5e_cqe_regb_chain(struct mlx5_cqe64 *cqe)
 
 	reg_b = be32_to_cpu(cqe->ft_metadata);
 
-	if (reg_b >> (MLX5E_TC_TABLE_CHAIN_TAG_BITS + ZONE_RESTORE_BITS))
+	if (reg_b >> (MLX5E_TC_TABLE_CHAIN_TAG_BITS + ESW_ZONE_ID_BITS))
 		return false;
 
 	chain = reg_b & MLX5E_TC_TABLE_CHAIN_TAG_MASK;
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 67e341274a22..3b20e84049c1 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -98,6 +98,25 @@ u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw,
 					      u16 vport_num);
 u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
 					    u16 vport_num);
+
+/* Reg C1 usage:
+ * Reg C1 = < ESW_TUN_ID(12) | ESW_TUN_OPTS(12) | ESW_ZONE_ID(8) >
+ *
+ * Highest 12 bits of reg c1 is the encapsulation tunnel id, next 12 bits is
+ * encapsulation tunnel options, and the lowest 8 bits are used for zone id.
+ *
+ * Zone id is used to restore CT flow when packet misses on chain.
+ *
+ * Tunnel id and options are used together to restore the tunnel info metadata
+ * on miss and to support inner header rewrite by means of implicit chain 0
+ * flows.
+ */
+#define ESW_ZONE_ID_BITS 8
+#define ESW_TUN_OPTS_BITS 12
+#define ESW_TUN_ID_BITS 12
+#define ESW_TUN_OFFSET ESW_ZONE_ID_BITS
+#define ESW_ZONE_ID_MASK GENMASK(ESW_ZONE_ID_BITS - 1, 0)
+
 u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev);
 #else  /* CONFIG_MLX5_ESWITCH */
 
-- 
cgit v1.2.3


From 8e404fefa58b6138531e3d4b5647ee79f75ae9a8 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Mon, 31 Aug 2020 16:18:57 +0300
Subject: net/mlx5e: Match recirculated packet miss in slow table using reg_c1

Previous patch in series that implements stack devices RX path implements
indirect table rules that match on tunnel VNI. After such rule is created
all tunnel traffic is recirculated to root table. However, recirculated
packet might not match on any rules installed in the table (for example,
when IP traffic follows ARP traffic). In that case packets appear on
representor of tunnel endpoint VF instead being redirected to the VF
itself.

Extend slow table with additional flow group that matches on reg_c0 (source
port value set by indirect tables implemented by previous patch in series)
and reg_c1 (special 0xFFF mark). When creating offloads fdb tables, install
one rule per VF vport to match on recirculated miss packets and redirect
them to appropriate VF vport. Modify indirect tables code to also rewrite
reg_c1 with special 0xFFF mark.

Implementation reuses reg_c1 tunnel id bits. This is safe to do because
recirculated packets are always matched before decapsulation.

Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |   3 +-
 .../ethernet/mellanox/mlx5/core/esw/indir_table.c  |  17 ++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |   2 +
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 119 ++++++++++++++++++++-
 include/linux/mlx5/eswitch.h                       |  10 +-
 5 files changed, 143 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 90db5a99879d..21568d1fc00f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -5515,7 +5515,8 @@ int mlx5e_tc_esw_init(struct rhashtable *tc_ht)
 	}
 	uplink_priv->tunnel_mapping = mapping;
 
-	mapping = mapping_create(sz_enc_opts, ENC_OPTS_BITS_MASK, true);
+	/* 0xFFF is reserved for stack devices slow path table mark */
+	mapping = mapping_create(sz_enc_opts, ENC_OPTS_BITS_MASK - 1, true);
 	if (IS_ERR(mapping)) {
 		err = PTR_ERR(mapping);
 		goto err_enc_opts_mapping;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c
index a0ebf40c9907..b7d00c4c7046 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c
@@ -162,7 +162,7 @@ static int mlx5_esw_indir_table_rule_get(struct mlx5_eswitch *esw,
 			 (attr->ip_version == 4 ? ETH_P_IP : ETH_P_IPV6));
 	} else {
 		err = -EOPNOTSUPP;
-		goto err_mod_hdr;
+		goto err_ethertype;
 	}
 
 	if (attr->ip_version == 4) {
@@ -198,13 +198,18 @@ static int mlx5_esw_indir_table_rule_get(struct mlx5_eswitch *esw,
 	err = mlx5e_tc_match_to_reg_set(esw->dev, &mod_acts, MLX5_FLOW_NAMESPACE_FDB,
 					VPORT_TO_REG, data);
 	if (err)
-		goto err_mod_hdr;
+		goto err_mod_hdr_regc0;
+
+	err = mlx5e_tc_match_to_reg_set(esw->dev, &mod_acts, MLX5_FLOW_NAMESPACE_FDB,
+					TUNNEL_TO_REG, ESW_TUN_SLOW_TABLE_GOTO_VPORT);
+	if (err)
+		goto err_mod_hdr_regc1;
 
 	flow_act.modify_hdr = mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_FDB,
 						       mod_acts.num_actions, mod_acts.actions);
 	if (IS_ERR(flow_act.modify_hdr)) {
 		err = PTR_ERR(flow_act.modify_hdr);
-		goto err_mod_hdr;
+		goto err_mod_hdr_alloc;
 	}
 
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
@@ -236,7 +241,11 @@ err_handle:
 	mlx5_chains_put_table(chains, 0, 1, 0);
 err_table:
 	mlx5_modify_header_dealloc(esw->dev, flow_act.modify_hdr);
-err_mod_hdr:
+err_mod_hdr_alloc:
+err_mod_hdr_regc1:
+	dealloc_mod_hdr_actions(&mod_acts);
+err_mod_hdr_regc0:
+err_ethertype:
 	kfree(rule);
 out:
 	kfree(rule_spec);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index c2361c5b824c..34a21ff95472 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -179,9 +179,11 @@ struct mlx5_eswitch_fdb {
 			struct mlx5_flow_namespace *ns;
 			struct mlx5_flow_table *slow_fdb;
 			struct mlx5_flow_group *send_to_vport_grp;
+			struct mlx5_flow_group *send_to_vport_meta_grp;
 			struct mlx5_flow_group *peer_miss_grp;
 			struct mlx5_flow_handle **peer_miss_rules;
 			struct mlx5_flow_group *miss_grp;
+			struct mlx5_flow_handle **send_to_vport_meta_rules;
 			struct mlx5_flow_handle *miss_rule_uni;
 			struct mlx5_flow_handle *miss_rule_multi;
 			int vlan_push_pop_refcount;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index a44728595420..94cb0217b4f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1080,6 +1080,81 @@ void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule)
 	mlx5_del_flow_rules(rule);
 }
 
+static void mlx5_eswitch_del_send_to_vport_meta_rules(struct mlx5_eswitch *esw)
+{
+	struct mlx5_flow_handle **flows = esw->fdb_table.offloads.send_to_vport_meta_rules;
+	int i = 0, num_vfs = esw->esw_funcs.num_vfs, vport_num;
+
+	if (!num_vfs || !flows)
+		return;
+
+	mlx5_esw_for_each_vf_vport_num(esw, vport_num, num_vfs)
+		mlx5_del_flow_rules(flows[i++]);
+
+	kvfree(flows);
+}
+
+static int
+mlx5_eswitch_add_send_to_vport_meta_rules(struct mlx5_eswitch *esw)
+{
+	int num_vfs, vport_num, rule_idx = 0, err = 0;
+	struct mlx5_flow_destination dest = {};
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_handle *flow_rule;
+	struct mlx5_flow_handle **flows;
+	struct mlx5_flow_spec *spec;
+
+	num_vfs = esw->esw_funcs.num_vfs;
+	flows = kvzalloc(num_vfs * sizeof(*flows), GFP_KERNEL);
+	if (!flows)
+		return -ENOMEM;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec) {
+		err = -ENOMEM;
+		goto alloc_err;
+	}
+
+	MLX5_SET(fte_match_param, spec->match_criteria,
+		 misc_parameters_2.metadata_reg_c_0, mlx5_eswitch_get_vport_metadata_mask());
+	MLX5_SET(fte_match_param, spec->match_criteria,
+		 misc_parameters_2.metadata_reg_c_1, ESW_TUN_MASK);
+	MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_1,
+		 ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK);
+
+	spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2;
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+
+	mlx5_esw_for_each_vf_vport_num(esw, vport_num, num_vfs) {
+		MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_0,
+			 mlx5_eswitch_get_vport_metadata_for_match(esw, vport_num));
+		dest.vport.num = vport_num;
+
+		flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb,
+						spec, &flow_act, &dest, 1);
+		if (IS_ERR(flow_rule)) {
+			err = PTR_ERR(flow_rule);
+			esw_warn(esw->dev, "FDB: Failed to add send to vport meta rule idx %d, err %ld\n",
+				 rule_idx, PTR_ERR(flow_rule));
+			goto rule_err;
+		}
+		flows[rule_idx++] = flow_rule;
+	}
+
+	esw->fdb_table.offloads.send_to_vport_meta_rules = flows;
+	kvfree(spec);
+	return 0;
+
+rule_err:
+	while (--rule_idx >= 0)
+		mlx5_del_flow_rules(flows[rule_idx]);
+	kvfree(spec);
+alloc_err:
+	kvfree(flows);
+	return err;
+}
+
 static bool mlx5_eswitch_reg_c1_loopback_supported(struct mlx5_eswitch *esw)
 {
 	return MLX5_CAP_ESW_FLOWTABLE(esw->dev, fdb_to_vport_reg_c_id) &
@@ -1562,11 +1637,11 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw)
 {
 	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
 	struct mlx5_flow_table_attr ft_attr = {};
+	int num_vfs, table_size, ix, err = 0;
 	struct mlx5_core_dev *dev = esw->dev;
 	struct mlx5_flow_namespace *root_ns;
 	struct mlx5_flow_table *fdb = NULL;
 	u32 flags = 0, *flow_group_in;
-	int table_size, ix, err = 0;
 	struct mlx5_flow_group *g;
 	void *match_criteria;
 	u8 *dmac;
@@ -1592,7 +1667,7 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw)
 	}
 
 	table_size = esw->total_vports * MAX_SQ_NVPORTS + MAX_PF_SQ +
-		MLX5_ESW_MISS_FLOWS + esw->total_vports;
+		MLX5_ESW_MISS_FLOWS + esw->total_vports + esw->esw_funcs.num_vfs;
 
 	/* create the slow path fdb with encap set, so further table instances
 	 * can be created at run time while VFs are probed if the FW allows that.
@@ -1640,6 +1715,38 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw)
 	}
 	esw->fdb_table.offloads.send_to_vport_grp = g;
 
+	/* meta send to vport */
+	memset(flow_group_in, 0, inlen);
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
+		 MLX5_MATCH_MISC_PARAMETERS_2);
+
+	match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
+
+	MLX5_SET(fte_match_param, match_criteria,
+		 misc_parameters_2.metadata_reg_c_0, mlx5_eswitch_get_vport_metadata_mask());
+	MLX5_SET(fte_match_param, match_criteria,
+		 misc_parameters_2.metadata_reg_c_1, ESW_TUN_MASK);
+
+	num_vfs = esw->esw_funcs.num_vfs;
+	if (num_vfs) {
+		MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, ix);
+		MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ix + num_vfs - 1);
+		ix += num_vfs;
+
+		g = mlx5_create_flow_group(fdb, flow_group_in);
+		if (IS_ERR(g)) {
+			err = PTR_ERR(g);
+			esw_warn(dev, "Failed to create send-to-vport meta flow group err(%d)\n",
+				 err);
+			goto send_vport_meta_err;
+		}
+		esw->fdb_table.offloads.send_to_vport_meta_grp = g;
+
+		err = mlx5_eswitch_add_send_to_vport_meta_rules(esw);
+		if (err)
+			goto meta_rule_err;
+	}
+
 	if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) {
 		/* create peer esw miss group */
 		memset(flow_group_in, 0, inlen);
@@ -1707,6 +1814,11 @@ miss_err:
 	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
 		mlx5_destroy_flow_group(esw->fdb_table.offloads.peer_miss_grp);
 peer_miss_err:
+	mlx5_eswitch_del_send_to_vport_meta_rules(esw);
+meta_rule_err:
+	if (esw->fdb_table.offloads.send_to_vport_meta_grp)
+		mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_meta_grp);
+send_vport_meta_err:
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
 send_vport_err:
 	esw_chains_destroy(esw, esw_chains(esw));
@@ -1728,7 +1840,10 @@ static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
 	esw_debug(esw->dev, "Destroy offloads FDB Tables\n");
 	mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_multi);
 	mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_uni);
+	mlx5_eswitch_del_send_to_vport_meta_rules(esw);
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
+	if (esw->fdb_table.offloads.send_to_vport_meta_grp)
+		mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_meta_grp);
 	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
 		mlx5_destroy_flow_group(esw->fdb_table.offloads.peer_miss_grp);
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 3b20e84049c1..994c2c8cb4fd 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -114,8 +114,16 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
 #define ESW_ZONE_ID_BITS 8
 #define ESW_TUN_OPTS_BITS 12
 #define ESW_TUN_ID_BITS 12
-#define ESW_TUN_OFFSET ESW_ZONE_ID_BITS
+#define ESW_TUN_OPTS_OFFSET ESW_ZONE_ID_BITS
+#define ESW_TUN_OFFSET ESW_TUN_OPTS_OFFSET
 #define ESW_ZONE_ID_MASK GENMASK(ESW_ZONE_ID_BITS - 1, 0)
+#define ESW_TUN_OPTS_MASK GENMASK(32 - ESW_TUN_ID_BITS - 1, ESW_TUN_OPTS_OFFSET)
+#define ESW_TUN_MASK GENMASK(31, ESW_TUN_OFFSET)
+#define ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT 0 /* 0 is not a valid tunnel id */
+#define ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT 0xFFF /* 0xFFF is a reserved mapping */
+#define ESW_TUN_SLOW_TABLE_GOTO_VPORT ((ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT << ESW_TUN_OPTS_BITS) | \
+				       ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT)
+#define ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK ESW_TUN_OPTS_MASK
 
 u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev);
 #else  /* CONFIG_MLX5_ESWITCH */
-- 
cgit v1.2.3


From 52c2d15703c3a900d5f78cd599b823db40d5100b Mon Sep 17 00:00:00 2001
From: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Date: Tue, 19 Jan 2021 17:36:14 -0800
Subject: usb: common: Parse for USB SSP genXxY

The USB "maximum-speed" property can now take the SSP signaling rate
generation and lane count with these new strings:

"super-speed-plus-gen2x2"
"super-speed-plus-gen2x1"
"super-speed-plus-gen1x2"

Introduce usb_get_maximum_ssp_rate() to parse for the corresponding
usb_ssp_rate enum. The original usb_get_maximum_speed() will return
USB_SPEED_SUPER_PLUS if it matches one of these new strings.

Signed-off-by: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Link: https://lore.kernel.org/r/f8ed896313d8cd8e2d2b540fc82db92b3ddf8a47.1611106162.git.Thinh.Nguyen@synopsys.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/common/common.c | 26 +++++++++++++++++++++++++-
 include/linux/usb/ch9.h     | 11 +++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/common/common.c b/drivers/usb/common/common.c
index 1433260d99b4..fc21cf2d36f6 100644
--- a/drivers/usb/common/common.c
+++ b/drivers/usb/common/common.c
@@ -69,6 +69,13 @@ static const char *const speed_names[] = {
 	[USB_SPEED_SUPER_PLUS] = "super-speed-plus",
 };
 
+static const char *const ssp_rate[] = {
+	[USB_SSP_GEN_UNKNOWN] = "UNKNOWN",
+	[USB_SSP_GEN_2x1] = "super-speed-plus-gen2x1",
+	[USB_SSP_GEN_1x2] = "super-speed-plus-gen1x2",
+	[USB_SSP_GEN_2x2] = "super-speed-plus-gen2x2",
+};
+
 const char *usb_speed_string(enum usb_device_speed speed)
 {
 	if (speed < 0 || speed >= ARRAY_SIZE(speed_names))
@@ -86,12 +93,29 @@ enum usb_device_speed usb_get_maximum_speed(struct device *dev)
 	if (ret < 0)
 		return USB_SPEED_UNKNOWN;
 
-	ret = match_string(speed_names, ARRAY_SIZE(speed_names), maximum_speed);
+	ret = match_string(ssp_rate, ARRAY_SIZE(ssp_rate), maximum_speed);
+	if (ret > 0)
+		return USB_SPEED_SUPER_PLUS;
 
+	ret = match_string(speed_names, ARRAY_SIZE(speed_names), maximum_speed);
 	return (ret < 0) ? USB_SPEED_UNKNOWN : ret;
 }
 EXPORT_SYMBOL_GPL(usb_get_maximum_speed);
 
+enum usb_ssp_rate usb_get_maximum_ssp_rate(struct device *dev)
+{
+	const char *maximum_speed;
+	int ret;
+
+	ret = device_property_read_string(dev, "maximum-speed", &maximum_speed);
+	if (ret < 0)
+		return USB_SSP_GEN_UNKNOWN;
+
+	ret = match_string(ssp_rate, ARRAY_SIZE(ssp_rate), maximum_speed);
+	return (ret < 0) ? USB_SSP_GEN_UNKNOWN : ret;
+}
+EXPORT_SYMBOL_GPL(usb_get_maximum_ssp_rate);
+
 const char *usb_state_string(enum usb_device_state state)
 {
 	static const char *const names[] = {
diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h
index 86c50907634e..abdd310c77f0 100644
--- a/include/linux/usb/ch9.h
+++ b/include/linux/usb/ch9.h
@@ -71,6 +71,17 @@ extern const char *usb_speed_string(enum usb_device_speed speed);
  */
 extern enum usb_device_speed usb_get_maximum_speed(struct device *dev);
 
+/**
+ * usb_get_maximum_ssp_rate - Get the signaling rate generation and lane count
+ *	of a SuperSpeed Plus capable device.
+ * @dev: Pointer to the given USB controller device
+ *
+ * If the string from "maximum-speed" property is super-speed-plus-genXxY where
+ * 'X' is the generation number and 'Y' is the number of lanes, then this
+ * function returns the corresponding enum usb_ssp_rate.
+ */
+extern enum usb_ssp_rate usb_get_maximum_ssp_rate(struct device *dev);
+
 /**
  * usb_state_string - Returns human readable name for the state.
  * @state: The state to return a human-readable name for. If it's not
-- 
cgit v1.2.3


From b358e2122b9d7aa99f681d4edfafd999845d16ff Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Thu, 4 Feb 2021 18:56:35 +0800
Subject: mm: page_frag: Introduce page_frag_alloc_align()

In the current implementation of page_frag_alloc(), it doesn't have
any align guarantee for the returned buffer address. But for some
hardwares they do require the DMA buffer to be aligned correctly,
so we would have to use some workarounds like below if the buffers
allocated by the page_frag_alloc() are used by these hardwares for
DMA.
    buf = page_frag_alloc(really_needed_size + align);
    buf = PTR_ALIGN(buf, align);

These codes seems ugly and would waste a lot of memories if the buffers
are used in a network driver for the TX/RX. So introduce
page_frag_alloc_align() to make sure that an aligned buffer address is
returned.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/gfp.h | 12 ++++++++++--
 mm/page_alloc.c     |  8 +++++---
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 6e479e9c48ce..80544d5c08e7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -583,8 +583,16 @@ extern void free_pages(unsigned long addr, unsigned int order);
 
 struct page_frag_cache;
 extern void __page_frag_cache_drain(struct page *page, unsigned int count);
-extern void *page_frag_alloc(struct page_frag_cache *nc,
-			     unsigned int fragsz, gfp_t gfp_mask);
+extern void *page_frag_alloc_align(struct page_frag_cache *nc,
+				   unsigned int fragsz, gfp_t gfp_mask,
+				   unsigned int align_mask);
+
+static inline void *page_frag_alloc(struct page_frag_cache *nc,
+			     unsigned int fragsz, gfp_t gfp_mask)
+{
+	return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+}
+
 extern void page_frag_free(void *addr);
 
 #define __free_page(page) __free_pages((page), 0)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 519a60d5b6f7..ef5070fed76b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5137,8 +5137,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
-void *page_frag_alloc(struct page_frag_cache *nc,
-		      unsigned int fragsz, gfp_t gfp_mask)
+void *page_frag_alloc_align(struct page_frag_cache *nc,
+		      unsigned int fragsz, gfp_t gfp_mask,
+		      unsigned int align_mask)
 {
 	unsigned int size = PAGE_SIZE;
 	struct page *page;
@@ -5190,11 +5191,12 @@ refill:
 	}
 
 	nc->pagecnt_bias--;
+	offset &= align_mask;
 	nc->offset = offset;
 
 	return nc->va + offset;
 }
-EXPORT_SYMBOL(page_frag_alloc);
+EXPORT_SYMBOL(page_frag_alloc_align);
 
 /*
  * Frees a page fragment allocated out of either a compound or order 0 page.
-- 
cgit v1.2.3


From 3f6e687dff395da43b056c18150a423bc7bf5d14 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Thu, 4 Feb 2021 18:56:36 +0800
Subject: net: Introduce {netdev,napi}_alloc_frag_align()

In the current implementation of {netdev,napi}_alloc_frag(), it doesn't
have any align guarantee for the returned buffer address, But for some
hardwares they do require the DMA buffer to be aligned correctly,
so we would have to use some workarounds like below if the buffers
allocated by the {netdev,napi}_alloc_frag() are used by these hardwares
for DMA.
    buf = napi_alloc_frag(really_needed_size + align);
    buf = PTR_ALIGN(buf, align);

These codes seems ugly and would waste a lot of memories if the buffers
are used in a network driver for the TX/RX. We have added the align
support for the page_frag functions, so add the corresponding
{netdev,napi}_frag functions.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 36 ++++++++++++++++++++++++++++++++++--
 net/core/skbuff.c      | 26 ++++++++++----------------
 2 files changed, 44 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0e42c53b8ca9..0a4e91a2f873 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2818,7 +2818,26 @@ void skb_queue_purge(struct sk_buff_head *list);
 
 unsigned int skb_rbtree_purge(struct rb_root *root);
 
-void *netdev_alloc_frag(unsigned int fragsz);
+void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);
+
+/**
+ * netdev_alloc_frag - allocate a page fragment
+ * @fragsz: fragment size
+ *
+ * Allocates a frag from a page for receive buffer.
+ * Uses GFP_ATOMIC allocations.
+ */
+static inline void *netdev_alloc_frag(unsigned int fragsz)
+{
+	return __netdev_alloc_frag_align(fragsz, ~0u);
+}
+
+static inline void *netdev_alloc_frag_align(unsigned int fragsz,
+					    unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __netdev_alloc_frag_align(fragsz, -align);
+}
 
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
 				   gfp_t gfp_mask);
@@ -2877,7 +2896,20 @@ static inline void skb_free_frag(void *addr)
 	page_frag_free(addr);
 }
 
-void *napi_alloc_frag(unsigned int fragsz);
+void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);
+
+static inline void *napi_alloc_frag(unsigned int fragsz)
+{
+	return __napi_alloc_frag_align(fragsz, ~0u);
+}
+
+static inline void *napi_alloc_frag_align(unsigned int fragsz,
+					  unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __napi_alloc_frag_align(fragsz, -align);
+}
+
 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
 				 unsigned int length, gfp_t gfp_mask);
 static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3787093239f5..d380c7b5a12d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -374,29 +374,23 @@ struct napi_alloc_cache {
 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 
-static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask,
+				unsigned int align_mask)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 
-	return page_frag_alloc(&nc->page, fragsz, gfp_mask);
+	return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask);
 }
 
-void *napi_alloc_frag(unsigned int fragsz)
+void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 {
 	fragsz = SKB_DATA_ALIGN(fragsz);
 
-	return __napi_alloc_frag(fragsz, GFP_ATOMIC);
+	return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
 }
-EXPORT_SYMBOL(napi_alloc_frag);
+EXPORT_SYMBOL(__napi_alloc_frag_align);
 
-/**
- * netdev_alloc_frag - allocate a page fragment
- * @fragsz: fragment size
- *
- * Allocates a frag from a page for receive buffer.
- * Uses GFP_ATOMIC allocations.
- */
-void *netdev_alloc_frag(unsigned int fragsz)
+void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 {
 	struct page_frag_cache *nc;
 	void *data;
@@ -404,15 +398,15 @@ void *netdev_alloc_frag(unsigned int fragsz)
 	fragsz = SKB_DATA_ALIGN(fragsz);
 	if (in_irq() || irqs_disabled()) {
 		nc = this_cpu_ptr(&netdev_alloc_cache);
-		data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
+		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
 	} else {
 		local_bh_disable();
-		data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
+		data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
 		local_bh_enable();
 	}
 	return data;
 }
-EXPORT_SYMBOL(netdev_alloc_frag);
+EXPORT_SYMBOL(__netdev_alloc_frag_align);
 
 /**
  *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
-- 
cgit v1.2.3


From 167790abb90fa073d8341ee0e408ccad3d2109cd Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Fri, 22 Jan 2021 15:06:29 +0800
Subject: soundwire: export sdw_write/read_no_pm functions

sdw_write_no_pm and sdw_read_no_pm are useful when we want to do IO
without touching PM.

Fixes: 0231453bc08f ('soundwire: bus: add clock stop helpers')
Fixes: 60ee9be25571 ('soundwire: bus: add PM/no-PM versions of read/write functions')
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20210122070634.12825-5-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/bus.c       | 7 ++++---
 include/linux/soundwire/sdw.h | 2 ++
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index 675d028e923d..45e9fc9f472a 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -407,10 +407,11 @@ sdw_nwrite_no_pm(struct sdw_slave *slave, u32 addr, size_t count, u8 *val)
 	return sdw_transfer(slave->bus, &msg);
 }
 
-static int sdw_write_no_pm(struct sdw_slave *slave, u32 addr, u8 value)
+int sdw_write_no_pm(struct sdw_slave *slave, u32 addr, u8 value)
 {
 	return sdw_nwrite_no_pm(slave, addr, 1, &value);
 }
+EXPORT_SYMBOL(sdw_write_no_pm);
 
 static int
 sdw_bread_no_pm(struct sdw_bus *bus, u16 dev_num, u32 addr)
@@ -478,8 +479,7 @@ int sdw_bwrite_no_pm_unlocked(struct sdw_bus *bus, u16 dev_num, u32 addr, u8 val
 }
 EXPORT_SYMBOL(sdw_bwrite_no_pm_unlocked);
 
-static int
-sdw_read_no_pm(struct sdw_slave *slave, u32 addr)
+int sdw_read_no_pm(struct sdw_slave *slave, u32 addr)
 {
 	u8 buf;
 	int ret;
@@ -490,6 +490,7 @@ sdw_read_no_pm(struct sdw_slave *slave, u32 addr)
 	else
 		return buf;
 }
+EXPORT_SYMBOL(sdw_read_no_pm);
 
 static int sdw_update_no_pm(struct sdw_slave *slave, u32 addr, u8 mask, u8 val)
 {
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index f0b01b728640..d08039d65825 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -1005,6 +1005,8 @@ int sdw_bus_exit_clk_stop(struct sdw_bus *bus);
 
 int sdw_read(struct sdw_slave *slave, u32 addr);
 int sdw_write(struct sdw_slave *slave, u32 addr, u8 value);
+int sdw_write_no_pm(struct sdw_slave *slave, u32 addr, u8 value);
+int sdw_read_no_pm(struct sdw_slave *slave, u32 addr);
 int sdw_nread(struct sdw_slave *slave, u32 addr, size_t count, u8 *val);
 int sdw_nwrite(struct sdw_slave *slave, u32 addr, size_t count, u8 *val);
 
-- 
cgit v1.2.3


From e17fe6579de023725ec22a16965e9099e4a05ac9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 15 Jan 2021 10:18:16 -0800
Subject: fs-verity: add FS_IOC_READ_VERITY_METADATA ioctl

Add an ioctl FS_IOC_READ_VERITY_METADATA which will allow reading verity
metadata from a file that has fs-verity enabled, including:

- The Merkle tree
- The fsverity_descriptor (not including the signature if present)
- The built-in signature, if present

This ioctl has similar semantics to pread().  It is passed the type of
metadata to read (one of the above three), and a buffer, offset, and
size.  It returns the number of bytes read or an error.

Separate patches will add support for each of the above metadata types.
This patch just adds the ioctl itself.

This ioctl doesn't make any assumption about where the metadata is
stored on-disk.  It does assume the metadata is in a stable format, but
that's basically already the case:

- The Merkle tree and fsverity_descriptor are defined by how fs-verity
  file digests are computed; see the "File digest computation" section
  of Documentation/filesystems/fsverity.rst.  Technically, the way in
  which the levels of the tree are ordered relative to each other wasn't
  previously specified, but it's logical to put the root level first.

- The built-in signature is the value passed to FS_IOC_ENABLE_VERITY.

This ioctl is useful because it allows writing a server program that
takes a verity file and serves it to a client program, such that the
client can do its own fs-verity compatible verification of the file.
This only makes sense if the client doesn't trust the server and if the
server needs to provide the storage for the client.

More concretely, there is interest in using this ability in Android to
export APK files (which are protected by fs-verity) to "protected VMs".
This would use Protected KVM (https://lwn.net/Articles/836693), which
provides an isolated execution environment without having to trust the
traditional "host".  A "guest" VM can boot from a signed image and
perform specific tasks in a minimum trusted environment using files that
have fs-verity enabled on the host, without trusting the host or
requiring that the guest has its own trusted storage.

Technically, it would be possible to duplicate the metadata and store it
in separate files for serving.  However, that would be less efficient
and would require extra care in userspace to maintain file consistency.

In addition to the above, the ability to read the built-in signatures is
useful because it allows a system that is using the in-kernel signature
verification to migrate to userspace signature verification.

Link: https://lore.kernel.org/r/20210115181819.34732-4-ebiggers@kernel.org
Reviewed-by: Victor Hsieh <victorhsieh@google.com>
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 Documentation/filesystems/fsverity.rst | 57 ++++++++++++++++++++++++++++++++++
 fs/ext4/ioctl.c                        |  7 +++++
 fs/f2fs/file.c                         | 11 +++++++
 fs/verity/Makefile                     |  1 +
 fs/verity/read_metadata.c              | 55 ++++++++++++++++++++++++++++++++
 include/linux/fsverity.h               | 12 +++++++
 include/uapi/linux/fsverity.h          | 10 ++++++
 7 files changed, 153 insertions(+)
 create mode 100644 fs/verity/read_metadata.c

(limited to 'include/linux')

diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst
index e0204a23e997..9ef7a7de6008 100644
--- a/Documentation/filesystems/fsverity.rst
+++ b/Documentation/filesystems/fsverity.rst
@@ -217,6 +217,63 @@ FS_IOC_MEASURE_VERITY can fail with the following errors:
 - ``EOVERFLOW``: the digest is longer than the specified
   ``digest_size`` bytes.  Try providing a larger buffer.
 
+FS_IOC_READ_VERITY_METADATA
+---------------------------
+
+The FS_IOC_READ_VERITY_METADATA ioctl reads verity metadata from a
+verity file.  This ioctl is available since Linux v5.12.
+
+This ioctl allows writing a server program that takes a verity file
+and serves it to a client program, such that the client can do its own
+fs-verity compatible verification of the file.  This only makes sense
+if the client doesn't trust the server and if the server needs to
+provide the storage for the client.
+
+This is a fairly specialized use case, and most fs-verity users won't
+need this ioctl.
+
+This ioctl takes in a pointer to the following structure::
+
+   struct fsverity_read_metadata_arg {
+           __u64 metadata_type;
+           __u64 offset;
+           __u64 length;
+           __u64 buf_ptr;
+           __u64 __reserved;
+   };
+
+``metadata_type`` specifies the type of metadata to read.
+
+The semantics are similar to those of ``pread()``.  ``offset``
+specifies the offset in bytes into the metadata item to read from, and
+``length`` specifies the maximum number of bytes to read from the
+metadata item.  ``buf_ptr`` is the pointer to the buffer to read into,
+cast to a 64-bit integer.  ``__reserved`` must be 0.  On success, the
+number of bytes read is returned.  0 is returned at the end of the
+metadata item.  The returned length may be less than ``length``, for
+example if the ioctl is interrupted.
+
+The metadata returned by FS_IOC_READ_VERITY_METADATA isn't guaranteed
+to be authenticated against the file digest that would be returned by
+`FS_IOC_MEASURE_VERITY`_, as the metadata is expected to be used to
+implement fs-verity compatible verification anyway (though absent a
+malicious disk, the metadata will indeed match).  E.g. to implement
+this ioctl, the filesystem is allowed to just read the Merkle tree
+blocks from disk without actually verifying the path to the root node.
+
+FS_IOC_READ_VERITY_METADATA can fail with the following errors:
+
+- ``EFAULT``: the caller provided inaccessible memory
+- ``EINTR``: the ioctl was interrupted before any data was read
+- ``EINVAL``: reserved fields were set, or ``offset + length``
+  overflowed
+- ``ENODATA``: the file is not a verity file
+- ``ENOTTY``: this type of filesystem does not implement fs-verity, or
+  this ioctl is not yet implemented on it
+- ``EOPNOTSUPP``: the kernel was not configured with fs-verity
+  support, or the filesystem superblock has not had the 'verity'
+  feature enabled on it.  (See `Filesystem support`_.)
+
 FS_IOC_GETFLAGS
 ---------------
 
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d9665d2f82db..713b1ae44c1a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1309,6 +1309,12 @@ out:
 			return -EOPNOTSUPP;
 		return fsverity_ioctl_measure(filp, (void __user *)arg);
 
+	case FS_IOC_READ_VERITY_METADATA:
+		if (!ext4_has_feature_verity(sb))
+			return -EOPNOTSUPP;
+		return fsverity_ioctl_read_metadata(filp,
+						    (const void __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -1391,6 +1397,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case FS_IOC_GETFSMAP:
 	case FS_IOC_ENABLE_VERITY:
 	case FS_IOC_MEASURE_VERITY:
+	case FS_IOC_READ_VERITY_METADATA:
 	case EXT4_IOC_CLEAR_ES_CACHE:
 	case EXT4_IOC_GETSTATE:
 	case EXT4_IOC_GET_ES_CACHE:
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f585545277d7..d0aefb5b97fa 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3357,6 +3357,14 @@ static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg)
 	return fsverity_ioctl_measure(filp, (void __user *)arg);
 }
 
+static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg)
+{
+	if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp))))
+		return -EOPNOTSUPP;
+
+	return fsverity_ioctl_read_metadata(filp, (const void __user *)arg);
+}
+
 static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -4272,6 +4280,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_enable_verity(filp, arg);
 	case FS_IOC_MEASURE_VERITY:
 		return f2fs_ioc_measure_verity(filp, arg);
+	case FS_IOC_READ_VERITY_METADATA:
+		return f2fs_ioc_read_verity_metadata(filp, arg);
 	case FS_IOC_GETFSLABEL:
 		return f2fs_ioc_getfslabel(filp, arg);
 	case FS_IOC_SETFSLABEL:
@@ -4523,6 +4533,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case F2FS_IOC_RESIZE_FS:
 	case FS_IOC_ENABLE_VERITY:
 	case FS_IOC_MEASURE_VERITY:
+	case FS_IOC_READ_VERITY_METADATA:
 	case FS_IOC_GETFSLABEL:
 	case FS_IOC_SETFSLABEL:
 	case F2FS_IOC_GET_COMPRESS_BLOCKS:
diff --git a/fs/verity/Makefile b/fs/verity/Makefile
index 570e9136334d..435559a4fa9e 100644
--- a/fs/verity/Makefile
+++ b/fs/verity/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_FS_VERITY) += enable.o \
 			   init.o \
 			   measure.o \
 			   open.o \
+			   read_metadata.o \
 			   verify.o
 
 obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
new file mode 100644
index 000000000000..43be990fd53e
--- /dev/null
+++ b/fs/verity/read_metadata.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Ioctl to read verity metadata
+ *
+ * Copyright 2021 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/uaccess.h>
+
+/**
+ * fsverity_ioctl_read_metadata() - read verity metadata from a file
+ * @filp: file to read the metadata from
+ * @uarg: user pointer to fsverity_read_metadata_arg
+ *
+ * Return: length read on success, 0 on EOF, -errno on failure
+ */
+int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg)
+{
+	struct inode *inode = file_inode(filp);
+	const struct fsverity_info *vi;
+	struct fsverity_read_metadata_arg arg;
+	int length;
+	void __user *buf;
+
+	vi = fsverity_get_info(inode);
+	if (!vi)
+		return -ENODATA; /* not a verity file */
+	/*
+	 * Note that we don't have to explicitly check that the file is open for
+	 * reading, since verity files can only be opened for reading.
+	 */
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.__reserved)
+		return -EINVAL;
+
+	/* offset + length must not overflow. */
+	if (arg.offset + arg.length < arg.offset)
+		return -EINVAL;
+
+	/* Ensure that the return value will fit in INT_MAX. */
+	length = min_t(u64, arg.length, INT_MAX);
+
+	buf = u64_to_user_ptr(arg.buf_ptr);
+
+	switch (arg.metadata_type) {
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL_GPL(fsverity_ioctl_read_metadata);
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index c1144a450392..b568b3c7d095 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -138,6 +138,10 @@ int fsverity_file_open(struct inode *inode, struct file *filp);
 int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr);
 void fsverity_cleanup_inode(struct inode *inode);
 
+/* read_metadata.c */
+
+int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg);
+
 /* verify.c */
 
 bool fsverity_verify_page(struct page *page);
@@ -183,6 +187,14 @@ static inline void fsverity_cleanup_inode(struct inode *inode)
 {
 }
 
+/* read_metadata.c */
+
+static inline int fsverity_ioctl_read_metadata(struct file *filp,
+					       const void __user *uarg)
+{
+	return -EOPNOTSUPP;
+}
+
 /* verify.c */
 
 static inline bool fsverity_verify_page(struct page *page)
diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h
index 33f44156f8ea..e062751294d0 100644
--- a/include/uapi/linux/fsverity.h
+++ b/include/uapi/linux/fsverity.h
@@ -83,7 +83,17 @@ struct fsverity_formatted_digest {
 	__u8 digest[];
 };
 
+struct fsverity_read_metadata_arg {
+	__u64 metadata_type;
+	__u64 offset;
+	__u64 length;
+	__u64 buf_ptr;
+	__u64 __reserved;
+};
+
 #define FS_IOC_ENABLE_VERITY	_IOW('f', 133, struct fsverity_enable_arg)
 #define FS_IOC_MEASURE_VERITY	_IOWR('f', 134, struct fsverity_digest)
+#define FS_IOC_READ_VERITY_METADATA \
+	_IOWR('f', 135, struct fsverity_read_metadata_arg)
 
 #endif /* _UAPI_LINUX_FSVERITY_H */
-- 
cgit v1.2.3


From a006050575745ca2be25118b90f1c37f454ac542 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:25 +0100
Subject: module: use RCU to synchronize find_module

Allow for a RCU-sched critical section around find_module, following
the lower level find_module_all helper, and switch the two callers
outside of module.c to use such a RCU-sched critical section instead
of module_mutex.

Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h      | 2 +-
 kernel/livepatch/core.c     | 5 +++--
 kernel/module.c             | 1 -
 kernel/trace/trace_kprobe.c | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 7a0bcb5b1ffc..a64aa84d1b18 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -586,7 +586,7 @@ static inline bool within_module(unsigned long addr, const struct module *mod)
 	return within_module_init(addr, mod) || within_module_core(addr, mod);
 }
 
-/* Search for module by name: must hold module_mutex. */
+/* Search for module by name: must be in a RCU-sched critical section. */
 struct module *find_module(const char *name);
 
 struct symsearch {
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index f76fdb925532..262cd9b003b9 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -19,6 +19,7 @@
 #include <linux/moduleloader.h>
 #include <linux/completion.h>
 #include <linux/memory.h>
+#include <linux/rcupdate.h>
 #include <asm/cacheflush.h>
 #include "core.h"
 #include "patch.h"
@@ -57,7 +58,7 @@ static void klp_find_object_module(struct klp_object *obj)
 	if (!klp_is_module(obj))
 		return;
 
-	mutex_lock(&module_mutex);
+	rcu_read_lock_sched();
 	/*
 	 * We do not want to block removal of patched modules and therefore
 	 * we do not take a reference here. The patches are removed by
@@ -74,7 +75,7 @@ static void klp_find_object_module(struct klp_object *obj)
 	if (mod && mod->klp_alive)
 		obj->mod = mod;
 
-	mutex_unlock(&module_mutex);
+	rcu_read_unlock_sched();
 }
 
 static bool klp_initialized(void)
diff --git a/kernel/module.c b/kernel/module.c
index 8fb16e704b89..63cc03393a07 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -668,7 +668,6 @@ static struct module *find_module_all(const char *name, size_t len,
 
 struct module *find_module(const char *name)
 {
-	module_assert_mutex();
 	return find_module_all(name, strlen(name), false);
 }
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index e6fba1798771..3137992baa5e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -124,9 +124,9 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
 	if (!p)
 		return true;
 	*p = '\0';
-	mutex_lock(&module_mutex);
+	rcu_read_lock_sched();
 	ret = !!find_module(tk->symbol);
-	mutex_unlock(&module_mutex);
+	rcu_read_unlock_sched();
 	*p = ':';
 
 	return ret;
-- 
cgit v1.2.3


From 4331667fa14e6643859d0498b34281185eb8018b Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Fri, 5 Feb 2021 14:56:39 +0800
Subject: ssb: Use true and false for bool variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the following coccicheck warnings:

./include/linux/ssb/ssb_driver_gige.h:89:8-9: WARNING: return of 0/1 in
function 'ssb_gige_one_dma_at_once' with return type bool.

./include/linux/ssb/ssb_driver_gige.h:79:8-9: WARNING: return of 0/1 in
function 'ssb_gige_have_roboswitch' with return type bool.

./include/linux/ssb/ssb_driver_gige.h:182:8-9: WARNING: return of 0/1 in
function 'ssb_gige_must_flush_posted_writes' with return type bool.

./include/linux/ssb/ssb_driver_gige.h:178:8-9: WARNING: return of 0/1 in
function 'ssb_gige_one_dma_at_once' with return type bool.

./include/linux/ssb/ssb_driver_gige.h:174:8-9: WARNING: return of 0/1 in
function 'ssb_gige_have_roboswitch' with return type bool.

./include/linux/ssb/ssb_driver_gige.h:170:8-9: WARNING: return of 0/1 in
function 'ssb_gige_is_rgmii' with return type bool.

./include/linux/ssb/ssb_driver_gige.h:162:8-9: WARNING: return of 0/1 in
function 'pdev_is_ssb_gige_core' with return type bool.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Acked-by: Michael Büsch <m@bues.ch>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/1612508199-92282-1-git-send-email-jiapeng.chong@linux.alibaba.com
---
 include/linux/ssb/ssb_driver_gige.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ssb/ssb_driver_gige.h b/include/linux/ssb/ssb_driver_gige.h
index 31593b34608e..15ba0df1ee0d 100644
--- a/include/linux/ssb/ssb_driver_gige.h
+++ b/include/linux/ssb/ssb_driver_gige.h
@@ -76,7 +76,7 @@ static inline bool ssb_gige_have_roboswitch(struct pci_dev *pdev)
 	if (dev)
 		return !!(dev->dev->bus->sprom.boardflags_lo &
 			  SSB_GIGE_BFL_ROBOSWITCH);
-	return 0;
+	return false;
 }
 
 /* Returns whether we can only do one DMA at once. */
@@ -86,7 +86,7 @@ static inline bool ssb_gige_one_dma_at_once(struct pci_dev *pdev)
 	if (dev)
 		return ((dev->dev->bus->chip_id == 0x4785) &&
 			(dev->dev->bus->chip_rev < 2));
-	return 0;
+	return false;
 }
 
 /* Returns whether we must flush posted writes. */
@@ -159,7 +159,7 @@ static inline void ssb_gige_exit(void)
 
 static inline bool pdev_is_ssb_gige_core(struct pci_dev *pdev)
 {
-	return 0;
+	return false;
 }
 static inline struct ssb_gige * pdev_to_ssb_gige(struct pci_dev *pdev)
 {
@@ -167,19 +167,19 @@ static inline struct ssb_gige * pdev_to_ssb_gige(struct pci_dev *pdev)
 }
 static inline bool ssb_gige_is_rgmii(struct pci_dev *pdev)
 {
-	return 0;
+	return false;
 }
 static inline bool ssb_gige_have_roboswitch(struct pci_dev *pdev)
 {
-	return 0;
+	return false;
 }
 static inline bool ssb_gige_one_dma_at_once(struct pci_dev *pdev)
 {
-	return 0;
+	return false;
 }
 static inline bool ssb_gige_must_flush_posted_writes(struct pci_dev *pdev)
 {
-	return 0;
+	return false;
 }
 static inline int ssb_gige_get_macaddr(struct pci_dev *pdev, u8 *macaddr)
 {
-- 
cgit v1.2.3


From 3e3552056ab42f883d7723eeb42fed712b66bacf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:27 +0100
Subject: kallsyms: only build {,module_}kallsyms_on_each_symbol when required

kallsyms_on_each_symbol and module_kallsyms_on_each_symbol are only used
by the livepatching code, so don't build them if livepatching is not
enabled.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/kallsyms.h | 17 ++++-------------
 include/linux/module.h   | 16 ++++------------
 kernel/kallsyms.c        |  2 ++
 kernel/module.c          |  2 ++
 4 files changed, 12 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 481273f0c72d..465060acc981 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -71,15 +71,14 @@ static inline void *dereference_symbol_descriptor(void *ptr)
 	return ptr;
 }
 
-#ifdef CONFIG_KALLSYMS
-/* Lookup the address for a symbol. Returns 0 if not found. */
-unsigned long kallsyms_lookup_name(const char *name);
-
-/* Call a function on each kallsyms symbol in the core kernel */
 int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
 				      unsigned long),
 			    void *data);
 
+#ifdef CONFIG_KALLSYMS
+/* Lookup the address for a symbol. Returns 0 if not found. */
+unsigned long kallsyms_lookup_name(const char *name);
+
 extern int kallsyms_lookup_size_offset(unsigned long addr,
 				  unsigned long *symbolsize,
 				  unsigned long *offset);
@@ -108,14 +107,6 @@ static inline unsigned long kallsyms_lookup_name(const char *name)
 	return 0;
 }
 
-static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *,
-						    struct module *,
-						    unsigned long),
-					  void *data)
-{
-	return 0;
-}
-
 static inline int kallsyms_lookup_size_offset(unsigned long addr,
 					      unsigned long *symbolsize,
 					      unsigned long *offset)
diff --git a/include/linux/module.h b/include/linux/module.h
index a64aa84d1b18..3ea4ffae608f 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -608,10 +608,6 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);
 
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
-					     struct module *, unsigned long),
-				   void *data);
-
 extern void __noreturn __module_put_and_exit(struct module *mod,
 			long code);
 #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code)
@@ -795,14 +791,6 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name)
 	return 0;
 }
 
-static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
-							   struct module *,
-							   unsigned long),
-						 void *data)
-{
-	return 0;
-}
-
 static inline int register_module_notifier(struct notifier_block *nb)
 {
 	/* no events will happen anyway, so this can always succeed */
@@ -891,4 +879,8 @@ static inline bool module_sig_ok(struct module *module)
 }
 #endif	/* CONFIG_MODULE_SIG */
 
+int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+					     struct module *, unsigned long),
+				   void *data);
+
 #endif /* _LINUX_MODULE_H */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index a0d3f0865916..8043a90aa50e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -177,6 +177,7 @@ unsigned long kallsyms_lookup_name(const char *name)
 	return module_kallsyms_lookup_name(name);
 }
 
+#ifdef CONFIG_LIVEPATCH
 /*
  * Iterate over all symbols in vmlinux.  For symbols from modules use
  * module_kallsyms_on_each_symbol instead.
@@ -198,6 +199,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
 	}
 	return 0;
 }
+#endif /* CONFIG_LIVEPATCH */
 
 static unsigned long get_symbol_pos(unsigned long addr,
 				    unsigned long *symbolsize,
diff --git a/kernel/module.c b/kernel/module.c
index 481eb348564e..06155c454779 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4488,6 +4488,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
 	return ret;
 }
 
+#ifdef CONFIG_LIVEPATCH
 int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 					     struct module *, unsigned long),
 				   void *data)
@@ -4518,6 +4519,7 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 	mutex_unlock(&module_mutex);
 	return ret;
 }
+#endif /* CONFIG_LIVEPATCH */
 #endif /* CONFIG_KALLSYMS */
 
 /* Maximum number of characters written by module_flags() */
-- 
cgit v1.2.3


From 922f2a7c822bf76dffb218331bd95b1eea3cf637 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:28 +0100
Subject: module: mark module_mutex static

Except for two lockdep asserts module_mutex is only used in module.c.
Remove the two asserts given that the functions they are in are not
exported and just called from the module code, and mark module_mutex
static.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h | 2 --
 kernel/module.c        | 2 +-
 lib/bug.c              | 3 ---
 3 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 3ea4ffae608f..0f360c48fe92 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -550,8 +550,6 @@ static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym)
 }
 #endif
 
-extern struct mutex module_mutex;
-
 /* FIXME: It'd be nice to isolate modules during init, too, so they
    aren't used before they (may) fail.  But presently too much code
    (IDE & SCSI) require entry into the module during init.*/
diff --git a/kernel/module.c b/kernel/module.c
index 06155c454779..9befd793997e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -87,7 +87,7 @@
  * 3) module_addr_min/module_addr_max.
  * (delete and add uses RCU list operations).
  */
-DEFINE_MUTEX(module_mutex);
+static DEFINE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
 /* Work queue for freeing init sections in success case */
diff --git a/lib/bug.c b/lib/bug.c
index 7103440c0ee1..8f9d537bfb2a 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -91,8 +91,6 @@ void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
 	char *secstrings;
 	unsigned int i;
 
-	lockdep_assert_held(&module_mutex);
-
 	mod->bug_table = NULL;
 	mod->num_bugs = 0;
 
@@ -118,7 +116,6 @@ void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
 
 void module_bug_cleanup(struct module *mod)
 {
-	lockdep_assert_held(&module_mutex);
 	list_del_rcu(&mod->bug_list);
 }
 
-- 
cgit v1.2.3


From 00cc2c1cd34f81f777085cb2d65267edcd403fd0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:32 +0100
Subject: module: move struct symsearch to module.c

struct symsearch is only used inside of module.h, so move the definition
out of module.h.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 include/linux/module.h | 11 -----------
 kernel/module.c        | 11 +++++++++++
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 0f360c48fe92..da0f5966ee80 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -587,17 +587,6 @@ static inline bool within_module(unsigned long addr, const struct module *mod)
 /* Search for module by name: must be in a RCU-sched critical section. */
 struct module *find_module(const char *name);
 
-struct symsearch {
-	const struct kernel_symbol *start, *stop;
-	const s32 *crcs;
-	enum mod_license {
-		NOT_GPL_ONLY,
-		GPL_ONLY,
-		WILL_BE_GPL_ONLY,
-	} license;
-	bool unused;
-};
-
 /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
    symnum out of range. */
 int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
diff --git a/kernel/module.c b/kernel/module.c
index 2e7ddf544338..9cd6814405b9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -428,6 +428,17 @@ extern const s32 __start___kcrctab_unused_gpl[];
 #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
 #endif
 
+struct symsearch {
+	const struct kernel_symbol *start, *stop;
+	const s32 *crcs;
+	enum mod_license {
+		NOT_GPL_ONLY,
+		GPL_ONLY,
+		WILL_BE_GPL_ONLY,
+	} license;
+	bool unused;
+};
+
 struct find_symbol_arg {
 	/* Input */
 	const char *name;
-- 
cgit v1.2.3


From f1c3d73e973cfad85ff5d3d86086503e742d8c62 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:33 +0100
Subject: module: remove EXPORT_SYMBOL_GPL_FUTURE

As far as I can tell this has never been used at all, and certainly
not any time recently.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 arch/x86/tools/relocs.c           |  4 ++--
 include/asm-generic/vmlinux.lds.h | 14 --------------
 include/linux/export.h            |  1 -
 include/linux/module.h            |  5 -----
 kernel/module.c                   | 29 ++---------------------------
 scripts/mod/modpost.c             | 13 +------------
 scripts/mod/modpost.h             |  1 -
 scripts/module.lds.S              |  2 --
 tools/include/linux/export.h      |  1 -
 9 files changed, 5 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index ce7188cbdae5..0d210d0e83e2 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -61,8 +61,8 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
 	"(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
 	"__(start|end)_pci_.*|"
 	"__(start|end)_builtin_fw|"
-	"__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
-	"__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
+	"__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl)|"
+	"__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl)|"
 	"__(start|stop)___param|"
 	"__(start|stop)___modver|"
 	"__(start|stop)___bug_table|"
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b2b3d81b1535..83243506e68b 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -495,13 +495,6 @@
 		__stop___ksymtab_unused_gpl = .;			\
 	}								\
 									\
-	/* Kernel symbol table: GPL-future-only symbols */		\
-	__ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - LOAD_OFFSET) { \
-		__start___ksymtab_gpl_future = .;			\
-		KEEP(*(SORT(___ksymtab_gpl_future+*)))			\
-		__stop___ksymtab_gpl_future = .;			\
-	}								\
-									\
 	/* Kernel symbol table: Normal symbols */			\
 	__kcrctab         : AT(ADDR(__kcrctab) - LOAD_OFFSET) {		\
 		__start___kcrctab = .;					\
@@ -530,13 +523,6 @@
 		__stop___kcrctab_unused_gpl = .;			\
 	}								\
 									\
-	/* Kernel symbol table: GPL-future-only symbols */		\
-	__kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - LOAD_OFFSET) { \
-		__start___kcrctab_gpl_future = .;			\
-		KEEP(*(SORT(___kcrctab_gpl_future+*)))			\
-		__stop___kcrctab_gpl_future = .;			\
-	}								\
-									\
 	/* Kernel symbol table: strings */				\
         __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) {	\
 		*(__ksymtab_strings)					\
diff --git a/include/linux/export.h b/include/linux/export.h
index fceb5e855717..362b64f8d4a7 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -157,7 +157,6 @@ struct kernel_symbol {
 
 #define EXPORT_SYMBOL(sym)		_EXPORT_SYMBOL(sym, "")
 #define EXPORT_SYMBOL_GPL(sym)		_EXPORT_SYMBOL(sym, "_gpl")
-#define EXPORT_SYMBOL_GPL_FUTURE(sym)	_EXPORT_SYMBOL(sym, "_gpl_future")
 #define EXPORT_SYMBOL_NS(sym, ns)	__EXPORT_SYMBOL(sym, "", #ns)
 #define EXPORT_SYMBOL_NS_GPL(sym, ns)	__EXPORT_SYMBOL(sym, "_gpl", #ns)
 
diff --git a/include/linux/module.h b/include/linux/module.h
index da0f5966ee80..e6e50ee30412 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -411,11 +411,6 @@ struct module {
 
 	bool async_probe_requested;
 
-	/* symbols that will be GPL-only in the near future. */
-	const struct kernel_symbol *gpl_future_syms;
-	const s32 *gpl_future_crcs;
-	unsigned int num_gpl_future_syms;
-
 	/* Exception table */
 	unsigned int num_exentries;
 	struct exception_table_entry *extable;
diff --git a/kernel/module.c b/kernel/module.c
index 9cd6814405b9..95186c9d81ea 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -408,11 +408,8 @@ extern const struct kernel_symbol __start___ksymtab[];
 extern const struct kernel_symbol __stop___ksymtab[];
 extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const struct kernel_symbol __start___ksymtab_gpl_future[];
-extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
 extern const s32 __start___kcrctab[];
 extern const s32 __start___kcrctab_gpl[];
-extern const s32 __start___kcrctab_gpl_future[];
 #ifdef CONFIG_UNUSED_SYMBOLS
 extern const struct kernel_symbol __start___ksymtab_unused[];
 extern const struct kernel_symbol __stop___ksymtab_unused[];
@@ -434,7 +431,6 @@ struct symsearch {
 	enum mod_license {
 		NOT_GPL_ONLY,
 		GPL_ONLY,
-		WILL_BE_GPL_ONLY,
 	} license;
 	bool unused;
 };
@@ -458,15 +454,8 @@ static bool check_exported_symbol(const struct symsearch *syms,
 {
 	struct find_symbol_arg *fsa = data;
 
-	if (!fsa->gplok) {
-		if (syms->license == GPL_ONLY)
-			return false;
-		if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) {
-			pr_warn("Symbol %s is being used by a non-GPL module, "
-				"which will not be allowed in the future\n",
-				fsa->name);
-		}
-	}
+	if (!fsa->gplok && syms->license == GPL_ONLY)
+		return false;
 
 #ifdef CONFIG_UNUSED_SYMBOLS
 	if (syms->unused && fsa->warn) {
@@ -550,9 +539,6 @@ static bool find_symbol(struct find_symbol_arg *fsa)
 		{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
 		  __start___kcrctab_gpl,
 		  GPL_ONLY, false },
-		{ __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
-		  __start___kcrctab_gpl_future,
-		  WILL_BE_GPL_ONLY, false },
 #ifdef CONFIG_UNUSED_SYMBOLS
 		{ __start___ksymtab_unused, __stop___ksymtab_unused,
 		  __start___kcrctab_unused,
@@ -579,10 +565,6 @@ static bool find_symbol(struct find_symbol_arg *fsa)
 			{ mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
 			  mod->gpl_crcs,
 			  GPL_ONLY, false },
-			{ mod->gpl_future_syms,
-			  mod->gpl_future_syms + mod->num_gpl_future_syms,
-			  mod->gpl_future_crcs,
-			  WILL_BE_GPL_ONLY, false },
 #ifdef CONFIG_UNUSED_SYMBOLS
 			{ mod->unused_syms,
 			  mod->unused_syms + mod->num_unused_syms,
@@ -2292,7 +2274,6 @@ static int verify_exported_symbols(struct module *mod)
 	} arr[] = {
 		{ mod->syms, mod->num_syms },
 		{ mod->gpl_syms, mod->num_gpl_syms },
-		{ mod->gpl_future_syms, mod->num_gpl_future_syms },
 #ifdef CONFIG_UNUSED_SYMBOLS
 		{ mod->unused_syms, mod->num_unused_syms },
 		{ mod->unused_gpl_syms, mod->num_unused_gpl_syms },
@@ -3308,11 +3289,6 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 				     sizeof(*mod->gpl_syms),
 				     &mod->num_gpl_syms);
 	mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
-	mod->gpl_future_syms = section_objs(info,
-					    "__ksymtab_gpl_future",
-					    sizeof(*mod->gpl_future_syms),
-					    &mod->num_gpl_future_syms);
-	mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
 
 #ifdef CONFIG_UNUSED_SYMBOLS
 	mod->unused_syms = section_objs(info, "__ksymtab_unused",
@@ -3506,7 +3482,6 @@ static int check_module_license_and_versions(struct module *mod)
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
-	    || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
 #ifdef CONFIG_UNUSED_SYMBOLS
 	    || (mod->num_unused_syms && !mod->unused_crcs)
 	    || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index d6c81657d695..25c1446055d1 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -44,7 +44,7 @@ static bool error_occurred;
 
 enum export {
 	export_plain,      export_unused,     export_gpl,
-	export_unused_gpl, export_gpl_future, export_unknown
+	export_unused_gpl, export_unknown
 };
 
 /* In kernel, this size is defined in linux/module.h;
@@ -304,7 +304,6 @@ static const struct {
 	{ .str = "EXPORT_UNUSED_SYMBOL",     .export = export_unused },
 	{ .str = "EXPORT_SYMBOL_GPL",        .export = export_gpl },
 	{ .str = "EXPORT_UNUSED_SYMBOL_GPL", .export = export_unused_gpl },
-	{ .str = "EXPORT_SYMBOL_GPL_FUTURE", .export = export_gpl_future },
 	{ .str = "(unknown)",                .export = export_unknown },
 };
 
@@ -369,8 +368,6 @@ static enum export export_from_secname(struct elf_info *elf, unsigned int sec)
 		return export_gpl;
 	else if (strstarts(secname, "___ksymtab_unused_gpl+"))
 		return export_unused_gpl;
-	else if (strstarts(secname, "___ksymtab_gpl_future+"))
-		return export_gpl_future;
 	else
 		return export_unknown;
 }
@@ -385,8 +382,6 @@ static enum export export_from_sec(struct elf_info *elf, unsigned int sec)
 		return export_gpl;
 	else if (sec == elf->export_unused_gpl_sec)
 		return export_unused_gpl;
-	else if (sec == elf->export_gpl_future_sec)
-		return export_gpl_future;
 	else
 		return export_unknown;
 }
@@ -596,8 +591,6 @@ static int parse_elf(struct elf_info *info, const char *filename)
 			info->export_gpl_sec = i;
 		else if (strcmp(secname, "__ksymtab_unused_gpl") == 0)
 			info->export_unused_gpl_sec = i;
-		else if (strcmp(secname, "__ksymtab_gpl_future") == 0)
-			info->export_gpl_future_sec = i;
 
 		if (sechdrs[i].sh_type == SHT_SYMTAB) {
 			unsigned int sh_link_idx;
@@ -2152,10 +2145,6 @@ static void check_for_gpl_usage(enum export exp, const char *m, const char *s)
 		error("GPL-incompatible module %s.ko uses GPL-only symbol marked UNUSED '%s'\n",
 		      m, s);
 		break;
-	case export_gpl_future:
-		warn("GPL-incompatible module %s.ko uses future GPL-only symbol '%s'\n",
-		     m, s);
-		break;
 	case export_plain:
 	case export_unused:
 	case export_unknown:
diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h
index e6f46eee0af0..834220de002b 100644
--- a/scripts/mod/modpost.h
+++ b/scripts/mod/modpost.h
@@ -142,7 +142,6 @@ struct elf_info {
 	Elf_Section  export_unused_sec;
 	Elf_Section  export_gpl_sec;
 	Elf_Section  export_unused_gpl_sec;
-	Elf_Section  export_gpl_future_sec;
 	char         *strtab;
 	char	     *modinfo;
 	unsigned int modinfo_len;
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index 69b9b71a6a47..d82b452e8a71 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -13,12 +13,10 @@ SECTIONS {
 	__ksymtab_gpl		0 : { *(SORT(___ksymtab_gpl+*)) }
 	__ksymtab_unused	0 : { *(SORT(___ksymtab_unused+*)) }
 	__ksymtab_unused_gpl	0 : { *(SORT(___ksymtab_unused_gpl+*)) }
-	__ksymtab_gpl_future	0 : { *(SORT(___ksymtab_gpl_future+*)) }
 	__kcrctab		0 : { *(SORT(___kcrctab+*)) }
 	__kcrctab_gpl		0 : { *(SORT(___kcrctab_gpl+*)) }
 	__kcrctab_unused	0 : { *(SORT(___kcrctab_unused+*)) }
 	__kcrctab_unused_gpl	0 : { *(SORT(___kcrctab_unused_gpl+*)) }
-	__kcrctab_gpl_future	0 : { *(SORT(___kcrctab_gpl_future+*)) }
 
 	.init_array		0 : ALIGN(8) { *(SORT(.init_array.*)) *(.init_array) }
 
diff --git a/tools/include/linux/export.h b/tools/include/linux/export.h
index d07e586b9ba0..9f61349a8944 100644
--- a/tools/include/linux/export.h
+++ b/tools/include/linux/export.h
@@ -3,7 +3,6 @@
 
 #define EXPORT_SYMBOL(sym)
 #define EXPORT_SYMBOL_GPL(sym)
-#define EXPORT_SYMBOL_GPL_FUTURE(sym)
 #define EXPORT_UNUSED_SYMBOL(sym)
 #define EXPORT_UNUSED_SYMBOL_GPL(sym)
 
-- 
cgit v1.2.3


From 367948220fcefcad1bf0d3d595a06efe0694acae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 13:13:34 +0100
Subject: module: remove EXPORT_UNUSED_SYMBOL*

EXPORT_UNUSED_SYMBOL* is not actually used anywhere.  Remove the
unused functionality as we generally just remove unused code anyway.

Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 arch/arm/configs/bcm2835_defconfig          |  1 -
 arch/arm/configs/mxs_defconfig              |  1 -
 arch/mips/configs/nlm_xlp_defconfig         |  1 -
 arch/mips/configs/nlm_xlr_defconfig         |  1 -
 arch/parisc/configs/generic-32bit_defconfig |  1 -
 arch/parisc/configs/generic-64bit_defconfig |  1 -
 arch/powerpc/configs/ppc6xx_defconfig       |  1 -
 arch/s390/configs/debug_defconfig           |  1 -
 arch/s390/configs/defconfig                 |  1 -
 arch/sh/configs/edosk7760_defconfig         |  1 -
 arch/sh/configs/sdk7780_defconfig           |  1 -
 arch/x86/configs/i386_defconfig             |  1 -
 arch/x86/configs/x86_64_defconfig           |  1 -
 arch/x86/tools/relocs.c                     |  4 +-
 include/asm-generic/vmlinux.lds.h           | 28 ------------
 include/linux/export.h                      |  8 ----
 include/linux/module.h                      | 12 -----
 init/Kconfig                                | 17 -------
 kernel/module.c                             | 71 +++--------------------------
 scripts/checkpatch.pl                       |  6 +--
 scripts/mod/modpost.c                       | 39 ++--------------
 scripts/mod/modpost.h                       |  2 -
 scripts/module.lds.S                        |  4 --
 tools/include/linux/export.h                |  2 -
 24 files changed, 13 insertions(+), 193 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig
index 44ff9cd88d81..d6c6c2e031c4 100644
--- a/arch/arm/configs/bcm2835_defconfig
+++ b/arch/arm/configs/bcm2835_defconfig
@@ -177,7 +177,6 @@ CONFIG_BOOT_PRINTK_DELAY=y
 CONFIG_DYNAMIC_DEBUG=y
 CONFIG_DEBUG_INFO=y
 # CONFIG_ENABLE_MUST_CHECK is not set
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_LOCKUP_DETECTOR=y
 CONFIG_SCHED_TRACER=y
diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig
index a9c6f32a9b1c..ca32446b187f 100644
--- a/arch/arm/configs/mxs_defconfig
+++ b/arch/arm/configs/mxs_defconfig
@@ -164,7 +164,6 @@ CONFIG_FONTS=y
 CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_INFO=y
 CONFIG_FRAME_WARN=2048
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_SOFTLOCKUP_DETECTOR=y
diff --git a/arch/mips/configs/nlm_xlp_defconfig b/arch/mips/configs/nlm_xlp_defconfig
index 72a211d2d556..32c290611723 100644
--- a/arch/mips/configs/nlm_xlp_defconfig
+++ b/arch/mips/configs/nlm_xlp_defconfig
@@ -549,7 +549,6 @@ CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_INFO=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_FRAME_WARN=1024
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_SCHEDSTATS=y
diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig
index 4ecb157e56d4..bf9b9244929e 100644
--- a/arch/mips/configs/nlm_xlr_defconfig
+++ b/arch/mips/configs/nlm_xlr_defconfig
@@ -500,7 +500,6 @@ CONFIG_CRC7=m
 CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_INFO=y
 # CONFIG_ENABLE_MUST_CHECK is not set
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_SCHEDSTATS=y
diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig
index 3cbcfad5f724..7611d48c599e 100644
--- a/arch/parisc/configs/generic-32bit_defconfig
+++ b/arch/parisc/configs/generic-32bit_defconfig
@@ -22,7 +22,6 @@ CONFIG_PCI_LBA=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-CONFIG_UNUSED_SYMBOLS=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_BINFMT_MISC=m
diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig
index 8f81fcbf04c4..53054b81461a 100644
--- a/arch/parisc/configs/generic-64bit_defconfig
+++ b/arch/parisc/configs/generic-64bit_defconfig
@@ -31,7 +31,6 @@ CONFIG_MODULE_FORCE_LOAD=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_MODVERSIONS=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_BLK_DEV_INTEGRITY=y
 CONFIG_BINFMT_MISC=m
 # CONFIG_COMPACTION is not set
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index ef09f3cce1fa..34c3859040f9 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -1072,7 +1072,6 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_DEBUG_INFO=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_HEADERS_INSTALL=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index c4f6ff98a612..58e54d17e315 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -71,7 +71,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_MODULE_SIG_SHA256=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_BLK_DEV_INTEGRITY=y
 CONFIG_BLK_DEV_THROTTLING=y
 CONFIG_BLK_WBT=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 51135893cffe..b5e62c0d3e23 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -66,7 +66,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_MODULE_SIG_SHA256=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_BLK_DEV_THROTTLING=y
 CONFIG_BLK_WBT=y
 CONFIG_BLK_CGROUP_IOLATENCY=y
diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig
index 02ba62298576..d77f54e906fd 100644
--- a/arch/sh/configs/edosk7760_defconfig
+++ b/arch/sh/configs/edosk7760_defconfig
@@ -102,7 +102,6 @@ CONFIG_NLS_UTF8=y
 CONFIG_PRINTK_TIME=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_MAGIC_SYSRQ=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig
index d10a0414123a..d53c4595fb2e 100644
--- a/arch/sh/configs/sdk7780_defconfig
+++ b/arch/sh/configs/sdk7780_defconfig
@@ -130,7 +130,6 @@ CONFIG_NLS_ISO8859_15=y
 CONFIG_NLS_UTF8=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_MAGIC_SYSRQ=y
-CONFIG_UNUSED_SYMBOLS=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
 # CONFIG_SCHED_DEBUG is not set
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 78210793d357..9c9c4a888b1d 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -50,7 +50,6 @@ CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_UNUSED_SYMBOLS is not set
 CONFIG_BINFMT_MISC=y
 CONFIG_NET=y
 CONFIG_PACKET=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9936528e1939..b60bd2d86034 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -48,7 +48,6 @@ CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_UNUSED_SYMBOLS is not set
 CONFIG_BINFMT_MISC=y
 CONFIG_NET=y
 CONFIG_PACKET=y
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 0d210d0e83e2..b9c577a3cacc 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -61,8 +61,8 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
 	"(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
 	"__(start|end)_pci_.*|"
 	"__(start|end)_builtin_fw|"
-	"__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl)|"
-	"__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl)|"
+	"__(start|stop)___ksymtab(|_gpl)|"
+	"__(start|stop)___kcrctab(|_gpl)|"
 	"__(start|stop)___param|"
 	"__(start|stop)___modver|"
 	"__(start|stop)___bug_table|"
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 83243506e68b..1fa338ac6a54 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -481,20 +481,6 @@
 		__stop___ksymtab_gpl = .;				\
 	}								\
 									\
-	/* Kernel symbol table: Normal unused symbols */		\
-	__ksymtab_unused  : AT(ADDR(__ksymtab_unused) - LOAD_OFFSET) {	\
-		__start___ksymtab_unused = .;				\
-		KEEP(*(SORT(___ksymtab_unused+*)))			\
-		__stop___ksymtab_unused = .;				\
-	}								\
-									\
-	/* Kernel symbol table: GPL-only unused symbols */		\
-	__ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - LOAD_OFFSET) { \
-		__start___ksymtab_unused_gpl = .;			\
-		KEEP(*(SORT(___ksymtab_unused_gpl+*)))			\
-		__stop___ksymtab_unused_gpl = .;			\
-	}								\
-									\
 	/* Kernel symbol table: Normal symbols */			\
 	__kcrctab         : AT(ADDR(__kcrctab) - LOAD_OFFSET) {		\
 		__start___kcrctab = .;					\
@@ -509,20 +495,6 @@
 		__stop___kcrctab_gpl = .;				\
 	}								\
 									\
-	/* Kernel symbol table: Normal unused symbols */		\
-	__kcrctab_unused  : AT(ADDR(__kcrctab_unused) - LOAD_OFFSET) {	\
-		__start___kcrctab_unused = .;				\
-		KEEP(*(SORT(___kcrctab_unused+*)))			\
-		__stop___kcrctab_unused = .;				\
-	}								\
-									\
-	/* Kernel symbol table: GPL-only unused symbols */		\
-	__kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - LOAD_OFFSET) { \
-		__start___kcrctab_unused_gpl = .;			\
-		KEEP(*(SORT(___kcrctab_unused_gpl+*)))			\
-		__stop___kcrctab_unused_gpl = .;			\
-	}								\
-									\
 	/* Kernel symbol table: strings */				\
         __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) {	\
 		*(__ksymtab_strings)					\
diff --git a/include/linux/export.h b/include/linux/export.h
index 362b64f8d4a7..6271a5d9c988 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -160,14 +160,6 @@ struct kernel_symbol {
 #define EXPORT_SYMBOL_NS(sym, ns)	__EXPORT_SYMBOL(sym, "", #ns)
 #define EXPORT_SYMBOL_NS_GPL(sym, ns)	__EXPORT_SYMBOL(sym, "_gpl", #ns)
 
-#ifdef CONFIG_UNUSED_SYMBOLS
-#define EXPORT_UNUSED_SYMBOL(sym)	_EXPORT_SYMBOL(sym, "_unused")
-#define EXPORT_UNUSED_SYMBOL_GPL(sym)	_EXPORT_SYMBOL(sym, "_unused_gpl")
-#else
-#define EXPORT_UNUSED_SYMBOL(sym)
-#define EXPORT_UNUSED_SYMBOL_GPL(sym)
-#endif
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _LINUX_EXPORT_H */
diff --git a/include/linux/module.h b/include/linux/module.h
index e6e50ee30412..59f094fa6f74 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -392,18 +392,6 @@ struct module {
 	const s32 *gpl_crcs;
 	bool using_gplonly_symbols;
 
-#ifdef CONFIG_UNUSED_SYMBOLS
-	/* unused exported symbols. */
-	const struct kernel_symbol *unused_syms;
-	const s32 *unused_crcs;
-	unsigned int num_unused_syms;
-
-	/* GPL-only, unused exported symbols. */
-	unsigned int num_unused_gpl_syms;
-	const struct kernel_symbol *unused_gpl_syms;
-	const s32 *unused_gpl_crcs;
-#endif
-
 #ifdef CONFIG_MODULE_SIG
 	/* Signature was verified. */
 	bool sig_ok;
diff --git a/init/Kconfig b/init/Kconfig
index b77c60f8b963..11b803b45c19 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2262,25 +2262,8 @@ config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
 
 	  If unsure, say N.
 
-config UNUSED_SYMBOLS
-	bool "Enable unused/obsolete exported symbols"
-	default y if X86
-	help
-	  Unused but exported symbols make the kernel needlessly bigger.  For
-	  that reason most of these unused exports will soon be removed.  This
-	  option is provided temporarily to provide a transition period in case
-	  some external kernel module needs one of these symbols anyway. If you
-	  encounter such a case in your module, consider if you are actually
-	  using the right API.  (rationale: since nobody in the kernel is using
-	  this in a module, there is a pretty good chance it's actually the
-	  wrong interface to use).  If you really need the symbol, please send a
-	  mail to the linux kernel mailing list mentioning the symbol and why
-	  you really need it, and what the merge plan to the mainline kernel for
-	  your module is.
-
 config TRIM_UNUSED_KSYMS
 	bool "Trim unused exported kernel symbols"
-	depends on !UNUSED_SYMBOLS
 	help
 	  The kernel and some modules make many symbols available for
 	  other modules to use via EXPORT_SYMBOL() and variants. Depending
diff --git a/kernel/module.c b/kernel/module.c
index 95186c9d81ea..93f360250bcb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -410,14 +410,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
 extern const s32 __start___kcrctab[];
 extern const s32 __start___kcrctab_gpl[];
-#ifdef CONFIG_UNUSED_SYMBOLS
-extern const struct kernel_symbol __start___ksymtab_unused[];
-extern const struct kernel_symbol __stop___ksymtab_unused[];
-extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
-extern const s32 __start___kcrctab_unused[];
-extern const s32 __start___kcrctab_unused_gpl[];
-#endif
 
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
@@ -432,7 +424,6 @@ struct symsearch {
 		NOT_GPL_ONLY,
 		GPL_ONLY,
 	} license;
-	bool unused;
 };
 
 struct find_symbol_arg {
@@ -456,19 +447,6 @@ static bool check_exported_symbol(const struct symsearch *syms,
 
 	if (!fsa->gplok && syms->license == GPL_ONLY)
 		return false;
-
-#ifdef CONFIG_UNUSED_SYMBOLS
-	if (syms->unused && fsa->warn) {
-		pr_warn("Symbol %s is marked as UNUSED, however this module is "
-			"using it.\n", fsa->name);
-		pr_warn("This symbol will go away in the future.\n");
-		pr_warn("Please evaluate if this is the right api to use and "
-			"if it really is, submit a report to the linux kernel "
-			"mailing list together with submitting your code for "
-			"inclusion.\n");
-	}
-#endif
-
 	fsa->owner = owner;
 	fsa->crc = symversion(syms->crcs, symnum);
 	fsa->sym = &syms->start[symnum];
@@ -535,18 +513,10 @@ static bool find_symbol(struct find_symbol_arg *fsa)
 {
 	static const struct symsearch arr[] = {
 		{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
-		  NOT_GPL_ONLY, false },
+		  NOT_GPL_ONLY },
 		{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
 		  __start___kcrctab_gpl,
-		  GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
-		{ __start___ksymtab_unused, __stop___ksymtab_unused,
-		  __start___kcrctab_unused,
-		  NOT_GPL_ONLY, true },
-		{ __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
-		  __start___kcrctab_unused_gpl,
-		  GPL_ONLY, true },
-#endif
+		  GPL_ONLY },
 	};
 	struct module *mod;
 	unsigned int i;
@@ -561,20 +531,10 @@ static bool find_symbol(struct find_symbol_arg *fsa)
 				lockdep_is_held(&module_mutex)) {
 		struct symsearch arr[] = {
 			{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
-			  NOT_GPL_ONLY, false },
+			  NOT_GPL_ONLY },
 			{ mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
 			  mod->gpl_crcs,
-			  GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
-			{ mod->unused_syms,
-			  mod->unused_syms + mod->num_unused_syms,
-			  mod->unused_crcs,
-			  NOT_GPL_ONLY, true },
-			{ mod->unused_gpl_syms,
-			  mod->unused_gpl_syms + mod->num_unused_gpl_syms,
-			  mod->unused_gpl_crcs,
-			  GPL_ONLY, true },
-#endif
+			  GPL_ONLY },
 		};
 
 		if (mod->state == MODULE_STATE_UNFORMED)
@@ -2274,10 +2234,6 @@ static int verify_exported_symbols(struct module *mod)
 	} arr[] = {
 		{ mod->syms, mod->num_syms },
 		{ mod->gpl_syms, mod->num_gpl_syms },
-#ifdef CONFIG_UNUSED_SYMBOLS
-		{ mod->unused_syms, mod->num_unused_syms },
-		{ mod->unused_gpl_syms, mod->num_unused_gpl_syms },
-#endif
 	};
 
 	for (i = 0; i < ARRAY_SIZE(arr); i++) {
@@ -3290,16 +3246,6 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 				     &mod->num_gpl_syms);
 	mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
 
-#ifdef CONFIG_UNUSED_SYMBOLS
-	mod->unused_syms = section_objs(info, "__ksymtab_unused",
-					sizeof(*mod->unused_syms),
-					&mod->num_unused_syms);
-	mod->unused_crcs = section_addr(info, "__kcrctab_unused");
-	mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
-					    sizeof(*mod->unused_gpl_syms),
-					    &mod->num_unused_gpl_syms);
-	mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
-#endif
 #ifdef CONFIG_CONSTRUCTORS
 	mod->ctors = section_objs(info, ".ctors",
 				  sizeof(*mod->ctors), &mod->num_ctors);
@@ -3480,13 +3426,8 @@ static int check_module_license_and_versions(struct module *mod)
 		pr_warn("%s: module license taints kernel.\n", mod->name);
 
 #ifdef CONFIG_MODVERSIONS
-	if ((mod->num_syms && !mod->crcs)
-	    || (mod->num_gpl_syms && !mod->gpl_crcs)
-#ifdef CONFIG_UNUSED_SYMBOLS
-	    || (mod->num_unused_syms && !mod->unused_crcs)
-	    || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
-#endif
-		) {
+	if ((mod->num_syms && !mod->crcs) ||
+	    (mod->num_gpl_syms && !mod->gpl_crcs)) {
 		return try_to_force_load(mod,
 					 "no versions for exported symbols");
 	}
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 92e888ed939f..eabd2d5467b1 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -4290,8 +4290,7 @@ sub process {
 		if (defined $realline_next &&
 		    exists $lines[$realline_next - 1] &&
 		    !defined $suppress_export{$realline_next} &&
-		    ($lines[$realline_next - 1] =~ /EXPORT_SYMBOL.*\((.*)\)/ ||
-		     $lines[$realline_next - 1] =~ /EXPORT_UNUSED_SYMBOL.*\((.*)\)/)) {
+		    ($lines[$realline_next - 1] =~ /EXPORT_SYMBOL.*\((.*)\)/)) {
 			# Handle definitions which produce identifiers with
 			# a prefix:
 			#   XXX(foo);
@@ -4318,8 +4317,7 @@ sub process {
 		}
 		if (!defined $suppress_export{$linenr} &&
 		    $prevline =~ /^.\s*$/ &&
-		    ($line =~ /EXPORT_SYMBOL.*\((.*)\)/ ||
-		     $line =~ /EXPORT_UNUSED_SYMBOL.*\((.*)\)/)) {
+		    ($line =~ /EXPORT_SYMBOL.*\((.*)\)/)) {
 #print "FOO B <$lines[$linenr - 1]>\n";
 			$suppress_export{$linenr} = 2;
 		}
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 25c1446055d1..20fc57837881 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -43,8 +43,9 @@ static int allow_missing_ns_imports;
 static bool error_occurred;
 
 enum export {
-	export_plain,      export_unused,     export_gpl,
-	export_unused_gpl, export_unknown
+	export_plain,
+	export_gpl,
+	export_unknown
 };
 
 /* In kernel, this size is defined in linux/module.h;
@@ -301,9 +302,7 @@ static const struct {
 	enum export export;
 } export_list[] = {
 	{ .str = "EXPORT_SYMBOL",            .export = export_plain },
-	{ .str = "EXPORT_UNUSED_SYMBOL",     .export = export_unused },
 	{ .str = "EXPORT_SYMBOL_GPL",        .export = export_gpl },
-	{ .str = "EXPORT_UNUSED_SYMBOL_GPL", .export = export_unused_gpl },
 	{ .str = "(unknown)",                .export = export_unknown },
 };
 
@@ -362,12 +361,8 @@ static enum export export_from_secname(struct elf_info *elf, unsigned int sec)
 
 	if (strstarts(secname, "___ksymtab+"))
 		return export_plain;
-	else if (strstarts(secname, "___ksymtab_unused+"))
-		return export_unused;
 	else if (strstarts(secname, "___ksymtab_gpl+"))
 		return export_gpl;
-	else if (strstarts(secname, "___ksymtab_unused_gpl+"))
-		return export_unused_gpl;
 	else
 		return export_unknown;
 }
@@ -376,12 +371,8 @@ static enum export export_from_sec(struct elf_info *elf, unsigned int sec)
 {
 	if (sec == elf->export_sec)
 		return export_plain;
-	else if (sec == elf->export_unused_sec)
-		return export_unused;
 	else if (sec == elf->export_gpl_sec)
 		return export_gpl;
-	else if (sec == elf->export_unused_gpl_sec)
-		return export_unused_gpl;
 	else
 		return export_unknown;
 }
@@ -585,12 +576,8 @@ static int parse_elf(struct elf_info *info, const char *filename)
 			info->modinfo_len = sechdrs[i].sh_size;
 		} else if (strcmp(secname, "__ksymtab") == 0)
 			info->export_sec = i;
-		else if (strcmp(secname, "__ksymtab_unused") == 0)
-			info->export_unused_sec = i;
 		else if (strcmp(secname, "__ksymtab_gpl") == 0)
 			info->export_gpl_sec = i;
-		else if (strcmp(secname, "__ksymtab_unused_gpl") == 0)
-			info->export_unused_gpl_sec = i;
 
 		if (sechdrs[i].sh_type == SHT_SYMTAB) {
 			unsigned int sh_link_idx;
@@ -2141,32 +2128,13 @@ static void check_for_gpl_usage(enum export exp, const char *m, const char *s)
 		error("GPL-incompatible module %s.ko uses GPL-only symbol '%s'\n",
 		      m, s);
 		break;
-	case export_unused_gpl:
-		error("GPL-incompatible module %s.ko uses GPL-only symbol marked UNUSED '%s'\n",
-		      m, s);
-		break;
 	case export_plain:
-	case export_unused:
 	case export_unknown:
 		/* ignore */
 		break;
 	}
 }
 
-static void check_for_unused(enum export exp, const char *m, const char *s)
-{
-	switch (exp) {
-	case export_unused:
-	case export_unused_gpl:
-		warn("module %s.ko uses symbol '%s' marked UNUSED\n",
-		     m, s);
-		break;
-	default:
-		/* ignore */
-		break;
-	}
-}
-
 static void check_exports(struct module *mod)
 {
 	struct symbol *s, *exp;
@@ -2197,7 +2165,6 @@ static void check_exports(struct module *mod)
 
 		if (!mod->gpl_compatible)
 			check_for_gpl_usage(exp->export, basename, exp->name);
-		check_for_unused(exp->export, basename, exp->name);
 	}
 }
 
diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h
index 834220de002b..0c47ff95c0e2 100644
--- a/scripts/mod/modpost.h
+++ b/scripts/mod/modpost.h
@@ -139,9 +139,7 @@ struct elf_info {
 	Elf_Sym      *symtab_start;
 	Elf_Sym      *symtab_stop;
 	Elf_Section  export_sec;
-	Elf_Section  export_unused_sec;
 	Elf_Section  export_gpl_sec;
-	Elf_Section  export_unused_gpl_sec;
 	char         *strtab;
 	char	     *modinfo;
 	unsigned int modinfo_len;
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index d82b452e8a71..24e8af579ce3 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -11,12 +11,8 @@ SECTIONS {
 
 	__ksymtab		0 : { *(SORT(___ksymtab+*)) }
 	__ksymtab_gpl		0 : { *(SORT(___ksymtab_gpl+*)) }
-	__ksymtab_unused	0 : { *(SORT(___ksymtab_unused+*)) }
-	__ksymtab_unused_gpl	0 : { *(SORT(___ksymtab_unused_gpl+*)) }
 	__kcrctab		0 : { *(SORT(___kcrctab+*)) }
 	__kcrctab_gpl		0 : { *(SORT(___kcrctab_gpl+*)) }
-	__kcrctab_unused	0 : { *(SORT(___kcrctab_unused+*)) }
-	__kcrctab_unused_gpl	0 : { *(SORT(___kcrctab_unused_gpl+*)) }
 
 	.init_array		0 : ALIGN(8) { *(SORT(.init_array.*)) *(.init_array) }
 
diff --git a/tools/include/linux/export.h b/tools/include/linux/export.h
index 9f61349a8944..acb6f4daa2f0 100644
--- a/tools/include/linux/export.h
+++ b/tools/include/linux/export.h
@@ -3,7 +3,5 @@
 
 #define EXPORT_SYMBOL(sym)
 #define EXPORT_SYMBOL_GPL(sym)
-#define EXPORT_UNUSED_SYMBOL(sym)
-#define EXPORT_UNUSED_SYMBOL_GPL(sym)
 
 #endif
-- 
cgit v1.2.3


From 7aa382cfe714f61b0c29f02c31d389c506b4e2ae Mon Sep 17 00:00:00 2001
From: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Date: Sun, 7 Feb 2021 14:14:16 +0800
Subject: regulator: mt6315: Add support for MT6315 regulator

The MT6315 is a regulator found on boards based on MediaTek MT8192 and
probably other SoCs. It connects as a slave to SoC using SPMI.

Signed-off-by: Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>
Link: https://lore.kernel.org/r/1612678457-11548-3-git-send-email-hsin-hsiung.wang@mediatek.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/Kconfig                  |  10 +
 drivers/regulator/Makefile                 |   1 +
 drivers/regulator/mt6315-regulator.c       | 299 +++++++++++++++++++++++++++++
 include/linux/regulator/mt6315-regulator.h |  44 +++++
 4 files changed, 354 insertions(+)
 create mode 100644 drivers/regulator/mt6315-regulator.c
 create mode 100644 include/linux/regulator/mt6315-regulator.h

(limited to 'include/linux')

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 88316b8a65d2..77c43134bc9e 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -731,6 +731,16 @@ config REGULATOR_MT6311
 	  This driver supports the control of different power rails of device
 	  through regulator interface.
 
+config REGULATOR_MT6315
+	tristate "MediaTek MT6315 PMIC"
+	depends on SPMI
+	select REGMAP_SPMI
+	help
+	  Say y here to select this option to enable the power regulator of
+	  MediaTek MT6315 PMIC.
+	  This driver supports the control of different power rails of device
+	  through regulator interface.
+
 config REGULATOR_MT6323
 	tristate "MediaTek MT6323 PMIC"
 	depends on MFD_MT6397
diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index d1957629537b..44d2f8bf4b74 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_REGULATOR_MP8859) += mp8859.o
 obj-$(CONFIG_REGULATOR_MP886X) += mp886x.o
 obj-$(CONFIG_REGULATOR_MPQ7920) += mpq7920.o
 obj-$(CONFIG_REGULATOR_MT6311) += mt6311-regulator.o
+obj-$(CONFIG_REGULATOR_MT6315) += mt6315-regulator.o
 obj-$(CONFIG_REGULATOR_MT6323)	+= mt6323-regulator.o
 obj-$(CONFIG_REGULATOR_MT6358)	+= mt6358-regulator.o
 obj-$(CONFIG_REGULATOR_MT6360) += mt6360-regulator.o
diff --git a/drivers/regulator/mt6315-regulator.c b/drivers/regulator/mt6315-regulator.c
new file mode 100644
index 000000000000..d49a1534d8e9
--- /dev/null
+++ b/drivers/regulator/mt6315-regulator.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Copyright (c) 2021 MediaTek Inc.
+
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/regulator/driver.h>
+#include <linux/regulator/machine.h>
+#include <linux/regulator/mt6315-regulator.h>
+#include <linux/regulator/of_regulator.h>
+#include <linux/spmi.h>
+
+#define MT6315_BUCK_MODE_AUTO		0
+#define MT6315_BUCK_MODE_FORCE_PWM	1
+#define MT6315_BUCK_MODE_LP		2
+
+struct mt6315_regulator_info {
+	struct regulator_desc desc;
+	u32 status_reg;
+	u32 lp_mode_mask;
+	u32 lp_mode_shift;
+};
+
+struct mt_regulator_init_data {
+	u32 modeset_mask[MT6315_VBUCK_MAX];
+};
+
+struct mt6315_chip {
+	struct device *dev;
+	struct regmap *regmap;
+};
+
+#define MT_BUCK(_name, _bid, _vsel)				\
+[_bid] = {							\
+	.desc = {						\
+		.name = _name,					\
+		.of_match = of_match_ptr(_name),		\
+		.regulators_node = "regulators",		\
+		.ops = &mt6315_volt_range_ops,			\
+		.type = REGULATOR_VOLTAGE,			\
+		.id = _bid,					\
+		.owner = THIS_MODULE,				\
+		.n_voltages = 0xbf,				\
+		.linear_ranges = mt_volt_range1,		\
+		.n_linear_ranges = ARRAY_SIZE(mt_volt_range1),	\
+		.vsel_reg = _vsel,				\
+		.vsel_mask = 0xff,				\
+		.enable_reg = MT6315_BUCK_TOP_CON0,		\
+		.enable_mask = BIT(_bid),			\
+		.of_map_mode = mt6315_map_mode,			\
+	},							\
+	.status_reg = _bid##_DBG4,				\
+	.lp_mode_mask = BIT(_bid),				\
+	.lp_mode_shift = _bid,					\
+}
+
+static const struct linear_range mt_volt_range1[] = {
+	REGULATOR_LINEAR_RANGE(0, 0, 0xbf, 6250),
+};
+
+static unsigned int mt6315_map_mode(u32 mode)
+{
+	switch (mode) {
+	case MT6315_BUCK_MODE_AUTO:
+		return REGULATOR_MODE_NORMAL;
+	case MT6315_BUCK_MODE_FORCE_PWM:
+		return REGULATOR_MODE_FAST;
+	case MT6315_BUCK_MODE_LP:
+		return REGULATOR_MODE_IDLE;
+	default:
+		return -EINVAL;
+	}
+}
+
+static unsigned int mt6315_regulator_get_mode(struct regulator_dev *rdev)
+{
+	struct mt_regulator_init_data *init = rdev_get_drvdata(rdev);
+	const struct mt6315_regulator_info *info;
+	int ret, regval;
+	u32 modeset_mask;
+
+	info = container_of(rdev->desc, struct mt6315_regulator_info, desc);
+	modeset_mask = init->modeset_mask[rdev_get_id(rdev)];
+	ret = regmap_read(rdev->regmap, MT6315_BUCK_TOP_4PHASE_ANA_CON42, &regval);
+	if (ret != 0) {
+		dev_notice(&rdev->dev, "Failed to get mode: %d\n", ret);
+		return ret;
+	}
+
+	if ((regval & modeset_mask) == modeset_mask)
+		return REGULATOR_MODE_FAST;
+
+	ret = regmap_read(rdev->regmap, MT6315_BUCK_TOP_CON1, &regval);
+	if (ret != 0) {
+		dev_notice(&rdev->dev, "Failed to get lp mode: %d\n", ret);
+		return ret;
+	}
+
+	if (regval & info->lp_mode_mask)
+		return REGULATOR_MODE_IDLE;
+	else
+		return REGULATOR_MODE_NORMAL;
+}
+
+static int mt6315_regulator_set_mode(struct regulator_dev *rdev,
+				     u32 mode)
+{
+	struct mt_regulator_init_data *init = rdev_get_drvdata(rdev);
+	const struct mt6315_regulator_info *info;
+	int ret, val, curr_mode;
+	u32 modeset_mask;
+
+	info = container_of(rdev->desc, struct mt6315_regulator_info, desc);
+	modeset_mask = init->modeset_mask[rdev_get_id(rdev)];
+	curr_mode = mt6315_regulator_get_mode(rdev);
+	switch (mode) {
+	case REGULATOR_MODE_FAST:
+		ret = regmap_update_bits(rdev->regmap,
+					 MT6315_BUCK_TOP_4PHASE_ANA_CON42,
+					 modeset_mask,
+					 modeset_mask);
+		break;
+	case REGULATOR_MODE_NORMAL:
+		if (curr_mode == REGULATOR_MODE_FAST) {
+			ret = regmap_update_bits(rdev->regmap,
+						 MT6315_BUCK_TOP_4PHASE_ANA_CON42,
+						 modeset_mask,
+						 0);
+		} else if (curr_mode == REGULATOR_MODE_IDLE) {
+			ret = regmap_update_bits(rdev->regmap,
+						 MT6315_BUCK_TOP_CON1,
+						 info->lp_mode_mask,
+						 0);
+			usleep_range(100, 110);
+		} else {
+			ret = -EINVAL;
+		}
+		break;
+	case REGULATOR_MODE_IDLE:
+		val = MT6315_BUCK_MODE_LP >> 1;
+		val <<= info->lp_mode_shift;
+		ret = regmap_update_bits(rdev->regmap,
+					 MT6315_BUCK_TOP_CON1,
+					 info->lp_mode_mask,
+					 val);
+		break;
+	default:
+		ret = -EINVAL;
+		dev_notice(&rdev->dev, "Unsupported mode: %d\n", mode);
+		break;
+	}
+
+	if (ret != 0) {
+		dev_notice(&rdev->dev, "Failed to set mode: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int mt6315_get_status(struct regulator_dev *rdev)
+{
+	const struct mt6315_regulator_info *info;
+	int ret;
+	u32 regval;
+
+	info = container_of(rdev->desc, struct mt6315_regulator_info, desc);
+	ret = regmap_read(rdev->regmap, info->status_reg, &regval);
+	if (ret < 0) {
+		dev_notice(&rdev->dev, "Failed to get enable reg: %d\n", ret);
+		return ret;
+	}
+
+	return (regval & BIT(0)) ? REGULATOR_STATUS_ON : REGULATOR_STATUS_OFF;
+}
+
+static const struct regulator_ops mt6315_volt_range_ops = {
+	.list_voltage = regulator_list_voltage_linear_range,
+	.map_voltage = regulator_map_voltage_linear_range,
+	.set_voltage_sel = regulator_set_voltage_sel_regmap,
+	.get_voltage_sel = regulator_get_voltage_sel_regmap,
+	.set_voltage_time_sel = regulator_set_voltage_time_sel,
+	.enable = regulator_enable_regmap,
+	.disable = regulator_disable_regmap,
+	.is_enabled = regulator_is_enabled_regmap,
+	.get_status = mt6315_get_status,
+	.set_mode = mt6315_regulator_set_mode,
+	.get_mode = mt6315_regulator_get_mode,
+};
+
+static const struct mt6315_regulator_info mt6315_regulators[MT6315_VBUCK_MAX] = {
+	MT_BUCK("vbuck1", MT6315_VBUCK1, MT6315_BUCK_TOP_ELR0),
+	MT_BUCK("vbuck2", MT6315_VBUCK2, MT6315_BUCK_TOP_ELR2),
+	MT_BUCK("vbuck3", MT6315_VBUCK3, MT6315_BUCK_TOP_ELR4),
+	MT_BUCK("vbuck4", MT6315_VBUCK4, MT6315_BUCK_TOP_ELR6),
+};
+
+static const struct regmap_config mt6315_regmap_config = {
+	.reg_bits	= 16,
+	.val_bits	= 8,
+	.max_register	= 0x16d0,
+	.fast_io	= true,
+};
+
+static const struct of_device_id mt6315_of_match[] = {
+	{
+		.compatible = "mediatek,mt6315-regulator",
+	}, {
+		/* sentinel */
+	},
+};
+MODULE_DEVICE_TABLE(of, mt6315_of_match);
+
+static int mt6315_regulator_probe(struct spmi_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct regmap *regmap;
+	struct mt6315_chip *chip;
+	struct mt_regulator_init_data *init_data;
+	struct regulator_config config = {};
+	struct regulator_dev *rdev;
+	int i;
+
+	regmap = devm_regmap_init_spmi_ext(pdev, &mt6315_regmap_config);
+	if (!regmap)
+		return -ENODEV;
+
+	chip = devm_kzalloc(dev, sizeof(struct mt6315_chip), GFP_KERNEL);
+	if (!chip)
+		return -ENOMEM;
+
+	init_data = devm_kzalloc(dev, sizeof(struct mt_regulator_init_data), GFP_KERNEL);
+	if (!init_data)
+		return -ENOMEM;
+
+	switch (pdev->usid) {
+	case MT6315_PP:
+		init_data->modeset_mask[MT6315_VBUCK1] = BIT(MT6315_VBUCK1) | BIT(MT6315_VBUCK2) |
+							 BIT(MT6315_VBUCK4);
+		break;
+	case MT6315_SP:
+	case MT6315_RP:
+		init_data->modeset_mask[MT6315_VBUCK1] = BIT(MT6315_VBUCK1) | BIT(MT6315_VBUCK2);
+		break;
+	default:
+		init_data->modeset_mask[MT6315_VBUCK1] = BIT(MT6315_VBUCK1);
+		break;
+	}
+	for (i = MT6315_VBUCK2; i < MT6315_VBUCK_MAX; i++)
+		init_data->modeset_mask[i] = BIT(i);
+
+	chip->dev = dev;
+	chip->regmap = regmap;
+	dev_set_drvdata(dev, chip);
+
+	config.dev = dev;
+	config.regmap = regmap;
+	for (i = MT6315_VBUCK1; i < MT6315_VBUCK_MAX; i++) {
+		config.driver_data = init_data;
+		rdev = devm_regulator_register(dev, &mt6315_regulators[i].desc, &config);
+		if (IS_ERR(rdev)) {
+			dev_notice(dev, "Failed to register %s\n", mt6315_regulators[i].desc.name);
+			continue;
+		}
+	}
+
+	return 0;
+}
+
+static void mt6315_regulator_shutdown(struct spmi_device *pdev)
+{
+	struct mt6315_chip *chip = dev_get_drvdata(&pdev->dev);
+	int ret = 0;
+
+	ret |= regmap_write(chip->regmap, MT6315_TOP_TMA_KEY_H, PROTECTION_KEY_H);
+	ret |= regmap_write(chip->regmap, MT6315_TOP_TMA_KEY, PROTECTION_KEY);
+	ret |= regmap_update_bits(chip->regmap, MT6315_TOP2_ELR7, 1, 1);
+	ret |= regmap_write(chip->regmap, MT6315_TOP_TMA_KEY, 0);
+	ret |= regmap_write(chip->regmap, MT6315_TOP_TMA_KEY_H, 0);
+	if (ret < 0)
+		dev_notice(&pdev->dev, "[%#x] Failed to enable power off sequence. %d\n",
+			   pdev->usid, ret);
+}
+
+static struct spmi_driver mt6315_regulator_driver = {
+	.driver		= {
+		.name	= "mt6315-regulator",
+		.of_match_table = mt6315_of_match,
+	},
+	.probe = mt6315_regulator_probe,
+	.shutdown = mt6315_regulator_shutdown,
+};
+
+module_spmi_driver(mt6315_regulator_driver);
+
+MODULE_AUTHOR("Hsin-Hsiung Wang <hsin-hsiung.wang@mediatek.com>");
+MODULE_DESCRIPTION("Regulator Driver for MediaTek MT6315 PMIC");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/regulator/mt6315-regulator.h b/include/linux/regulator/mt6315-regulator.h
new file mode 100644
index 000000000000..3b80d3f3910c
--- /dev/null
+++ b/include/linux/regulator/mt6315-regulator.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021 MediaTek Inc.
+ */
+
+#ifndef __LINUX_REGULATOR_MT6315_H
+#define __LINUX_REGULATOR_MT6315_H
+
+#define MT6315_RP	3
+#define MT6315_PP	6
+#define MT6315_SP	7
+
+enum {
+	MT6315_VBUCK1 = 0,
+	MT6315_VBUCK2,
+	MT6315_VBUCK3,
+	MT6315_VBUCK4,
+	MT6315_VBUCK_MAX,
+};
+
+/* Register */
+#define MT6315_TOP2_ELR7			0x139
+#define MT6315_TOP_TMA_KEY			0x39F
+#define MT6315_TOP_TMA_KEY_H			0x3A0
+#define MT6315_BUCK_TOP_CON0			0x1440
+#define MT6315_BUCK_TOP_CON1			0x1443
+#define MT6315_BUCK_TOP_ELR0			0x1449
+#define MT6315_BUCK_TOP_ELR2			0x144B
+#define MT6315_BUCK_TOP_ELR4			0x144D
+#define MT6315_BUCK_TOP_ELR6			0x144F
+#define MT6315_VBUCK1_DBG0			0x1499
+#define MT6315_VBUCK1_DBG4			0x149D
+#define MT6315_VBUCK2_DBG0			0x1519
+#define MT6315_VBUCK2_DBG4			0x151D
+#define MT6315_VBUCK3_DBG0			0x1599
+#define MT6315_VBUCK3_DBG4			0x159D
+#define MT6315_VBUCK4_DBG0			0x1619
+#define MT6315_VBUCK4_DBG4			0x161D
+#define MT6315_BUCK_TOP_4PHASE_ANA_CON42	0x16B1
+
+#define PROTECTION_KEY_H			0x9C
+#define PROTECTION_KEY				0xEA
+
+#endif /* __LINUX_REGULATOR_MT6315_H */
-- 
cgit v1.2.3


From 951f6ccfcbb7e4a18bf5fef1fb373d21e5831957 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 2 Feb 2021 11:19:24 +0100
Subject: mmc: core: Drop redundant member in struct mmc host

The Kconfig option to use the blk-mq support was removed in commit
1bec43a3b181 ("mmc: core: Remove option not to use blk-mq"), but forgot to
remove the use_blk_mq member in the struct mmc_host, let's fix it.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20210202101924.69970-1-ulf.hansson@linaro.org
---
 include/linux/mmc/host.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 927ba7566617..26a3c7bc29ae 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -429,7 +429,6 @@ struct mmc_host {
 	unsigned int		doing_retune:1;	/* re-tuning in progress */
 	unsigned int		retune_now:1;	/* do re-tuning at next req */
 	unsigned int		retune_paused:1; /* re-tuning is temporarily disabled */
-	unsigned int		use_blk_mq:1;	/* use blk-mq */
 	unsigned int		retune_crc_disable:1; /* don't trigger retune upon crc */
 	unsigned int		can_dma_map_merge:1; /* merging can be used */
 
-- 
cgit v1.2.3


From 40c735db06e16bf29c74c3626318719783be3784 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 2 Feb 2021 11:16:26 +0100
Subject: mmc: core: Drop redundant bouncesz from struct mmc_card

The commit de3ee99b097d ("mmc: Delete bounce buffer handling") removed the
bounce buffer handling from the mmc core, but forgot to remove the bouncesz
member from the struct mmc_card, let's fix it.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20210202101626.64503-1-ulf.hansson@linaro.org
---
 include/linux/mmc/card.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 42df06c6b19c..f9ad35dd6012 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -311,7 +311,6 @@ struct mmc_card {
 	struct mmc_part	part[MMC_NUM_PHY_PARTITION]; /* physical partitions */
 	unsigned int    nr_parts;
 
-	unsigned int		bouncesz;	/* Bounce buffer size */
 	struct workqueue_struct *complete_wq;	/* Private workqueue */
 };
 
-- 
cgit v1.2.3


From 883c36a32fc031046218ef5802023d5aa54d6cb0 Mon Sep 17 00:00:00 2001
From: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Date: Mon, 8 Feb 2021 17:19:28 +0530
Subject: spi: Change provied to provided in the file spi.h

s/provied/provided/

Signed-off-by: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Link: https://lore.kernel.org/r/20210208114928.32241-1-unixbhaskar@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 797ef28fa7d5..592897fa4f03 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -622,7 +622,7 @@ struct spi_controller {
 
 	/*
 	 * These hooks are for drivers that use a generic implementation
-	 * of transfer_one_message() provied by the core.
+	 * of transfer_one_message() provided by the core.
 	 */
 	void (*set_cs)(struct spi_device *spi, bool enable);
 	int (*transfer_one)(struct spi_controller *ctlr, struct spi_device *spi,
-- 
cgit v1.2.3


From db783e769a950b3f9502dfbc0f7fdbea499a1da6 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Thu, 17 Dec 2020 08:34:20 +0000
Subject: mfd: Standardise MFD_CELL_* helper names

Start all helpers with "MFD_CELL_".

Cc: Gene Chen <gene_chen@richtek.com>
Cc: linux-mediatek@lists.infradead.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/ab8500-core.c  | 42 +++++++++++++++++++++---------------------
 drivers/mfd/db8500-prcmu.c |  6 +++---
 drivers/mfd/mt6360-core.c  | 12 ++++++------
 include/linux/mfd/core.h   |  6 +++---
 4 files changed, 33 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index a3bac9da8cbb..ba8da061af0e 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -610,52 +610,52 @@ int ab8500_suspend(struct ab8500 *ab8500)
 }
 
 static const struct mfd_cell ab8500_bm_devs[] = {
-	OF_MFD_CELL("ab8500-charger", NULL, &ab8500_bm_data,
+	MFD_CELL_OF("ab8500-charger", NULL, &ab8500_bm_data,
 		    sizeof(ab8500_bm_data), 0, "stericsson,ab8500-charger"),
-	OF_MFD_CELL("ab8500-btemp", NULL, &ab8500_bm_data,
+	MFD_CELL_OF("ab8500-btemp", NULL, &ab8500_bm_data,
 		    sizeof(ab8500_bm_data), 0, "stericsson,ab8500-btemp"),
-	OF_MFD_CELL("ab8500-fg", NULL, &ab8500_bm_data,
+	MFD_CELL_OF("ab8500-fg", NULL, &ab8500_bm_data,
 		    sizeof(ab8500_bm_data), 0, "stericsson,ab8500-fg"),
-	OF_MFD_CELL("ab8500-chargalg", NULL, &ab8500_bm_data,
+	MFD_CELL_OF("ab8500-chargalg", NULL, &ab8500_bm_data,
 		    sizeof(ab8500_bm_data), 0, "stericsson,ab8500-chargalg"),
 };
 
 static const struct mfd_cell ab8500_devs[] = {
 #ifdef CONFIG_DEBUG_FS
-	OF_MFD_CELL("ab8500-debug",
+	MFD_CELL_OF("ab8500-debug",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-debug"),
 #endif
-	OF_MFD_CELL("ab8500-sysctrl",
+	MFD_CELL_OF("ab8500-sysctrl",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-sysctrl"),
-	OF_MFD_CELL("ab8500-ext-regulator",
+	MFD_CELL_OF("ab8500-ext-regulator",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-ext-regulator"),
-	OF_MFD_CELL("ab8500-regulator",
+	MFD_CELL_OF("ab8500-regulator",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-regulator"),
-	OF_MFD_CELL("ab8500-clk",
+	MFD_CELL_OF("ab8500-clk",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-clk"),
-	OF_MFD_CELL("ab8500-gpadc",
+	MFD_CELL_OF("ab8500-gpadc",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-gpadc"),
-	OF_MFD_CELL("ab8500-rtc",
+	MFD_CELL_OF("ab8500-rtc",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-rtc"),
-	OF_MFD_CELL("ab8500-acc-det",
+	MFD_CELL_OF("ab8500-acc-det",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-acc-det"),
-	OF_MFD_CELL("ab8500-poweron-key",
+	MFD_CELL_OF("ab8500-poweron-key",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-poweron-key"),
-	OF_MFD_CELL("ab8500-pwm",
+	MFD_CELL_OF("ab8500-pwm",
 		    NULL, NULL, 0, 1, "stericsson,ab8500-pwm"),
-	OF_MFD_CELL("ab8500-pwm",
+	MFD_CELL_OF("ab8500-pwm",
 		    NULL, NULL, 0, 2, "stericsson,ab8500-pwm"),
-	OF_MFD_CELL("ab8500-pwm",
+	MFD_CELL_OF("ab8500-pwm",
 		    NULL, NULL, 0, 3, "stericsson,ab8500-pwm"),
-	OF_MFD_CELL("ab8500-denc",
+	MFD_CELL_OF("ab8500-denc",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-denc"),
-	OF_MFD_CELL("pinctrl-ab8500",
+	MFD_CELL_OF("pinctrl-ab8500",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-gpio"),
-	OF_MFD_CELL("abx500-temp",
+	MFD_CELL_OF("abx500-temp",
 		    NULL, NULL, 0, 0, "stericsson,abx500-temp"),
-	OF_MFD_CELL("ab8500-usb",
+	MFD_CELL_OF("ab8500-usb",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-usb"),
-	OF_MFD_CELL("ab8500-codec",
+	MFD_CELL_OF("ab8500-codec",
 		    NULL, NULL, 0, 0, "stericsson,ab8500-codec"),
 };
 
diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index a5983d515db0..167faac9b75b 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -2954,12 +2954,12 @@ static const struct mfd_cell common_prcmu_devs[] = {
 };
 
 static const struct mfd_cell db8500_prcmu_devs[] = {
-	OF_MFD_CELL("db8500-prcmu-regulators", NULL,
+	MFD_CELL_OF("db8500-prcmu-regulators", NULL,
 		    &db8500_regulators, sizeof(db8500_regulators), 0,
 		    "stericsson,db8500-prcmu-regulator"),
-	OF_MFD_CELL("cpuidle-dbx500",
+	MFD_CELL_OF("cpuidle-dbx500",
 		    NULL, NULL, 0, 0, "stericsson,cpuidle-dbx500"),
-	OF_MFD_CELL("db8500-thermal",
+	MFD_CELL_OF("db8500-thermal",
 		    NULL, NULL, 0, 0, "stericsson,db8500-thermal"),
 };
 
diff --git a/drivers/mfd/mt6360-core.c b/drivers/mfd/mt6360-core.c
index 4661c1b29a72..480722acf706 100644
--- a/drivers/mfd/mt6360-core.c
+++ b/drivers/mfd/mt6360-core.c
@@ -292,17 +292,17 @@ static const struct resource mt6360_ldo_resources[] = {
 };
 
 static const struct mfd_cell mt6360_devs[] = {
-	OF_MFD_CELL("mt6360_adc", mt6360_adc_resources,
+	MFD_CELL_OF("mt6360_adc", mt6360_adc_resources,
 		    NULL, 0, 0, "mediatek,mt6360_adc"),
-	OF_MFD_CELL("mt6360_chg", mt6360_chg_resources,
+	MFD_CELL_OF("mt6360_chg", mt6360_chg_resources,
 		    NULL, 0, 0, "mediatek,mt6360_chg"),
-	OF_MFD_CELL("mt6360_led", mt6360_led_resources,
+	MFD_CELL_OF("mt6360_led", mt6360_led_resources,
 		    NULL, 0, 0, "mediatek,mt6360_led"),
-	OF_MFD_CELL("mt6360_pmic", mt6360_pmic_resources,
+	MFD_CELL_OF("mt6360_pmic", mt6360_pmic_resources,
 		    NULL, 0, 0, "mediatek,mt6360_pmic"),
-	OF_MFD_CELL("mt6360_ldo", mt6360_ldo_resources,
+	MFD_CELL_OF("mt6360_ldo", mt6360_ldo_resources,
 		    NULL, 0, 0, "mediatek,mt6360_ldo"),
-	OF_MFD_CELL("mt6360_tcpc", NULL,
+	MFD_CELL_OF("mt6360_tcpc", NULL,
 		    NULL, 0, 0, "mediatek,mt6360_tcpc"),
 };
 
diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h
index 4b35baa14d30..2009c4b936d9 100644
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -28,13 +28,13 @@
 		.id = (_id),						\
 	}
 
-#define OF_MFD_CELL_REG(_name, _res, _pdata, _pdsize, _id, _compat, _of_reg) \
+#define MFD_CELL_OF_REG(_name, _res, _pdata, _pdsize, _id, _compat, _of_reg) \
 	MFD_CELL_ALL(_name, _res, _pdata, _pdsize, _id, _compat, _of_reg, true, NULL)
 
-#define OF_MFD_CELL(_name, _res, _pdata, _pdsize, _id, _compat) \
+#define MFD_CELL_OF(_name, _res, _pdata, _pdsize, _id, _compat) \
 	MFD_CELL_ALL(_name, _res, _pdata, _pdsize, _id, _compat, 0, false, NULL)
 
-#define ACPI_MFD_CELL(_name, _res, _pdata, _pdsize, _id, _match) \
+#define MFD_CELL_ACPI(_name, _res, _pdata, _pdsize, _id, _match) \
 	MFD_CELL_ALL(_name, _res, _pdata, _pdsize, _id, NULL, 0, false, _match)
 
 #define MFD_CELL_BASIC(_name, _res, _pdata, _pdsize, _id) \
-- 
cgit v1.2.3


From 296f5568c6ee906e2a8db00adc751674f1745bd8 Mon Sep 17 00:00:00 2001
From: Russ Weight <russell.h.weight@intel.com>
Date: Thu, 14 Jan 2021 15:16:48 -0800
Subject: mfd: intel-m10-bmc: Expose MAC address and count

Create two sysfs entries for exposing the MAC address and count
from the MAX10 BMC register space. The MAC address is the first
in a sequential block of MAC addresses reserved for the FPGA card.
The MAC count is the number of MAC addresses in the reserved block.

Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 .../ABI/testing/sysfs-driver-intel-m10-bmc         | 21 +++++++++++
 drivers/mfd/intel-m10-bmc.c                        | 43 ++++++++++++++++++++++
 include/linux/mfd/intel-m10-bmc.h                  |  9 +++++
 3 files changed, 73 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc b/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc
index 979a2d62513f..9773925138af 100644
--- a/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc
+++ b/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc
@@ -13,3 +13,24 @@ Contact:	Xu Yilun <yilun.xu@intel.com>
 Description:	Read only. Returns the firmware version of Intel MAX10
 		BMC chip.
 		Format: "0x%x".
+
+What:		/sys/bus/spi/devices/.../mac_address
+Date:		January 2021
+KernelVersion:  5.12
+Contact:	Russ Weight <russell.h.weight@intel.com>
+Description:	Read only. Returns the first MAC address in a block
+		of sequential MAC addresses assigned to the board
+		that is managed by the Intel MAX10 BMC. It is stored in
+		FLASH storage and is mirrored in the MAX10 BMC register
+		space.
+		Format: "%02x:%02x:%02x:%02x:%02x:%02x".
+
+What:		/sys/bus/spi/devices/.../mac_count
+Date:		January 2021
+KernelVersion:  5.12
+Contact:	Russ Weight <russell.h.weight@intel.com>
+Description:	Read only. Returns the number of sequential MAC
+		addresses assigned to the board managed by the Intel
+		MAX10 BMC. This value is stored in FLASH and is mirrored
+		in the MAX10 BMC register space.
+		Format: "%u".
diff --git a/drivers/mfd/intel-m10-bmc.c b/drivers/mfd/intel-m10-bmc.c
index b84579b7b4f0..06c977519479 100644
--- a/drivers/mfd/intel-m10-bmc.c
+++ b/drivers/mfd/intel-m10-bmc.c
@@ -60,9 +60,52 @@ static ssize_t bmcfw_version_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(bmcfw_version);
 
+static ssize_t mac_address_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct intel_m10bmc *max10 = dev_get_drvdata(dev);
+	unsigned int macaddr_low, macaddr_high;
+	int ret;
+
+	ret = m10bmc_sys_read(max10, M10BMC_MAC_LOW, &macaddr_low);
+	if (ret)
+		return ret;
+
+	ret = m10bmc_sys_read(max10, M10BMC_MAC_HIGH, &macaddr_high);
+	if (ret)
+		return ret;
+
+	return sysfs_emit(buf, "%02x:%02x:%02x:%02x:%02x:%02x\n",
+			  (u8)FIELD_GET(M10BMC_MAC_BYTE1, macaddr_low),
+			  (u8)FIELD_GET(M10BMC_MAC_BYTE2, macaddr_low),
+			  (u8)FIELD_GET(M10BMC_MAC_BYTE3, macaddr_low),
+			  (u8)FIELD_GET(M10BMC_MAC_BYTE4, macaddr_low),
+			  (u8)FIELD_GET(M10BMC_MAC_BYTE5, macaddr_high),
+			  (u8)FIELD_GET(M10BMC_MAC_BYTE6, macaddr_high));
+}
+static DEVICE_ATTR_RO(mac_address);
+
+static ssize_t mac_count_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct intel_m10bmc *max10 = dev_get_drvdata(dev);
+	unsigned int macaddr_high;
+	int ret;
+
+	ret = m10bmc_sys_read(max10, M10BMC_MAC_HIGH, &macaddr_high);
+	if (ret)
+		return ret;
+
+	return sysfs_emit(buf, "%u\n",
+			  (u8)FIELD_GET(M10BMC_MAC_COUNT, macaddr_high));
+}
+static DEVICE_ATTR_RO(mac_count);
+
 static struct attribute *m10bmc_attrs[] = {
 	&dev_attr_bmc_version.attr,
 	&dev_attr_bmcfw_version.attr,
+	&dev_attr_mac_address.attr,
+	&dev_attr_mac_count.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(m10bmc);
diff --git a/include/linux/mfd/intel-m10-bmc.h b/include/linux/mfd/intel-m10-bmc.h
index c8ef2f1654a4..74d4e193966a 100644
--- a/include/linux/mfd/intel-m10-bmc.h
+++ b/include/linux/mfd/intel-m10-bmc.h
@@ -15,6 +15,15 @@
 
 /* Register offset of system registers */
 #define NIOS2_FW_VERSION		0x0
+#define M10BMC_MAC_LOW			0x10
+#define M10BMC_MAC_BYTE4		GENMASK(7, 0)
+#define M10BMC_MAC_BYTE3		GENMASK(15, 8)
+#define M10BMC_MAC_BYTE2		GENMASK(23, 16)
+#define M10BMC_MAC_BYTE1		GENMASK(31, 24)
+#define M10BMC_MAC_HIGH			0x14
+#define M10BMC_MAC_BYTE6		GENMASK(7, 0)
+#define M10BMC_MAC_BYTE5		GENMASK(15, 8)
+#define M10BMC_MAC_COUNT		GENMASK(23, 16)
 #define M10BMC_TEST_REG			0x3c
 #define M10BMC_BUILD_VER		0x68
 #define M10BMC_VER_MAJOR_MSK		GENMASK(23, 16)
-- 
cgit v1.2.3


From 02e550d5b706f034d24e7e46234eb1982f05a137 Mon Sep 17 00:00:00 2001
From: Jeff LaBundy <jeff@labundy.com>
Date: Sun, 17 Jan 2021 21:57:07 -0600
Subject: mfd: iqs62x: Do not poll during ATI

After loading firmware, the driver triggers ATI (calibration) with
the newly loaded register configuration in place. Next, the driver
polls a register field to ensure ATI completed in a timely fashion
and that the device is ready to sense.

However, communicating with the device over I2C while ATI is under-
way may induce noise in the device and cause ATI to fail. As such,
the vendor recommends not to poll the device during ATI.

To solve this problem, let the device naturally signal to the host
that ATI is complete by way of an interrupt. A completion prevents
the sub-devices from being registered until this happens.

The former logic that scaled ATI timeout and filter settling delay
is not carried forward with the new implementation, as it produces
overly conservative delays at lower clock rates. Instead, a single
pair of delays that covers all cases is used.

Signed-off-by: Jeff LaBundy <jeff@labundy.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/iqs62x.c       | 125 +++++++++++++++++++++++++--------------------
 include/linux/mfd/iqs62x.h |  11 ++--
 2 files changed, 73 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/iqs62x.c b/drivers/mfd/iqs62x.c
index 07c972531715..9b5c389ca2c3 100644
--- a/drivers/mfd/iqs62x.c
+++ b/drivers/mfd/iqs62x.c
@@ -36,7 +36,6 @@
 #define IQS62X_PROD_NUM				0x00
 
 #define IQS62X_SYS_FLAGS			0x10
-#define IQS62X_SYS_FLAGS_IN_ATI			BIT(2)
 
 #define IQS620_HALL_FLAGS			0x16
 #define IQS621_HALL_FLAGS			0x19
@@ -60,6 +59,7 @@
 #define IQS62X_SYS_SETTINGS_ACK_RESET		BIT(6)
 #define IQS62X_SYS_SETTINGS_EVENT_MODE		BIT(5)
 #define IQS62X_SYS_SETTINGS_CLK_DIV		BIT(4)
+#define IQS62X_SYS_SETTINGS_COMM_ATI		BIT(3)
 #define IQS62X_SYS_SETTINGS_REDO_ATI		BIT(1)
 
 #define IQS62X_PWR_SETTINGS			0xD2
@@ -81,9 +81,7 @@
 #define IQS62X_FW_REC_TYPE_MASK			3
 #define IQS62X_FW_REC_TYPE_DATA			4
 
-#define IQS62X_ATI_POLL_SLEEP_US		10000
-#define IQS62X_ATI_POLL_TIMEOUT_US		500000
-#define IQS62X_ATI_STABLE_DELAY_MS		150
+#define IQS62X_FILT_SETTLE_MS			250
 
 struct iqs62x_fw_rec {
 	u8 type;
@@ -111,7 +109,6 @@ static int iqs62x_dev_init(struct iqs62x_core *iqs62x)
 	struct iqs62x_fw_blk *fw_blk;
 	unsigned int val;
 	int ret;
-	u8 clk_div = 1;
 
 	list_for_each_entry(fw_blk, &iqs62x->fw_blk_head, list) {
 		if (fw_blk->mask)
@@ -181,28 +178,32 @@ static int iqs62x_dev_init(struct iqs62x_core *iqs62x)
 			return ret;
 	}
 
-	ret = regmap_read(iqs62x->regmap, IQS62X_SYS_SETTINGS, &val);
-	if (ret)
-		return ret;
-
-	if (val & IQS62X_SYS_SETTINGS_CLK_DIV)
-		clk_div = iqs62x->dev_desc->clk_div;
-
-	ret = regmap_write(iqs62x->regmap, IQS62X_SYS_SETTINGS, val |
-			   IQS62X_SYS_SETTINGS_ACK_RESET |
-			   IQS62X_SYS_SETTINGS_EVENT_MODE |
-			   IQS62X_SYS_SETTINGS_REDO_ATI);
-	if (ret)
-		return ret;
-
-	ret = regmap_read_poll_timeout(iqs62x->regmap, IQS62X_SYS_FLAGS, val,
-				       !(val & IQS62X_SYS_FLAGS_IN_ATI),
-				       IQS62X_ATI_POLL_SLEEP_US,
-				       IQS62X_ATI_POLL_TIMEOUT_US * clk_div);
+	/*
+	 * Place the device in streaming mode at first so as not to miss the
+	 * limited number of interrupts that would otherwise occur after ATI
+	 * completes. The device is subsequently placed in event mode by the
+	 * interrupt handler.
+	 *
+	 * In the meantime, mask interrupts during ATI to prevent the device
+	 * from soliciting I2C traffic until the noise-sensitive ATI process
+	 * is complete.
+	 */
+	ret = regmap_update_bits(iqs62x->regmap, IQS62X_SYS_SETTINGS,
+				 IQS62X_SYS_SETTINGS_ACK_RESET |
+				 IQS62X_SYS_SETTINGS_EVENT_MODE |
+				 IQS62X_SYS_SETTINGS_COMM_ATI |
+				 IQS62X_SYS_SETTINGS_REDO_ATI,
+				 IQS62X_SYS_SETTINGS_ACK_RESET |
+				 IQS62X_SYS_SETTINGS_REDO_ATI);
 	if (ret)
 		return ret;
 
-	msleep(IQS62X_ATI_STABLE_DELAY_MS * clk_div);
+	/*
+	 * The following delay gives the device time to deassert its RDY output
+	 * in case a communication window was open while the REDO_ATI field was
+	 * written. This prevents an interrupt from being serviced prematurely.
+	 */
+	usleep_range(5000, 5100);
 
 	return 0;
 }
@@ -433,6 +434,11 @@ const struct iqs62x_event_desc iqs62x_events[IQS62X_NUM_EVENTS] = {
 		.mask	= BIT(7),
 		.val	= BIT(7),
 	},
+	[IQS62X_EVENT_SYS_ATI] = {
+		.reg	= IQS62X_EVENT_SYS,
+		.mask	= BIT(2),
+		.val	= BIT(2),
+	},
 };
 EXPORT_SYMBOL_GPL(iqs62x_events);
 
@@ -521,12 +527,39 @@ static irqreturn_t iqs62x_irq(int irq, void *context)
 				"Failed to re-initialize device: %d\n", ret);
 			return IRQ_NONE;
 		}
+
+		iqs62x->event_cache |= BIT(IQS62X_EVENT_SYS_RESET);
+		reinit_completion(&iqs62x->ati_done);
+	} else if (event_flags & BIT(IQS62X_EVENT_SYS_ATI)) {
+		iqs62x->event_cache |= BIT(IQS62X_EVENT_SYS_ATI);
+		reinit_completion(&iqs62x->ati_done);
+	} else if (!completion_done(&iqs62x->ati_done)) {
+		ret = regmap_update_bits(iqs62x->regmap, IQS62X_SYS_SETTINGS,
+					 IQS62X_SYS_SETTINGS_EVENT_MODE, 0xFF);
+		if (ret) {
+			dev_err(&client->dev,
+				"Failed to enable event mode: %d\n", ret);
+			return IRQ_NONE;
+		}
+
+		msleep(IQS62X_FILT_SETTLE_MS);
+		complete_all(&iqs62x->ati_done);
 	}
 
-	ret = blocking_notifier_call_chain(&iqs62x->nh, event_flags,
-					   &event_data);
-	if (ret & NOTIFY_STOP_MASK)
-		return IRQ_NONE;
+	/*
+	 * Reset and ATI events are not broadcast to the sub-device drivers
+	 * until ATI has completed. Any other events that may have occurred
+	 * during ATI are ignored.
+	 */
+	if (completion_done(&iqs62x->ati_done)) {
+		event_flags |= iqs62x->event_cache;
+		ret = blocking_notifier_call_chain(&iqs62x->nh, event_flags,
+						   &event_data);
+		if (ret & NOTIFY_STOP_MASK)
+			return IRQ_NONE;
+
+		iqs62x->event_cache = 0;
+	}
 
 	/*
 	 * Once the communication window is closed, a small delay is added to
@@ -567,6 +600,12 @@ static void iqs62x_firmware_load(const struct firmware *fw, void *context)
 		goto err_out;
 	}
 
+	if (!wait_for_completion_timeout(&iqs62x->ati_done,
+					 msecs_to_jiffies(2000))) {
+		dev_err(&client->dev, "Failed to complete ATI\n");
+		goto err_out;
+	}
+
 	ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_NONE,
 				   iqs62x->dev_desc->sub_devs,
 				   iqs62x->dev_desc->num_sub_devs,
@@ -748,22 +787,17 @@ static const struct iqs62x_dev_desc iqs62x_devs[] = {
 		.dev_name	= "iqs620at",
 		.sub_devs	= iqs620at_sub_devs,
 		.num_sub_devs	= ARRAY_SIZE(iqs620at_sub_devs),
-
 		.prod_num	= IQS620_PROD_NUM,
 		.sw_num		= 0x08,
 		.cal_regs	= iqs620at_cal_regs,
 		.num_cal_regs	= ARRAY_SIZE(iqs620at_cal_regs),
-
 		.prox_mask	= BIT(0),
 		.sar_mask	= BIT(1) | BIT(7),
 		.hall_mask	= BIT(2),
 		.hyst_mask	= BIT(3),
 		.temp_mask	= BIT(4),
-
 		.prox_settings	= IQS620_PROX_SETTINGS_4,
 		.hall_flags	= IQS620_HALL_FLAGS,
-
-		.clk_div	= 4,
 		.fw_name	= "iqs620a.bin",
 		.event_regs	= &iqs620a_event_regs[IQS62X_UI_PROX],
 	},
@@ -771,20 +805,15 @@ static const struct iqs62x_dev_desc iqs62x_devs[] = {
 		.dev_name	= "iqs620a",
 		.sub_devs	= iqs620a_sub_devs,
 		.num_sub_devs	= ARRAY_SIZE(iqs620a_sub_devs),
-
 		.prod_num	= IQS620_PROD_NUM,
 		.sw_num		= 0x08,
-
 		.prox_mask	= BIT(0),
 		.sar_mask	= BIT(1) | BIT(7),
 		.hall_mask	= BIT(2),
 		.hyst_mask	= BIT(3),
 		.temp_mask	= BIT(4),
-
 		.prox_settings	= IQS620_PROX_SETTINGS_4,
 		.hall_flags	= IQS620_HALL_FLAGS,
-
-		.clk_div	= 4,
 		.fw_name	= "iqs620a.bin",
 		.event_regs	= &iqs620a_event_regs[IQS62X_UI_PROX],
 	},
@@ -792,23 +821,18 @@ static const struct iqs62x_dev_desc iqs62x_devs[] = {
 		.dev_name	= "iqs621",
 		.sub_devs	= iqs621_sub_devs,
 		.num_sub_devs	= ARRAY_SIZE(iqs621_sub_devs),
-
 		.prod_num	= IQS621_PROD_NUM,
 		.sw_num		= 0x09,
 		.cal_regs	= iqs621_cal_regs,
 		.num_cal_regs	= ARRAY_SIZE(iqs621_cal_regs),
-
 		.prox_mask	= BIT(0),
 		.hall_mask	= BIT(1),
 		.als_mask	= BIT(2),
 		.hyst_mask	= BIT(3),
 		.temp_mask	= BIT(4),
-
 		.als_flags	= IQS621_ALS_FLAGS,
 		.hall_flags	= IQS621_HALL_FLAGS,
 		.hyst_shift	= 5,
-
-		.clk_div	= 2,
 		.fw_name	= "iqs621.bin",
 		.event_regs	= &iqs621_event_regs[IQS62X_UI_PROX],
 	},
@@ -816,21 +840,16 @@ static const struct iqs62x_dev_desc iqs62x_devs[] = {
 		.dev_name	= "iqs622",
 		.sub_devs	= iqs622_sub_devs,
 		.num_sub_devs	= ARRAY_SIZE(iqs622_sub_devs),
-
 		.prod_num	= IQS622_PROD_NUM,
 		.sw_num		= 0x06,
-
 		.prox_mask	= BIT(0),
 		.sar_mask	= BIT(1),
 		.hall_mask	= BIT(2),
 		.als_mask	= BIT(3),
 		.ir_mask	= BIT(4),
-
 		.prox_settings	= IQS622_PROX_SETTINGS_4,
 		.als_flags	= IQS622_ALS_FLAGS,
 		.hall_flags	= IQS622_HALL_FLAGS,
-
-		.clk_div	= 2,
 		.fw_name	= "iqs622.bin",
 		.event_regs	= &iqs622_event_regs[IQS62X_UI_PROX],
 	},
@@ -838,14 +857,10 @@ static const struct iqs62x_dev_desc iqs62x_devs[] = {
 		.dev_name	= "iqs624",
 		.sub_devs	= iqs624_sub_devs,
 		.num_sub_devs	= ARRAY_SIZE(iqs624_sub_devs),
-
 		.prod_num	= IQS624_PROD_NUM,
 		.sw_num		= 0x0B,
-
 		.interval	= IQS624_INTERVAL_NUM,
 		.interval_div	= 3,
-
-		.clk_div	= 2,
 		.fw_name	= "iqs624.bin",
 		.event_regs	= &iqs624_event_regs[IQS62X_UI_PROX],
 	},
@@ -853,14 +868,10 @@ static const struct iqs62x_dev_desc iqs62x_devs[] = {
 		.dev_name	= "iqs625",
 		.sub_devs	= iqs625_sub_devs,
 		.num_sub_devs	= ARRAY_SIZE(iqs625_sub_devs),
-
 		.prod_num	= IQS625_PROD_NUM,
 		.sw_num		= 0x0B,
-
 		.interval	= IQS625_INTERVAL_NUM,
 		.interval_div	= 10,
-
-		.clk_div	= 2,
 		.fw_name	= "iqs625.bin",
 		.event_regs	= &iqs625_event_regs[IQS62X_UI_PROX],
 	},
@@ -890,6 +901,8 @@ static int iqs62x_probe(struct i2c_client *client)
 
 	BLOCKING_INIT_NOTIFIER_HEAD(&iqs62x->nh);
 	INIT_LIST_HEAD(&iqs62x->fw_blk_head);
+
+	init_completion(&iqs62x->ati_done);
 	init_completion(&iqs62x->fw_done);
 
 	iqs62x->regmap = devm_regmap_init_i2c(client, &iqs62x_regmap_config);
diff --git a/include/linux/mfd/iqs62x.h b/include/linux/mfd/iqs62x.h
index 043d3b6de9ec..5ced55eae11b 100644
--- a/include/linux/mfd/iqs62x.h
+++ b/include/linux/mfd/iqs62x.h
@@ -28,7 +28,7 @@
 #define IQS620_GLBL_EVENT_MASK_PMU		BIT(6)
 
 #define IQS62X_NUM_KEYS				16
-#define IQS62X_NUM_EVENTS			(IQS62X_NUM_KEYS + 5)
+#define IQS62X_NUM_EVENTS			(IQS62X_NUM_KEYS + 6)
 
 #define IQS62X_EVENT_SIZE			10
 
@@ -78,6 +78,7 @@ enum iqs62x_event_flag {
 
 	/* everything else */
 	IQS62X_EVENT_SYS_RESET,
+	IQS62X_EVENT_SYS_ATI,
 };
 
 struct iqs62x_event_data {
@@ -97,12 +98,10 @@ struct iqs62x_dev_desc {
 	const char *dev_name;
 	const struct mfd_cell *sub_devs;
 	int num_sub_devs;
-
 	u8 prod_num;
 	u8 sw_num;
 	const u8 *cal_regs;
 	int num_cal_regs;
-
 	u8 prox_mask;
 	u8 sar_mask;
 	u8 hall_mask;
@@ -110,16 +109,12 @@ struct iqs62x_dev_desc {
 	u8 temp_mask;
 	u8 als_mask;
 	u8 ir_mask;
-
 	u8 prox_settings;
 	u8 als_flags;
 	u8 hall_flags;
 	u8 hyst_shift;
-
 	u8 interval;
 	u8 interval_div;
-
-	u8 clk_div;
 	const char *fw_name;
 	const enum iqs62x_event_reg (*event_regs)[IQS62X_EVENT_SIZE];
 };
@@ -130,8 +125,10 @@ struct iqs62x_core {
 	struct regmap *regmap;
 	struct blocking_notifier_head nh;
 	struct list_head fw_blk_head;
+	struct completion ati_done;
 	struct completion fw_done;
 	enum iqs62x_ui_sel ui_sel;
+	unsigned long event_cache;
 };
 
 extern const struct iqs62x_event_desc iqs62x_events[IQS62X_NUM_EVENTS];
-- 
cgit v1.2.3


From 6ac0b71537e1c14e7532408fe4aae553aa314237 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 18:19:20 +0100
Subject: block: move struct biovec_slab to bio.c

struct biovec_slab is only used inside of bio.c, so move it there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 6 ++++++
 include/linux/bio.h | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index cee2d310f02e..2c359dadfdf6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -25,6 +25,12 @@
 #include "blk.h"
 #include "blk-rq-qos.h"
 
+struct biovec_slab {
+	int nr_vecs;
+	char *name;
+	struct kmem_cache *slab;
+};
+
 /*
  * if you change this list, also change bvec_alloc or things will
  * break badly! cannot be bigger than what you can fit into an
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c74857cf1252..4a84207dd996 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -720,12 +720,6 @@ struct bio_set {
 	struct workqueue_struct	*rescue_workqueue;
 };
 
-struct biovec_slab {
-	int nr_vecs;
-	char *name;
-	struct kmem_cache *slab;
-};
-
 static inline bool bioset_initialized(struct bio_set *bs)
 {
 	return bs->bio_slab != NULL;
-- 
cgit v1.2.3


From 0f2e6ab851ae146c468bc5151c302c6e2473f70a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 18:19:24 +0100
Subject: block: turn the nr_iovecs argument to bio_alloc* into an unsigned
 short

The bi_max_vecs and bi_vcnt fields are defined as unsigned short, so
don't allow passing larger values in.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 4 ++--
 include/linux/bio.h | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index ae241252ea14..3d28d4723f6f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -407,7 +407,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
  *
  * Returns: Pointer to new bio on success, NULL on failure.
  */
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
 			     struct bio_set *bs)
 {
 	gfp_t saved_gfp = gfp_mask;
@@ -493,7 +493,7 @@ EXPORT_SYMBOL(bio_alloc_bioset);
  *
  * Returns: Pointer to new bio on success, NULL on failure.
  */
-struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
 {
 	struct bio *bio;
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4a84207dd996..9ceeb8ecdb7f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -407,8 +407,9 @@ extern void bioset_exit(struct bio_set *);
 extern int biovec_init_pool(mempool_t *pool, int pool_entries);
 extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src);
 
-extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
-struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs);
+struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs,
+		struct bio_set *bs);
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs);
 extern void bio_put(struct bio *);
 
 extern void __bio_clone_fast(struct bio *, struct bio *);
@@ -416,7 +417,7 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 
 extern struct bio_set fs_bio_set;
 
-static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs)
 {
 	return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set);
 }
-- 
cgit v1.2.3


From 7a800a20ae6329e803c5c646b20811a6ae9ca136 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2021 18:19:29 +0100
Subject: block: use bi_max_vecs to find the bvec pool

Instead of encoding of the bvec pool using magic bio flags, just use
a helper to find the pool based on the max_vecs value.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio-integrity.c     |  11 ++---
 block/bio.c               | 104 +++++++++++++++++++---------------------------
 block/blk.h               |   6 +--
 include/linux/bio.h       |   1 -
 include/linux/blk_types.h |  29 +------------
 5 files changed, 51 insertions(+), 100 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 19617fa326c3..dfa652122a2d 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -28,7 +28,7 @@ static void __bio_integrity_free(struct bio_set *bs,
 	if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
 		if (bip->bip_vec)
 			bvec_free(&bs->bvec_integrity_pool, bip->bip_vec,
-				  bip->bip_slab);
+				  bip->bip_max_vcnt);
 		mempool_free(bip, &bs->bio_integrity_pool);
 	} else {
 		kfree(bip);
@@ -70,14 +70,11 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 	memset(bip, 0, sizeof(*bip));
 
 	if (nr_vecs > inline_vecs) {
-		unsigned long idx = 0;
-
-		bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
-					  &bs->bvec_integrity_pool);
+		bip->bip_max_vcnt = nr_vecs;
+		bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
+					  &bip->bip_max_vcnt, gfp_mask);
 		if (!bip->bip_vec)
 			goto err;
-		bip->bip_max_vcnt = bvec_nr_vecs(idx);
-		bip->bip_slab = idx;
 	} else {
 		bip->bip_vec = bip->bip_inline_vecs;
 		bip->bip_max_vcnt = inline_vecs;
diff --git a/block/bio.c b/block/bio.c
index a36f955cd120..a0eabe2f8b07 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -36,6 +36,24 @@ static struct biovec_slab {
 	{ .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" },
 };
 
+static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
+{
+	switch (nr_vecs) {
+	/* smaller bios use inline vecs */
+	case 5 ... 16:
+		return &bvec_slabs[0];
+	case 17 ... 64:
+		return &bvec_slabs[1];
+	case 65 ... 128:
+		return &bvec_slabs[2];
+	case 129 ... BIO_MAX_PAGES:
+		return &bvec_slabs[3];
+	default:
+		BUG();
+		return NULL;
+	}
+}
+
 /*
  * fs_bio_set is the bio_set containing bio and iovec memory pools used by
  * IO code that does not need private memory pools.
@@ -131,26 +149,14 @@ out:
 	mutex_unlock(&bio_slab_lock);
 }
 
-unsigned int bvec_nr_vecs(unsigned short idx)
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
 {
-	return bvec_slabs[--idx].nr_vecs;
-}
+	BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES);
 
-void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
-{
-	if (!idx)
-		return;
-	idx--;
-
-	BIO_BUG_ON(idx >= BVEC_POOL_NR);
-
-	if (idx == BVEC_POOL_MAX) {
+	if (nr_vecs == BIO_MAX_PAGES)
 		mempool_free(bv, pool);
-	} else {
-		struct biovec_slab *bvs = bvec_slabs + idx;
-
-		kmem_cache_free(bvs->slab, bv);
-	}
+	else if (nr_vecs > BIO_INLINE_VECS)
+		kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
 }
 
 /*
@@ -163,48 +169,34 @@ static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
 		__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
 }
 
-struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
-			   mempool_t *pool)
+struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
+		gfp_t gfp_mask)
 {
+	struct biovec_slab *bvs = biovec_slab(*nr_vecs);
+
+	if (WARN_ON_ONCE(!bvs))
+		return NULL;
+
 	/*
-	 * see comment near bvec_array define!
+	 * Upgrade the nr_vecs request to take full advantage of the allocation.
+	 * We also rely on this in the bvec_free path.
 	 */
-	switch (nr) {
-	/* smaller bios use inline vecs */
-	case 5 ... 16:
-		*idx = 2;
-		break;
-	case 17 ... 64:
-		*idx = 3;
-		break;
-	case 65 ... 128:
-		*idx = 4;
-		break;
-	case 129 ... BIO_MAX_PAGES:
-		*idx = 5;
-		break;
-	default:
-		return NULL;
-	}
+	*nr_vecs = bvs->nr_vecs;
 
 	/*
 	 * Try a slab allocation first for all smaller allocations.  If that
 	 * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
 	 * The mempool is sized to handle up to BIO_MAX_PAGES entries.
 	 */
-	if (*idx < BVEC_POOL_MAX) {
-		struct biovec_slab *bvs = bvec_slabs + *idx;
+	if (*nr_vecs < BIO_MAX_PAGES) {
 		struct bio_vec *bvl;
 
 		bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
-		if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) {
-			(*idx)++;
+		if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
 			return bvl;
-		}
-		*idx = BVEC_POOL_MAX;
+		*nr_vecs = BIO_MAX_PAGES;
 	}
 
-	(*idx)++;
 	return mempool_alloc(pool, gfp_mask);
 }
 
@@ -231,7 +223,7 @@ static void bio_free(struct bio *bio)
 	bio_uninit(bio);
 
 	if (bs) {
-		bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
+		bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
 
 		/*
 		 * If we have front padding, adjust the bio pointer before freeing
@@ -275,12 +267,8 @@ EXPORT_SYMBOL(bio_init);
  */
 void bio_reset(struct bio *bio)
 {
-	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
-
 	bio_uninit(bio);
-
 	memset(bio, 0, BIO_RESET_BYTES);
-	bio->bi_flags = flags;
 	atomic_set(&bio->__bi_remaining, 1);
 }
 EXPORT_SYMBOL(bio_reset);
@@ -453,22 +441,18 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
 
 	bio = p + bs->front_pad;
 	if (nr_iovecs > BIO_INLINE_VECS) {
-		unsigned long idx = 0;
 		struct bio_vec *bvl = NULL;
 
-		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
+		bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
 		if (!bvl && gfp_mask != saved_gfp) {
 			punt_bios_to_rescuer(bs);
 			gfp_mask = saved_gfp;
-			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx,
-					 &bs->bvec_pool);
+			bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
 		}
-
 		if (unlikely(!bvl))
 			goto err_free;
 
-		bio_init(bio, bvl, bvec_nr_vecs(idx));
-		bio->bi_flags |= idx << BVEC_POOL_OFFSET;
+		bio_init(bio, bvl, nr_iovecs);
 	} else if (nr_iovecs) {
 		bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS);
 	} else {
@@ -644,7 +628,7 @@ EXPORT_SYMBOL(bio_put);
  */
 void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 {
-	BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio));
+	WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs);
 
 	/*
 	 * most users will be overriding ->bi_bdev with a new target,
@@ -934,7 +918,7 @@ EXPORT_SYMBOL_GPL(bio_release_pages);
 
 static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 {
-	WARN_ON_ONCE(BVEC_POOL_IDX(bio) != 0);
+	WARN_ON_ONCE(bio->bi_max_vecs);
 
 	bio->bi_vcnt = iter->nr_segs;
 	bio->bi_io_vec = (struct bio_vec *)iter->bvec;
@@ -1495,7 +1479,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
  */
 int biovec_init_pool(mempool_t *pool, int pool_entries)
 {
-	struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;
+	struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
 
 	return mempool_init_slab_pool(pool, pool_entries, bp->slab);
 }
@@ -1605,8 +1589,6 @@ static int __init init_bio(void)
 {
 	int i;
 
-	BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET);
-
 	bio_integrity_init();
 
 	for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
diff --git a/block/blk.h b/block/blk.h
index e022a0d0f2ce..bfc4d526f626 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -56,9 +56,9 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
 void blk_freeze_queue(struct request_queue *q);
 
 #define BIO_INLINE_VECS 4
-struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
-void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
-unsigned int bvec_nr_vecs(unsigned short idx);
+struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
+		gfp_t gfp_mask);
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
 
 static inline bool biovec_phys_mergeable(struct request_queue *q,
 		struct bio_vec *vec1, struct bio_vec *vec2)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9ceeb8ecdb7f..3cbbaf76906e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -329,7 +329,6 @@ struct bio_integrity_payload {
 
 	struct bvec_iter	bip_iter;
 
-	unsigned short		bip_slab;	/* slab the bip came from */
 	unsigned short		bip_vcnt;	/* # of integrity bio_vecs */
 	unsigned short		bip_max_vcnt;	/* integrity bio_vec slots */
 	unsigned short		bip_flags;	/* control flags */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1bc6f6a01070..db026b6ec15a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -227,7 +227,7 @@ struct bio {
 						 * top bits REQ_OP. Use
 						 * accessors.
 						 */
-	unsigned short		bi_flags;	/* status, etc and bvec pool number */
+	unsigned short		bi_flags;	/* BIO_* below */
 	unsigned short		bi_ioprio;
 	unsigned short		bi_write_hint;
 	blk_status_t		bi_status;
@@ -307,33 +307,6 @@ enum {
 	BIO_FLAG_LAST
 };
 
-/* See BVEC_POOL_OFFSET below before adding new flags */
-
-/*
- * We support 6 different bvec pools, the last one is magic in that it
- * is backed by a mempool.
- */
-#define BVEC_POOL_NR		6
-#define BVEC_POOL_MAX		(BVEC_POOL_NR - 1)
-
-/*
- * Top 3 bits of bio flags indicate the pool the bvecs came from.  We add
- * 1 to the actual index so that 0 indicates that there are no bvecs to be
- * freed.
- */
-#define BVEC_POOL_BITS		(3)
-#define BVEC_POOL_OFFSET	(16 - BVEC_POOL_BITS)
-#define BVEC_POOL_IDX(bio)	((bio)->bi_flags >> BVEC_POOL_OFFSET)
-#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
-# error "BVEC_POOL_BITS is too small"
-#endif
-
-/*
- * Flags starting here get preserved by bio_reset() - this includes
- * only BVEC_POOL_IDX()
- */
-#define BIO_RESET_BITS	BVEC_POOL_OFFSET
-
 typedef __u32 __bitwise blk_mq_req_flags_t;
 
 /*
-- 
cgit v1.2.3


From 28aa2f9e73e762dbaa28fdca20cccb59c74cc139 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 8 Feb 2021 08:55:47 -0500
Subject: NFS: Always clear an invalid mapping when attempting a buffered write

If the page cache is invalid, then we can't do read-modify-write, so
ensure that we do clear it when we know it is invalid.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/file.c          |  2 ++
 fs/nfs/inode.c         | 97 ++++++++++++++++++++++++++++----------------------
 include/linux/nfs_fs.h |  1 +
 3 files changed, 57 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 02795a01c7ef..03fd1dcc96bd 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -632,6 +632,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 			goto out;
 	}
 
+	nfs_clear_invalid_mapping(file->f_mapping);
+
 	since = filemap_sample_wb_err(file->f_mapping);
 	nfs_start_io_write(inode);
 	result = generic_write_checks(iocb, from);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 522aa10a1a3e..2533053d764a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1257,55 +1257,19 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 	return 0;
 }
 
-bool nfs_mapping_need_revalidate_inode(struct inode *inode)
-{
-	return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) ||
-		NFS_STALE(inode);
-}
-
-int nfs_revalidate_mapping_rcu(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	unsigned long *bitlock = &nfsi->flags;
-	int ret = 0;
-
-	if (IS_SWAPFILE(inode))
-		goto out;
-	if (nfs_mapping_need_revalidate_inode(inode)) {
-		ret = -ECHILD;
-		goto out;
-	}
-	spin_lock(&inode->i_lock);
-	if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
-	    (nfsi->cache_validity & NFS_INO_INVALID_DATA))
-		ret = -ECHILD;
-	spin_unlock(&inode->i_lock);
-out:
-	return ret;
-}
-
 /**
- * nfs_revalidate_mapping - Revalidate the pagecache
- * @inode: pointer to host inode
+ * nfs_clear_invalid_mapping - Conditionally clear a mapping
  * @mapping: pointer to mapping
+ *
+ * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping.
  */
-int nfs_revalidate_mapping(struct inode *inode,
-		struct address_space *mapping)
+int nfs_clear_invalid_mapping(struct address_space *mapping)
 {
+	struct inode *inode = mapping->host;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	unsigned long *bitlock = &nfsi->flags;
 	int ret = 0;
 
-	/* swapfiles are not supposed to be shared. */
-	if (IS_SWAPFILE(inode))
-		goto out;
-
-	if (nfs_mapping_need_revalidate_inode(inode)) {
-		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-		if (ret < 0)
-			goto out;
-	}
-
 	/*
 	 * We must clear NFS_INO_INVALID_DATA first to ensure that
 	 * invalidations that come in while we're shooting down the mappings
@@ -1336,8 +1300,8 @@ int nfs_revalidate_mapping(struct inode *inode,
 
 	set_bit(NFS_INO_INVALIDATING, bitlock);
 	smp_wmb();
-	nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA|
-			NFS_INO_DATA_INVAL_DEFER);
+	nfsi->cache_validity &=
+		~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER);
 	spin_unlock(&inode->i_lock);
 	trace_nfs_invalidate_mapping_enter(inode);
 	ret = nfs_invalidate_mapping(inode, mapping);
@@ -1350,6 +1314,53 @@ out:
 	return ret;
 }
 
+bool nfs_mapping_need_revalidate_inode(struct inode *inode)
+{
+	return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) ||
+		NFS_STALE(inode);
+}
+
+int nfs_revalidate_mapping_rcu(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long *bitlock = &nfsi->flags;
+	int ret = 0;
+
+	if (IS_SWAPFILE(inode))
+		goto out;
+	if (nfs_mapping_need_revalidate_inode(inode)) {
+		ret = -ECHILD;
+		goto out;
+	}
+	spin_lock(&inode->i_lock);
+	if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
+	    (nfsi->cache_validity & NFS_INO_INVALID_DATA))
+		ret = -ECHILD;
+	spin_unlock(&inode->i_lock);
+out:
+	return ret;
+}
+
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode: pointer to host inode
+ * @mapping: pointer to mapping
+ */
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+	/* swapfiles are not supposed to be shared. */
+	if (IS_SWAPFILE(inode))
+		return 0;
+
+	if (nfs_mapping_need_revalidate_inode(inode)) {
+		int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+		if (ret < 0)
+			return ret;
+	}
+
+	return nfs_clear_invalid_mapping(mapping);
+}
+
 static bool nfs_file_has_writers(struct nfs_inode *nfsi)
 {
 	struct inode *inode = &nfsi->vfs_inode;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 3cfcf219e96b..2c662857247a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -387,6 +387,7 @@ extern int nfs_open(struct inode *, struct file *);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
+extern int nfs_clear_invalid_mapping(struct address_space *mapping);
 extern bool nfs_mapping_need_revalidate_inode(struct inode *inode);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_revalidate_mapping_rcu(struct inode *inode);
-- 
cgit v1.2.3


From 71c36788b9253f086d09763b98804ed473e12a3b Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 22 Jan 2021 11:58:04 +0200
Subject: lib/zstd: convert constants to defines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These constants are really used internally by zstd and including
linux/zstd.h into users results in the following warnings:

In file included from fs/btrfs/zstd.c:19:
./include/linux/zstd.h:798:21: warning: ‘ZSTD_skippableHeaderSize’ defined but not used [-Wunused-const-variable=]
  798 | static const size_t ZSTD_skippableHeaderSize = 8;
      |                     ^~~~~~~~~~~~~~~~~~~~~~~~
./include/linux/zstd.h:796:21: warning: ‘ZSTD_frameHeaderSize_max’ defined but not used [-Wunused-const-variable=]
  796 | static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX;
      |                     ^~~~~~~~~~~~~~~~~~~~~~~~
./include/linux/zstd.h:795:21: warning: ‘ZSTD_frameHeaderSize_min’ defined but not used [-Wunused-const-variable=]
  795 | static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
      |                     ^~~~~~~~~~~~~~~~~~~~~~~~
./include/linux/zstd.h:794:21: warning: ‘ZSTD_frameHeaderSize_prefix’ defined but not used [-Wunused-const-variable=]
  794 | static const size_t ZSTD_frameHeaderSize_prefix = 5;

So fix those warnings by turning the constants into defines.

Reviewed-by: Nick Terrell <terrelln@fb.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/zstd.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index 249575e2485f..e87f78c9b19c 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -791,11 +791,11 @@ size_t ZSTD_DStreamOutSize(void);
 /* for static allocation */
 #define ZSTD_FRAMEHEADERSIZE_MAX 18
 #define ZSTD_FRAMEHEADERSIZE_MIN  6
-static const size_t ZSTD_frameHeaderSize_prefix = 5;
-static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
-static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX;
+#define ZSTD_frameHeaderSize_prefix 5
+#define ZSTD_frameHeaderSize_min ZSTD_FRAMEHEADERSIZE_MIN
+#define ZSTD_frameHeaderSize_max ZSTD_FRAMEHEADERSIZE_MAX
 /* magic number + skippable frame length */
-static const size_t ZSTD_skippableHeaderSize = 8;
+#define ZSTD_skippableHeaderSize 8
 
 
 /*-*************************************
-- 
cgit v1.2.3


From ae29333fa644679b96d88c9dd3afbef25cbac0f6 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Thu, 4 Feb 2021 19:21:40 +0900
Subject: block: add bio_add_zone_append_page

Add bio_add_zone_append_page(), a wrapper around bio_add_hw_page() which
is intended to be used by file systems that directly add pages to a bio
instead of using bio_iov_iter_get_pages().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 block/bio.c         | 33 +++++++++++++++++++++++++++++++++
 include/linux/bio.h |  2 ++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 1f2cc1fbe283..2f21d2958b60 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -851,6 +851,39 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio,
 }
 EXPORT_SYMBOL(bio_add_pc_page);
 
+/**
+ * bio_add_zone_append_page - attempt to add page to zone-append bio
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
+ *
+ * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
+ * for a zone-append request. This can fail for a number of reasons, such as the
+ * bio being full or the target block device is not a zoned block device or
+ * other limitations of the target block device. The target block device must
+ * allow bio's up to PAGE_SIZE, so it is always possible to add a single page
+ * to an empty bio.
+ *
+ * Returns: number of bytes added to the bio, or 0 in case of a failure.
+ */
+int bio_add_zone_append_page(struct bio *bio, struct page *page,
+			     unsigned int len, unsigned int offset)
+{
+	struct request_queue *q = bio->bi_disk->queue;
+	bool same_page = false;
+
+	if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
+		return 0;
+
+	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
+		return 0;
+
+	return bio_add_hw_page(q, bio, page, len, offset,
+			       queue_max_zone_append_sectors(q), &same_page);
+}
+EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
+
 /**
  * __bio_try_merge_page - try appending data to an existing bvec.
  * @bio: destination bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1edda614f7ce..de62911473bb 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -455,6 +455,8 @@ void bio_chain(struct bio *, struct bio *);
 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
+int bio_add_zone_append_page(struct bio *bio, struct page *page,
+			     unsigned int len, unsigned int offset);
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off, bool *same_page);
 void __bio_add_page(struct bio *bio, struct page *page,
-- 
cgit v1.2.3


From c3b0e880bbfafab6beed92b1ee6db2cdaf4bc54c Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@wdc.com>
Date: Thu, 4 Feb 2021 19:21:41 +0900
Subject: iomap: support REQ_OP_ZONE_APPEND

A ZONE_APPEND bio must follow hardware restrictions (e.g. not exceeding
max_zone_append_sectors) not to be split. bio_iov_iter_get_pages builds
such restricted bio using __bio_iov_append_get_pages if bio_op(bio) ==
REQ_OP_ZONE_APPEND.

To utilize it, we need to set the bio_op before calling
bio_iov_iter_get_pages(). This commit introduces IOMAP_F_ZONE_APPEND, so
that iomap user can set the flag to indicate they want REQ_OP_ZONE_APPEND
and restricted bio.

Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/iomap/direct-io.c  | 43 +++++++++++++++++++++++++++++++++++++------
 include/linux/iomap.h |  1 +
 2 files changed, 38 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 933f234d5bec..2273120d8ed7 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -201,6 +201,34 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 	iomap_dio_submit_bio(dio, iomap, bio, pos);
 }
 
+/*
+ * Figure out the bio's operation flags from the dio request, the
+ * mapping, and whether or not we want FUA.  Note that we can end up
+ * clearing the WRITE_FUA flag in the dio request.
+ */
+static inline unsigned int
+iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua)
+{
+	unsigned int opflags = REQ_SYNC | REQ_IDLE;
+
+	if (!(dio->flags & IOMAP_DIO_WRITE)) {
+		WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND);
+		return REQ_OP_READ;
+	}
+
+	if (iomap->flags & IOMAP_F_ZONE_APPEND)
+		opflags |= REQ_OP_ZONE_APPEND;
+	else
+		opflags |= REQ_OP_WRITE;
+
+	if (use_fua)
+		opflags |= REQ_FUA;
+	else
+		dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+
+	return opflags;
+}
+
 static loff_t
 iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		struct iomap_dio *dio, struct iomap *iomap)
@@ -208,6 +236,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
 	unsigned int fs_block_size = i_blocksize(inode), pad;
 	unsigned int align = iov_iter_alignment(dio->submit.iter);
+	unsigned int bio_opf;
 	struct bio *bio;
 	bool need_zeroout = false;
 	bool use_fua = false;
@@ -263,6 +292,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 			iomap_dio_zero(dio, iomap, pos - pad, pad);
 	}
 
+	/*
+	 * Set the operation flags early so that bio_iov_iter_get_pages
+	 * can set up the page vector appropriately for a ZONE_APPEND
+	 * operation.
+	 */
+	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+
 	do {
 		size_t n;
 		if (dio->error) {
@@ -278,6 +314,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		bio->bi_ioprio = dio->iocb->ki_ioprio;
 		bio->bi_private = dio;
 		bio->bi_end_io = iomap_dio_bio_end_io;
+		bio->bi_opf = bio_opf;
 
 		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
 		if (unlikely(ret)) {
@@ -293,14 +330,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 		n = bio->bi_iter.bi_size;
 		if (dio->flags & IOMAP_DIO_WRITE) {
-			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
-			if (use_fua)
-				bio->bi_opf |= REQ_FUA;
-			else
-				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
 			task_io_account_write(n);
 		} else {
-			bio->bi_opf = REQ_OP_READ;
 			if (dio->flags & IOMAP_DIO_DIRTY)
 				bio_set_pages_dirty(bio);
 		}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 5bd3cac4df9c..8ebb1fa6f3b7 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -55,6 +55,7 @@ struct vm_fault;
 #define IOMAP_F_SHARED		0x04
 #define IOMAP_F_MERGED		0x08
 #define IOMAP_F_BUFFER_HEAD	0x10
+#define IOMAP_F_ZONE_APPEND	0x20
 
 /*
  * Flags set by the core iomap code during operations:
-- 
cgit v1.2.3


From 3aa6bce9af0e25b735c9c1263739a5639a336ae8 Mon Sep 17 00:00:00 2001
From: Edwin Peer <edwin.peer@broadcom.com>
Date: Fri, 5 Feb 2021 17:37:32 -0800
Subject: net: watchdog: hold device global xmit lock during tx disable

Prevent netif_tx_disable() running concurrently with dev_watchdog() by
taking the device global xmit lock. Otherwise, the recommended:

	netif_carrier_off(dev);
	netif_tx_disable(dev);

driver shutdown sequence can happen after the watchdog has already
checked carrier, resulting in possible false alarms. This is because
netif_tx_lock() only sets the frozen bit without maintaining the locks
on the individual queues.

Fixes: c3f26a269c24 ("netdev: Fix lockdep warnings in multiqueue configurations.")
Signed-off-by: Edwin Peer <edwin.peer@broadcom.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 259be67644e3..5ff27c12ce68 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4352,6 +4352,7 @@ static inline void netif_tx_disable(struct net_device *dev)
 
 	local_bh_disable();
 	cpu = smp_processor_id();
+	spin_lock(&dev->tx_global_lock);
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 
@@ -4359,6 +4360,7 @@ static inline void netif_tx_disable(struct net_device *dev)
 		netif_tx_stop_queue(txq);
 		__netif_tx_unlock(txq);
 	}
+	spin_unlock(&dev->tx_global_lock);
 	local_bh_enable();
 }
 
-- 
cgit v1.2.3


From db72438c9319cfd37e3c237a7754ca862ae12d63 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Tue, 2 Feb 2021 09:13:09 +0200
Subject: RDMA/mlx5: Cleanup the synchronize_srcu() from the ODP flow

Cleanup the synchronize_srcu() from the ODP flow as it was found to be a
very heavy time consumer as part of dereg_mr.

For example de-registration of 10000 ODP MRs each with size of 2M hugepage
took 19.6 sec comparing de-registration of same number of non ODP MRs that
took 172 ms.

The new locking scheme uses the wait_event() mechanism which follows the
use count of the MR instead of using synchronize_srcu().

By that change, the time required for the above test took 95 ms which is
even better than the non ODP flow.

Once fully dropped the srcu usage, had to come with a lock to protect the
XA access.

As part of using the above mechanism we could also clean the
num_deferred_work stuff and follow the use count instead.

Link: https://lore.kernel.org/r/20210202071309.2057998-1-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/mlx5/devx.c            |  13 +-
 drivers/infiniband/hw/mlx5/main.c            |   5 -
 drivers/infiniband/hw/mlx5/mlx5_ib.h         |  31 +++-
 drivers/infiniband/hw/mlx5/mr.c              |  26 ++--
 drivers/infiniband/hw/mlx5/odp.c             | 224 ++++++++++-----------------
 drivers/net/ethernet/mellanox/mlx5/core/mr.c |   1 +
 include/linux/mlx5/driver.h                  |   2 +
 7 files changed, 127 insertions(+), 175 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index e6d2709eb09e..e39661db0d2f 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1310,9 +1310,9 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj,
 	mkey->size = MLX5_GET64(mkc, mkc, len);
 	mkey->pd = MLX5_GET(mkc, mkc, pd);
 	devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
+	init_waitqueue_head(&mkey->wait);
 
-	return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mkey->key), mkey,
-			       GFP_KERNEL));
+	return mlx5r_store_odp_mkey(dev, mkey);
 }
 
 static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
@@ -1385,16 +1385,15 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
 	int ret;
 
 	dev = mlx5_udata_to_mdev(&attrs->driver_udata);
-	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY &&
+	    xa_erase(&obj->ib_dev->odp_mkeys,
+		     mlx5_base_mkey(obj->devx_mr.mmkey.key)))
 		/*
 		 * The pagefault_single_data_segment() does commands against
 		 * the mmkey, we must wait for that to stop before freeing the
 		 * mkey, as another allocation could get the same mkey #.
 		 */
-		xa_erase(&obj->ib_dev->odp_mkeys,
-			 mlx5_base_mkey(obj->devx_mr.mmkey.key));
-		synchronize_srcu(&dev->odp_srcu);
-	}
+		mlx5r_deref_wait_odp_mkey(&obj->devx_mr.mmkey);
 
 	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
 		ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index ce4a9eba53f9..93dd568a5397 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3869,7 +3869,6 @@ static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_cleanup_multiport_master(dev);
 	WARN_ON(!xa_empty(&dev->odp_mkeys));
-	cleanup_srcu_struct(&dev->odp_srcu);
 	mutex_destroy(&dev->cap_mask_mutex);
 	WARN_ON(!xa_empty(&dev->sig_mrs));
 	WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
@@ -3914,10 +3913,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
 
 	dev->ib_dev.num_comp_vectors    = mlx5_comp_vectors_count(mdev);
 
-	err = init_srcu_struct(&dev->odp_srcu);
-	if (err)
-		goto err_mp;
-
 	mutex_init(&dev->cap_mask_mutex);
 	INIT_LIST_HEAD(&dev->qp_list);
 	spin_lock_init(&dev->reset_flow_resource_lock);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index ab52083634e6..88cc26e008fc 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -684,11 +684,8 @@ struct mlx5_ib_mr {
 	u64			pi_iova;
 
 	/* For ODP and implicit */
-	atomic_t		num_deferred_work;
-	wait_queue_head_t       q_deferred_work;
 	struct xarray		implicit_children;
 	union {
-		struct rcu_head rcu;
 		struct list_head elm;
 		struct work_struct work;
 	} odp_destroy;
@@ -1068,11 +1065,6 @@ struct mlx5_ib_dev {
 	u64			odp_max_size;
 	struct mlx5_ib_pf_eq	odp_pf_eq;
 
-	/*
-	 * Sleepable RCU that prevents destruction of MRs while they are still
-	 * being used by a page fault handler.
-	 */
-	struct srcu_struct      odp_srcu;
 	struct xarray		odp_mkeys;
 
 	u32			null_mkey;
@@ -1599,6 +1591,29 @@ static inline bool mlx5_ib_can_reconfig_with_umr(struct mlx5_ib_dev *dev,
 	return true;
 }
 
+static inline int mlx5r_store_odp_mkey(struct mlx5_ib_dev *dev,
+				       struct mlx5_core_mkey *mmkey)
+{
+	refcount_set(&mmkey->usecount, 1);
+
+	return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mmkey->key),
+			       mmkey, GFP_KERNEL));
+}
+
+/* deref an mkey that can participate in ODP flow */
+static inline void mlx5r_deref_odp_mkey(struct mlx5_core_mkey *mmkey)
+{
+	if (refcount_dec_and_test(&mmkey->usecount))
+		wake_up(&mmkey->wait);
+}
+
+/* deref an mkey that can participate in ODP flow and wait for relese */
+static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_core_mkey *mmkey)
+{
+	mlx5r_deref_odp_mkey(mmkey);
+	wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0);
+}
+
 int mlx5_ib_test_wc(struct mlx5_ib_dev *dev);
 
 static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev)
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index a63ef7c66e38..db05b0e0a8d7 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -158,6 +158,7 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	mr->mmkey.type = MLX5_MKEY_MR;
 	mr->mmkey.key |= mlx5_idx_to_mkey(
 		MLX5_GET(create_mkey_out, mr->out, mkey_index));
+	init_waitqueue_head(&mr->mmkey.wait);
 
 	WRITE_ONCE(dev->cache.last_add, jiffies);
 
@@ -1551,10 +1552,7 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
 	}
 
 	odp->private = mr;
-	init_waitqueue_head(&mr->q_deferred_work);
-	atomic_set(&mr->num_deferred_work, 0);
-	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
-			      &mr->mmkey, GFP_KERNEL));
+	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
 	if (err)
 		goto err_dereg_mr;
 
@@ -1651,10 +1649,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
 
 	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
 	umem_dmabuf->private = mr;
-	init_waitqueue_head(&mr->q_deferred_work);
-	atomic_set(&mr->num_deferred_work, 0);
-	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
-			      &mr->mmkey, GFP_KERNEL));
+	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
 	if (err)
 		goto err_dereg_mr;
 
@@ -2330,9 +2325,7 @@ int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
 	}
 
 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		err = xa_err(xa_store(&dev->odp_mkeys,
-				      mlx5_base_mkey(mw->mmkey.key), &mw->mmkey,
-				      GFP_KERNEL));
+		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
 		if (err)
 			goto free_mkey;
 	}
@@ -2352,14 +2345,13 @@ int mlx5_ib_dealloc_mw(struct ib_mw *mw)
 	struct mlx5_ib_dev *dev = to_mdev(mw->device);
 	struct mlx5_ib_mw *mmw = to_mmw(mw);
 
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key));
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
+	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
 		/*
-		 * pagefault_single_data_segment() may be accessing mmw under
-		 * SRCU if the user bound an ODP MR to this MW.
+		 * pagefault_single_data_segment() may be accessing mmw
+		 * if the user bound an ODP MR to this MW.
 		 */
-		synchronize_srcu(&dev->odp_srcu);
-	}
+		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
 
 	return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
 }
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index e77b1db73893..374698186662 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -115,7 +115,6 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	 * xarray would be protected by the umem_mutex, however that is not
 	 * possible. Instead this uses a weaker update-then-lock pattern:
 	 *
-	 *  srcu_read_lock()
 	 *    xa_store()
 	 *    mutex_lock(umem_mutex)
 	 *     mlx5_ib_update_xlt()
@@ -126,12 +125,9 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	 * before destroying.
 	 *
 	 * The umem_mutex provides the acquire/release semantic needed to make
-	 * the xa_store() visible to a racing thread. While SRCU is not
-	 * technically required, using it gives consistent use of the SRCU
-	 * locking around the xarray.
+	 * the xa_store() visible to a racing thread.
 	 */
 	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
-	lockdep_assert_held(&mr_to_mdev(imr)->odp_srcu);
 
 	for (; pklm != end; pklm++, idx++) {
 		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
@@ -207,8 +203,8 @@ static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
 }
 
 /*
- * This must be called after the mr has been removed from implicit_children
- * and the SRCU synchronized.  NOTE: The MR does not necessarily have to be
+ * This must be called after the mr has been removed from implicit_children.
+ * NOTE: The MR does not necessarily have to be
  * empty here, parallel page faults could have raced with the free process and
  * added pages to it.
  */
@@ -218,19 +214,15 @@ static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
 	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
-	int srcu_key;
 
-	/* implicit_child_mr's are not allowed to have deferred work */
-	WARN_ON(atomic_read(&mr->num_deferred_work));
+	mlx5r_deref_wait_odp_mkey(&mr->mmkey);
 
 	if (need_imr_xlt) {
-		srcu_key = srcu_read_lock(&mr_to_mdev(mr)->odp_srcu);
 		mutex_lock(&odp_imr->umem_mutex);
 		mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ATOMIC);
 		mutex_unlock(&odp_imr->umem_mutex);
-		srcu_read_unlock(&mr_to_mdev(mr)->odp_srcu, srcu_key);
 	}
 
 	dma_fence_odp_mr(mr);
@@ -238,26 +230,16 @@ static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
 	mr->parent = NULL;
 	mlx5_mr_cache_free(mr_to_mdev(mr), mr);
 	ib_umem_odp_release(odp);
-	if (atomic_dec_and_test(&imr->num_deferred_work))
-		wake_up(&imr->q_deferred_work);
 }
 
 static void free_implicit_child_mr_work(struct work_struct *work)
 {
 	struct mlx5_ib_mr *mr =
 		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+	struct mlx5_ib_mr *imr = mr->parent;
 
 	free_implicit_child_mr(mr, true);
-}
-
-static void free_implicit_child_mr_rcu(struct rcu_head *head)
-{
-	struct mlx5_ib_mr *mr =
-		container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
-
-	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
-	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
-	queue_work(system_unbound_wq, &mr->odp_destroy.work);
+	mlx5r_deref_odp_mkey(&imr->mmkey);
 }
 
 static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
@@ -266,21 +248,14 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
 	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 	struct mlx5_ib_mr *imr = mr->parent;
 
-	xa_lock(&imr->implicit_children);
-	/*
-	 * This can race with mlx5_ib_free_implicit_mr(), the first one to
-	 * reach the xa lock wins the race and destroys the MR.
-	 */
-	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
-	    mr)
-		goto out_unlock;
+	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
+		return;
 
-	atomic_inc(&imr->num_deferred_work);
-	call_srcu(&mr_to_mdev(mr)->odp_srcu, &mr->odp_destroy.rcu,
-		  free_implicit_child_mr_rcu);
+	xa_erase(&imr->implicit_children, idx);
 
-out_unlock:
-	xa_unlock(&imr->implicit_children);
+	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
+	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+	queue_work(system_unbound_wq, &mr->odp_destroy.work);
 }
 
 static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
@@ -492,6 +467,12 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	mr->parent = imr;
 	odp->private = mr;
 
+	/*
+	 * First refcount is owned by the xarray and second refconut
+	 * is returned to the caller.
+	 */
+	refcount_set(&mr->mmkey.usecount, 2);
+
 	err = mlx5_ib_update_xlt(mr, 0,
 				 MLX5_IMR_MTT_ENTRIES,
 				 PAGE_SHIFT,
@@ -502,27 +483,28 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 		goto out_mr;
 	}
 
-	/*
-	 * Once the store to either xarray completes any error unwind has to
-	 * use synchronize_srcu(). Avoid this with xa_reserve()
-	 */
-	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
-			 GFP_KERNEL);
+	xa_lock(&imr->implicit_children);
+	ret = __xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+			   GFP_KERNEL);
 	if (unlikely(ret)) {
 		if (xa_is_err(ret)) {
 			ret = ERR_PTR(xa_err(ret));
-			goto out_mr;
+			goto out_lock;
 		}
 		/*
 		 * Another thread beat us to creating the child mr, use
 		 * theirs.
 		 */
-		goto out_mr;
+		refcount_inc(&ret->mmkey.usecount);
+		goto out_lock;
 	}
+	xa_unlock(&imr->implicit_children);
 
 	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
 	return mr;
 
+out_lock:
+	xa_unlock(&imr->implicit_children);
 out_mr:
 	mlx5_mr_cache_free(mr_to_mdev(imr), mr);
 out_umem:
@@ -561,8 +543,6 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	imr->ibmr.device = &dev->ib_dev;
 	imr->umem = &umem_odp->umem;
 	imr->is_odp_implicit = true;
-	atomic_set(&imr->num_deferred_work, 0);
-	init_waitqueue_head(&imr->q_deferred_work);
 	xa_init(&imr->implicit_children);
 
 	err = mlx5_ib_update_xlt(imr, 0,
@@ -574,8 +554,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	if (err)
 		goto out_mr;
 
-	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
-			      &imr->mmkey, GFP_KERNEL));
+	err = mlx5r_store_odp_mkey(dev, &imr->mmkey);
 	if (err)
 		goto out_mr;
 
@@ -593,51 +572,24 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 {
 	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
 	struct mlx5_ib_dev *dev = mr_to_mdev(imr);
-	struct list_head destroy_list;
 	struct mlx5_ib_mr *mtt;
-	struct mlx5_ib_mr *tmp;
 	unsigned long idx;
 
-	INIT_LIST_HEAD(&destroy_list);
-
 	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
-	/*
-	 * This stops the SRCU protected page fault path from touching either
-	 * the imr or any children. The page fault path can only reach the
-	 * children xarray via the imr.
-	 */
-	synchronize_srcu(&dev->odp_srcu);
-
 	/*
 	 * All work on the prefetch list must be completed, xa_erase() prevented
 	 * new work from being created.
 	 */
-	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
-
+	mlx5r_deref_wait_odp_mkey(&imr->mmkey);
 	/*
 	 * At this point it is forbidden for any other thread to enter
 	 * pagefault_mr() on this imr. It is already forbidden to call
 	 * pagefault_mr() on an implicit child. Due to this additions to
 	 * implicit_children are prevented.
+	 * In addition, any new call to destroy_unused_implicit_child_mr()
+	 * may return immediately.
 	 */
 
-	/*
-	 * Block destroy_unused_implicit_child_mr() from incrementing
-	 * num_deferred_work.
-	 */
-	xa_lock(&imr->implicit_children);
-	xa_for_each (&imr->implicit_children, idx, mtt) {
-		__xa_erase(&imr->implicit_children, idx);
-		list_add(&mtt->odp_destroy.elm, &destroy_list);
-	}
-	xa_unlock(&imr->implicit_children);
-
-	/*
-	 * Wait for any concurrent destroy_unused_implicit_child_mr() to
-	 * complete.
-	 */
-	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
-
 	/*
 	 * Fence the imr before we destroy the children. This allows us to
 	 * skip updating the XLT of the imr during destroy of the child mkey
@@ -645,8 +597,10 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 	 */
 	mlx5_mr_cache_invalidate(imr);
 
-	list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
+	xa_for_each(&imr->implicit_children, idx, mtt) {
+		xa_erase(&imr->implicit_children, idx);
 		free_implicit_child_mr(mtt, false);
+	}
 
 	mlx5_mr_cache_free(dev, imr);
 	ib_umem_odp_release(odp_imr);
@@ -665,9 +619,7 @@ void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
 	xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
 
 	/* Wait for all running page-fault handlers to finish. */
-	synchronize_srcu(&mr_to_mdev(mr)->odp_srcu);
-
-	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
+	mlx5r_deref_wait_odp_mkey(&mr->mmkey);
 
 	dma_fence_odp_mr(mr);
 }
@@ -686,10 +638,7 @@ void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr)
 	/* Prevent new page faults and prefetch requests from succeeding */
 	xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
 
-	/* Wait for all running page-fault handlers to finish. */
-	synchronize_srcu(&mr_to_mdev(mr)->odp_srcu);
-
-	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
+	mlx5r_deref_wait_odp_mkey(&mr->mmkey);
 
 	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
 	mlx5_mr_cache_invalidate(mr);
@@ -780,8 +729,10 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
 		struct mlx5_ib_mr *mtt;
 		u64 len;
 
+		xa_lock(&imr->implicit_children);
 		mtt = xa_load(&imr->implicit_children, idx);
 		if (unlikely(!mtt)) {
+			xa_unlock(&imr->implicit_children);
 			mtt = implicit_get_child_mr(imr, idx);
 			if (IS_ERR(mtt)) {
 				ret = PTR_ERR(mtt);
@@ -789,6 +740,9 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
 			}
 			upd_start_idx = min(upd_start_idx, idx);
 			upd_len = idx - upd_start_idx + 1;
+		} else {
+			refcount_inc(&mtt->mmkey.usecount);
+			xa_unlock(&imr->implicit_children);
 		}
 
 		umem_odp = to_ib_umem_odp(mtt->umem);
@@ -797,6 +751,9 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
 
 		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
 					bytes_mapped, flags);
+
+		mlx5r_deref_odp_mkey(&mtt->mmkey);
+
 		if (ret < 0)
 			goto out;
 		user_va += len;
@@ -888,7 +845,6 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 
-	lockdep_assert_held(&mr_to_mdev(mr)->odp_srcu);
 	if (unlikely(io_virt < mr->mmkey.iova))
 		return -EFAULT;
 
@@ -980,7 +936,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 					 u32 *bytes_committed,
 					 u32 *bytes_mapped)
 {
-	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
+	int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
 	struct pf_frame *head = NULL, *frame;
 	struct mlx5_core_mkey *mmkey;
 	struct mlx5_ib_mr *mr;
@@ -989,14 +945,14 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 	size_t offset;
 	int ndescs;
 
-	srcu_key = srcu_read_lock(&dev->odp_srcu);
-
 	io_virt += *bytes_committed;
 	bcnt -= *bytes_committed;
 
 next_mr:
+	xa_lock(&dev->odp_mkeys);
 	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
 	if (!mmkey) {
+		xa_unlock(&dev->odp_mkeys);
 		mlx5_ib_dbg(
 			dev,
 			"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
@@ -1009,12 +965,15 @@ next_mr:
 		 * faulted.
 		 */
 		ret = 0;
-		goto srcu_unlock;
+		goto end;
 	}
+	refcount_inc(&mmkey->usecount);
+	xa_unlock(&dev->odp_mkeys);
+
 	if (!mkey_is_eq(mmkey, key)) {
 		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
 		ret = -EFAULT;
-		goto srcu_unlock;
+		goto end;
 	}
 
 	switch (mmkey->type) {
@@ -1023,7 +982,7 @@ next_mr:
 
 		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
 		if (ret < 0)
-			goto srcu_unlock;
+			goto end;
 
 		mlx5_update_odp_stats(mr, faults, ret);
 
@@ -1038,7 +997,7 @@ next_mr:
 		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
 			mlx5_ib_dbg(dev, "indirection level exceeded\n");
 			ret = -EFAULT;
-			goto srcu_unlock;
+			goto end;
 		}
 
 		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
@@ -1049,7 +1008,7 @@ next_mr:
 			out = kzalloc(outlen, GFP_KERNEL);
 			if (!out) {
 				ret = -ENOMEM;
-				goto srcu_unlock;
+				goto end;
 			}
 			cur_outlen = outlen;
 		}
@@ -1059,7 +1018,7 @@ next_mr:
 
 		ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
 		if (ret)
-			goto srcu_unlock;
+			goto end;
 
 		offset = io_virt - MLX5_GET64(query_mkey_out, out,
 					      memory_key_mkey_entry.start_addr);
@@ -1073,7 +1032,7 @@ next_mr:
 			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
 			if (!frame) {
 				ret = -ENOMEM;
-				goto srcu_unlock;
+				goto end;
 			}
 
 			frame->key = be32_to_cpu(pklm->key);
@@ -1092,7 +1051,7 @@ next_mr:
 	default:
 		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
 		ret = -EFAULT;
-		goto srcu_unlock;
+		goto end;
 	}
 
 	if (head) {
@@ -1105,10 +1064,13 @@ next_mr:
 		depth = frame->depth;
 		kfree(frame);
 
+		mlx5r_deref_odp_mkey(mmkey);
 		goto next_mr;
 	}
 
-srcu_unlock:
+end:
+	if (mmkey)
+		mlx5r_deref_odp_mkey(mmkey);
 	while (head) {
 		frame = head;
 		head = frame->next;
@@ -1116,7 +1078,6 @@ srcu_unlock:
 	}
 	kfree(out);
 
-	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 	*bytes_committed = 0;
 	return ret ? ret : npages;
 }
@@ -1824,8 +1785,8 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
 	u32 i;
 
 	for (i = 0; i < work->num_sge; ++i)
-		if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
-			wake_up(&work->frags[i].mr->q_deferred_work);
+		mlx5r_deref_odp_mkey(&work->frags[i].mr->mmkey);
+
 	kvfree(work);
 }
 
@@ -1835,24 +1796,30 @@ get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_core_mkey *mmkey;
-	struct mlx5_ib_mr *mr;
-
-	lockdep_assert_held(&dev->odp_srcu);
+	struct mlx5_ib_mr *mr = NULL;
 
+	xa_lock(&dev->odp_mkeys);
 	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
 	if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
-		return NULL;
+		goto end;
 
 	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
 
-	if (mr->ibmr.pd != pd)
-		return NULL;
+	if (mr->ibmr.pd != pd) {
+		mr = NULL;
+		goto end;
+	}
 
 	/* prefetch with write-access must be supported by the MR */
 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
-	    !mr->umem->writable)
-		return NULL;
+	    !mr->umem->writable) {
+		mr = NULL;
+		goto end;
+	}
 
+	refcount_inc(&mmkey->usecount);
+end:
+	xa_unlock(&dev->odp_mkeys);
 	return mr;
 }
 
@@ -1860,17 +1827,12 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
 {
 	struct prefetch_mr_work *work =
 		container_of(w, struct prefetch_mr_work, work);
-	struct mlx5_ib_dev *dev;
 	u32 bytes_mapped = 0;
-	int srcu_key;
 	int ret;
 	u32 i;
 
 	/* We rely on IB/core that work is executed if we have num_sge != 0 only. */
 	WARN_ON(!work->num_sge);
-	dev = mr_to_mdev(work->frags[0].mr);
-	/* SRCU should be held when calling to mlx5_odp_populate_xlt() */
-	srcu_key = srcu_read_lock(&dev->odp_srcu);
 	for (i = 0; i < work->num_sge; ++i) {
 		ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
 				   work->frags[i].length, &bytes_mapped,
@@ -1879,7 +1841,6 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
 			continue;
 		mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
 	}
-	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 
 	destroy_prefetch_work(work);
 }
@@ -1903,9 +1864,6 @@ static bool init_prefetch_work(struct ib_pd *pd,
 			work->num_sge = i;
 			return false;
 		}
-
-		/* Keep the MR pointer will valid outside the SRCU */
-		atomic_inc(&work->frags[i].mr->num_deferred_work);
 	}
 	work->num_sge = num_sge;
 	return true;
@@ -1916,42 +1874,35 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
 				    u32 pf_flags, struct ib_sge *sg_list,
 				    u32 num_sge)
 {
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	u32 bytes_mapped = 0;
-	int srcu_key;
 	int ret = 0;
 	u32 i;
 
-	srcu_key = srcu_read_lock(&dev->odp_srcu);
 	for (i = 0; i < num_sge; ++i) {
 		struct mlx5_ib_mr *mr;
 
 		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
-		if (!mr) {
-			ret = -ENOENT;
-			goto out;
-		}
+		if (!mr)
+			return -ENOENT;
 		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
 				   &bytes_mapped, pf_flags);
-		if (ret < 0)
-			goto out;
+		if (ret < 0) {
+			mlx5r_deref_odp_mkey(&mr->mmkey);
+			return ret;
+		}
 		mlx5_update_odp_stats(mr, prefetch, ret);
+		mlx5r_deref_odp_mkey(&mr->mmkey);
 	}
-	ret = 0;
 
-out:
-	srcu_read_unlock(&dev->odp_srcu, srcu_key);
-	return ret;
+	return 0;
 }
 
 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 			       enum ib_uverbs_advise_mr_advice advice,
 			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
 {
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	u32 pf_flags = 0;
 	struct prefetch_mr_work *work;
-	int srcu_key;
 
 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
 		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
@@ -1967,13 +1918,10 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 	if (!work)
 		return -ENOMEM;
 
-	srcu_key = srcu_read_lock(&dev->odp_srcu);
 	if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
-		srcu_read_unlock(&dev->odp_srcu, srcu_key);
 		destroy_prefetch_work(work);
 		return -EINVAL;
 	}
 	queue_work(system_unbound_wq, &work->work);
-	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 	return 0;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
index 9eb51f06d3ae..50af84e76fb6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
@@ -56,6 +56,7 @@ int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
 	mkey->size = MLX5_GET64(mkc, mkc, len);
 	mkey->key |= mlx5_idx_to_mkey(mkey_index);
 	mkey->pd = MLX5_GET(mkc, mkc, pd);
+	init_waitqueue_head(&mkey->wait);
 
 	mlx5_core_dbg(dev, "out 0x%x, mkey 0x%x\n", mkey_index, mkey->key);
 	return 0;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 11558c2e99f0..b0a59a18a708 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -366,6 +366,8 @@ struct mlx5_core_mkey {
 	u32			key;
 	u32			pd;
 	u32			type;
+	struct wait_queue_head wait;
+	refcount_t usecount;
 };
 
 #define MLX5_24BIT_MASK		((1 << 24) - 1)
-- 
cgit v1.2.3


From ee7294ba49bf8559b560b21629ed8153082c25cf Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 20 Jan 2021 14:10:26 +0100
Subject: clk: remove u300 driver

The ST-Ericsson U300 platform is getting removed, so this driver is no
longer needed.

Cc: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20210120131026.1721788-5-arnd@kernel.org
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 .../bindings/clock/ste-u300-syscon-clock.txt       |   80 --
 drivers/clk/Makefile                               |    1 -
 drivers/clk/clk-u300.c                             | 1199 --------------------
 include/linux/platform_data/clk-u300.h             |    1 -
 4 files changed, 1281 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/clock/ste-u300-syscon-clock.txt
 delete mode 100644 drivers/clk/clk-u300.c
 delete mode 100644 include/linux/platform_data/clk-u300.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/clock/ste-u300-syscon-clock.txt b/Documentation/devicetree/bindings/clock/ste-u300-syscon-clock.txt
deleted file mode 100644
index 7cafcb98ead7..000000000000
--- a/Documentation/devicetree/bindings/clock/ste-u300-syscon-clock.txt
+++ /dev/null
@@ -1,80 +0,0 @@
-Clock bindings for ST-Ericsson U300 System Controller Clocks
-
-Bindings for the gated system controller clocks:
-
-Required properties:
-- compatible: must be "stericsson,u300-syscon-clk"
-- #clock-cells: must be <0>
-- clock-type: specifies the type of clock:
-  0 = slow clock
-  1 = fast clock
-  2 = rest/remaining clock
-- clock-id: specifies the clock in the type range
-
-Optional properties:
-- clocks: parent clock(s)
-
-The available clocks per type are as follows:
-
-Type:  ID:   Clock:
--------------------
-0      0     Slow peripheral bridge clock
-0      1     UART0 clock
-0      4     GPIO clock
-0      6     RTC clock
-0      7     Application timer clock
-0      8     Access timer clock
-
-1      0     Fast peripheral bridge clock
-1      1     I2C bus 0 clock
-1      2     I2C bus 1 clock
-1      5     MMC interface peripheral (silicon) clock
-1      6     SPI clock
-
-2      3     CPU clock
-2      4     DMA controller clock
-2      5     External Memory Interface (EMIF) clock
-2      6     NAND flask interface clock
-2      8     XGAM graphics engine clock
-2      9     Shared External Memory Interface (SEMI) clock
-2      10    AHB Subsystem Bridge clock
-2      12    Interrupt controller clock
-
-Example:
-
-gpio_clk: gpio_clk@13M {
-	#clock-cells = <0>;
-	compatible = "stericsson,u300-syscon-clk";
-	clock-type = <0>; /* Slow */
-	clock-id = <4>;
-	clocks = <&slow_clk>;
-};
-
-gpio: gpio@c0016000 {
-	compatible = "stericsson,gpio-coh901";
-	(...)
-	clocks = <&gpio_clk>;
-};
-
-
-Bindings for the MMC/SD card clock:
-
-Required properties:
-- compatible: must be "stericsson,u300-syscon-mclk"
-- #clock-cells: must be <0>
-
-Optional properties:
-- clocks: parent clock(s)
-
-mmc_mclk: mmc_mclk {
-	#clock-cells = <0>;
-	compatible = "stericsson,u300-syscon-mclk";
-	clocks = <&mmc_pclk>;
-};
-
-mmcsd: mmcsd@c0001000 {
-	compatible = "arm,pl18x", "arm,primecell";
-	clocks = <&mmc_pclk>, <&mmc_mclk>;
-	clock-names = "apb_pclk", "mclk";
-	(...)
-};
diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile
index 663414cef29a..2e4fc98ca1db 100644
--- a/drivers/clk/Makefile
+++ b/drivers/clk/Makefile
@@ -63,7 +63,6 @@ obj-$(CONFIG_COMMON_CLK_STM32F)		+= clk-stm32f4.o
 obj-$(CONFIG_COMMON_CLK_STM32H7)	+= clk-stm32h7.o
 obj-$(CONFIG_COMMON_CLK_STM32MP157)	+= clk-stm32mp1.o
 obj-$(CONFIG_CLK_TWL6040)		+= clk-twl6040.o
-obj-$(CONFIG_ARCH_U300)			+= clk-u300.o
 obj-$(CONFIG_ARCH_VT8500)		+= clk-vt8500.o
 obj-$(CONFIG_COMMON_CLK_VC5)		+= clk-versaclock5.o
 obj-$(CONFIG_COMMON_CLK_WM831X)		+= clk-wm831x.o
diff --git a/drivers/clk/clk-u300.c b/drivers/clk/clk-u300.c
deleted file mode 100644
index e228c07c4c6e..000000000000
--- a/drivers/clk/clk-u300.c
+++ /dev/null
@@ -1,1199 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * U300 clock implementation
- * Copyright (C) 2007-2012 ST-Ericsson AB
- * Author: Linus Walleij <linus.walleij@stericsson.com>
- * Author: Jonas Aaberg <jonas.aberg@stericsson.com>
- */
-#include <linux/clkdev.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/io.h>
-#include <linux/clk-provider.h>
-#include <linux/spinlock.h>
-#include <linux/of.h>
-#include <linux/platform_data/clk-u300.h>
-
-/* APP side SYSCON registers */
-/* CLK Control Register 16bit (R/W) */
-#define U300_SYSCON_CCR						(0x0000)
-#define U300_SYSCON_CCR_I2S1_USE_VCXO				(0x0040)
-#define U300_SYSCON_CCR_I2S0_USE_VCXO				(0x0020)
-#define U300_SYSCON_CCR_TURN_VCXO_ON				(0x0008)
-#define U300_SYSCON_CCR_CLKING_PERFORMANCE_MASK			(0x0007)
-#define U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW_POWER		(0x04)
-#define U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW			(0x03)
-#define U300_SYSCON_CCR_CLKING_PERFORMANCE_INTERMEDIATE		(0x02)
-#define U300_SYSCON_CCR_CLKING_PERFORMANCE_HIGH			(0x01)
-#define U300_SYSCON_CCR_CLKING_PERFORMANCE_BEST			(0x00)
-/* CLK Status Register 16bit (R/W) */
-#define U300_SYSCON_CSR						(0x0004)
-#define U300_SYSCON_CSR_PLL208_LOCK_IND				(0x0002)
-#define U300_SYSCON_CSR_PLL13_LOCK_IND				(0x0001)
-/* Reset lines for SLOW devices 16bit (R/W) */
-#define U300_SYSCON_RSR						(0x0014)
-#define U300_SYSCON_RSR_PPM_RESET_EN				(0x0200)
-#define U300_SYSCON_RSR_ACC_TMR_RESET_EN			(0x0100)
-#define U300_SYSCON_RSR_APP_TMR_RESET_EN			(0x0080)
-#define U300_SYSCON_RSR_RTC_RESET_EN				(0x0040)
-#define U300_SYSCON_RSR_KEYPAD_RESET_EN				(0x0020)
-#define U300_SYSCON_RSR_GPIO_RESET_EN				(0x0010)
-#define U300_SYSCON_RSR_EH_RESET_EN				(0x0008)
-#define U300_SYSCON_RSR_BTR_RESET_EN				(0x0004)
-#define U300_SYSCON_RSR_UART_RESET_EN				(0x0002)
-#define U300_SYSCON_RSR_SLOW_BRIDGE_RESET_EN			(0x0001)
-/* Reset lines for FAST devices 16bit (R/W) */
-#define U300_SYSCON_RFR						(0x0018)
-#define U300_SYSCON_RFR_UART1_RESET_ENABLE			(0x0080)
-#define U300_SYSCON_RFR_SPI_RESET_ENABLE			(0x0040)
-#define U300_SYSCON_RFR_MMC_RESET_ENABLE			(0x0020)
-#define U300_SYSCON_RFR_PCM_I2S1_RESET_ENABLE			(0x0010)
-#define U300_SYSCON_RFR_PCM_I2S0_RESET_ENABLE			(0x0008)
-#define U300_SYSCON_RFR_I2C1_RESET_ENABLE			(0x0004)
-#define U300_SYSCON_RFR_I2C0_RESET_ENABLE			(0x0002)
-#define U300_SYSCON_RFR_FAST_BRIDGE_RESET_ENABLE		(0x0001)
-/* Reset lines for the rest of the peripherals 16bit (R/W) */
-#define U300_SYSCON_RRR						(0x001c)
-#define U300_SYSCON_RRR_CDS_RESET_EN				(0x4000)
-#define U300_SYSCON_RRR_ISP_RESET_EN				(0x2000)
-#define U300_SYSCON_RRR_INTCON_RESET_EN				(0x1000)
-#define U300_SYSCON_RRR_MSPRO_RESET_EN				(0x0800)
-#define U300_SYSCON_RRR_XGAM_RESET_EN				(0x0100)
-#define U300_SYSCON_RRR_XGAM_VC_SYNC_RESET_EN			(0x0080)
-#define U300_SYSCON_RRR_NANDIF_RESET_EN				(0x0040)
-#define U300_SYSCON_RRR_EMIF_RESET_EN				(0x0020)
-#define U300_SYSCON_RRR_DMAC_RESET_EN				(0x0010)
-#define U300_SYSCON_RRR_CPU_RESET_EN				(0x0008)
-#define U300_SYSCON_RRR_APEX_RESET_EN				(0x0004)
-#define U300_SYSCON_RRR_AHB_RESET_EN				(0x0002)
-#define U300_SYSCON_RRR_AAIF_RESET_EN				(0x0001)
-/* Clock enable for SLOW peripherals 16bit (R/W) */
-#define U300_SYSCON_CESR					(0x0020)
-#define U300_SYSCON_CESR_PPM_CLK_EN				(0x0200)
-#define U300_SYSCON_CESR_ACC_TMR_CLK_EN				(0x0100)
-#define U300_SYSCON_CESR_APP_TMR_CLK_EN				(0x0080)
-#define U300_SYSCON_CESR_KEYPAD_CLK_EN				(0x0040)
-#define U300_SYSCON_CESR_GPIO_CLK_EN				(0x0010)
-#define U300_SYSCON_CESR_EH_CLK_EN				(0x0008)
-#define U300_SYSCON_CESR_BTR_CLK_EN				(0x0004)
-#define U300_SYSCON_CESR_UART_CLK_EN				(0x0002)
-#define U300_SYSCON_CESR_SLOW_BRIDGE_CLK_EN			(0x0001)
-/* Clock enable for FAST peripherals 16bit (R/W) */
-#define U300_SYSCON_CEFR					(0x0024)
-#define U300_SYSCON_CEFR_UART1_CLK_EN				(0x0200)
-#define U300_SYSCON_CEFR_I2S1_CORE_CLK_EN			(0x0100)
-#define U300_SYSCON_CEFR_I2S0_CORE_CLK_EN			(0x0080)
-#define U300_SYSCON_CEFR_SPI_CLK_EN				(0x0040)
-#define U300_SYSCON_CEFR_MMC_CLK_EN				(0x0020)
-#define U300_SYSCON_CEFR_I2S1_CLK_EN				(0x0010)
-#define U300_SYSCON_CEFR_I2S0_CLK_EN				(0x0008)
-#define U300_SYSCON_CEFR_I2C1_CLK_EN				(0x0004)
-#define U300_SYSCON_CEFR_I2C0_CLK_EN				(0x0002)
-#define U300_SYSCON_CEFR_FAST_BRIDGE_CLK_EN			(0x0001)
-/* Clock enable for the rest of the peripherals 16bit (R/W) */
-#define U300_SYSCON_CERR					(0x0028)
-#define U300_SYSCON_CERR_CDS_CLK_EN				(0x2000)
-#define U300_SYSCON_CERR_ISP_CLK_EN				(0x1000)
-#define U300_SYSCON_CERR_MSPRO_CLK_EN				(0x0800)
-#define U300_SYSCON_CERR_AHB_SUBSYS_BRIDGE_CLK_EN		(0x0400)
-#define U300_SYSCON_CERR_SEMI_CLK_EN				(0x0200)
-#define U300_SYSCON_CERR_XGAM_CLK_EN				(0x0100)
-#define U300_SYSCON_CERR_VIDEO_ENC_CLK_EN			(0x0080)
-#define U300_SYSCON_CERR_NANDIF_CLK_EN				(0x0040)
-#define U300_SYSCON_CERR_EMIF_CLK_EN				(0x0020)
-#define U300_SYSCON_CERR_DMAC_CLK_EN				(0x0010)
-#define U300_SYSCON_CERR_CPU_CLK_EN				(0x0008)
-#define U300_SYSCON_CERR_APEX_CLK_EN				(0x0004)
-#define U300_SYSCON_CERR_AHB_CLK_EN				(0x0002)
-#define U300_SYSCON_CERR_AAIF_CLK_EN				(0x0001)
-/* Single block clock enable 16bit (-/W) */
-#define U300_SYSCON_SBCER					(0x002c)
-#define U300_SYSCON_SBCER_PPM_CLK_EN				(0x0009)
-#define U300_SYSCON_SBCER_ACC_TMR_CLK_EN			(0x0008)
-#define U300_SYSCON_SBCER_APP_TMR_CLK_EN			(0x0007)
-#define U300_SYSCON_SBCER_KEYPAD_CLK_EN				(0x0006)
-#define U300_SYSCON_SBCER_GPIO_CLK_EN				(0x0004)
-#define U300_SYSCON_SBCER_EH_CLK_EN				(0x0003)
-#define U300_SYSCON_SBCER_BTR_CLK_EN				(0x0002)
-#define U300_SYSCON_SBCER_UART_CLK_EN				(0x0001)
-#define U300_SYSCON_SBCER_SLOW_BRIDGE_CLK_EN			(0x0000)
-#define U300_SYSCON_SBCER_UART1_CLK_EN				(0x0019)
-#define U300_SYSCON_SBCER_I2S1_CORE_CLK_EN			(0x0018)
-#define U300_SYSCON_SBCER_I2S0_CORE_CLK_EN			(0x0017)
-#define U300_SYSCON_SBCER_SPI_CLK_EN				(0x0016)
-#define U300_SYSCON_SBCER_MMC_CLK_EN				(0x0015)
-#define U300_SYSCON_SBCER_I2S1_CLK_EN				(0x0014)
-#define U300_SYSCON_SBCER_I2S0_CLK_EN				(0x0013)
-#define U300_SYSCON_SBCER_I2C1_CLK_EN				(0x0012)
-#define U300_SYSCON_SBCER_I2C0_CLK_EN				(0x0011)
-#define U300_SYSCON_SBCER_FAST_BRIDGE_CLK_EN			(0x0010)
-#define U300_SYSCON_SBCER_CDS_CLK_EN				(0x002D)
-#define U300_SYSCON_SBCER_ISP_CLK_EN				(0x002C)
-#define U300_SYSCON_SBCER_MSPRO_CLK_EN				(0x002B)
-#define U300_SYSCON_SBCER_AHB_SUBSYS_BRIDGE_CLK_EN		(0x002A)
-#define U300_SYSCON_SBCER_SEMI_CLK_EN				(0x0029)
-#define U300_SYSCON_SBCER_XGAM_CLK_EN				(0x0028)
-#define U300_SYSCON_SBCER_VIDEO_ENC_CLK_EN			(0x0027)
-#define U300_SYSCON_SBCER_NANDIF_CLK_EN				(0x0026)
-#define U300_SYSCON_SBCER_EMIF_CLK_EN				(0x0025)
-#define U300_SYSCON_SBCER_DMAC_CLK_EN				(0x0024)
-#define U300_SYSCON_SBCER_CPU_CLK_EN				(0x0023)
-#define U300_SYSCON_SBCER_APEX_CLK_EN				(0x0022)
-#define U300_SYSCON_SBCER_AHB_CLK_EN				(0x0021)
-#define U300_SYSCON_SBCER_AAIF_CLK_EN				(0x0020)
-/* Single block clock disable 16bit (-/W) */
-#define U300_SYSCON_SBCDR					(0x0030)
-/* Same values as above for SBCER */
-/* Clock force SLOW peripherals 16bit (R/W) */
-#define U300_SYSCON_CFSR					(0x003c)
-#define U300_SYSCON_CFSR_PPM_CLK_FORCE_EN			(0x0200)
-#define U300_SYSCON_CFSR_ACC_TMR_CLK_FORCE_EN			(0x0100)
-#define U300_SYSCON_CFSR_APP_TMR_CLK_FORCE_EN			(0x0080)
-#define U300_SYSCON_CFSR_KEYPAD_CLK_FORCE_EN			(0x0020)
-#define U300_SYSCON_CFSR_GPIO_CLK_FORCE_EN			(0x0010)
-#define U300_SYSCON_CFSR_EH_CLK_FORCE_EN			(0x0008)
-#define U300_SYSCON_CFSR_BTR_CLK_FORCE_EN			(0x0004)
-#define U300_SYSCON_CFSR_UART_CLK_FORCE_EN			(0x0002)
-#define U300_SYSCON_CFSR_SLOW_BRIDGE_CLK_FORCE_EN		(0x0001)
-/* Clock force FAST peripherals 16bit (R/W) */
-#define U300_SYSCON_CFFR					(0x40)
-/* Values not defined. Define if you want to use them. */
-/* Clock force the rest of the peripherals 16bit (R/W) */
-#define U300_SYSCON_CFRR					(0x44)
-#define U300_SYSCON_CFRR_CDS_CLK_FORCE_EN			(0x2000)
-#define U300_SYSCON_CFRR_ISP_CLK_FORCE_EN			(0x1000)
-#define U300_SYSCON_CFRR_MSPRO_CLK_FORCE_EN			(0x0800)
-#define U300_SYSCON_CFRR_AHB_SUBSYS_BRIDGE_CLK_FORCE_EN		(0x0400)
-#define U300_SYSCON_CFRR_SEMI_CLK_FORCE_EN			(0x0200)
-#define U300_SYSCON_CFRR_XGAM_CLK_FORCE_EN			(0x0100)
-#define U300_SYSCON_CFRR_VIDEO_ENC_CLK_FORCE_EN			(0x0080)
-#define U300_SYSCON_CFRR_NANDIF_CLK_FORCE_EN			(0x0040)
-#define U300_SYSCON_CFRR_EMIF_CLK_FORCE_EN			(0x0020)
-#define U300_SYSCON_CFRR_DMAC_CLK_FORCE_EN			(0x0010)
-#define U300_SYSCON_CFRR_CPU_CLK_FORCE_EN			(0x0008)
-#define U300_SYSCON_CFRR_APEX_CLK_FORCE_EN			(0x0004)
-#define U300_SYSCON_CFRR_AHB_CLK_FORCE_EN			(0x0002)
-#define U300_SYSCON_CFRR_AAIF_CLK_FORCE_EN			(0x0001)
-/* PLL208 Frequency Control 16bit (R/W) */
-#define U300_SYSCON_PFCR					(0x48)
-#define U300_SYSCON_PFCR_DPLL_MULT_NUM				(0x000F)
-/* Power Management Control 16bit (R/W) */
-#define U300_SYSCON_PMCR					(0x50)
-#define U300_SYSCON_PMCR_DCON_ENABLE				(0x0002)
-#define U300_SYSCON_PMCR_PWR_MGNT_ENABLE			(0x0001)
-/* Reset Out 16bit (R/W) */
-#define U300_SYSCON_RCR						(0x6c)
-#define U300_SYSCON_RCR_RESOUT0_RST_N_DISABLE			(0x0001)
-/* EMIF Slew Rate Control 16bit (R/W) */
-#define U300_SYSCON_SRCLR					(0x70)
-#define U300_SYSCON_SRCLR_MASK					(0x03FF)
-#define U300_SYSCON_SRCLR_VALUE					(0x03FF)
-#define U300_SYSCON_SRCLR_EMIF_1_SLRC_5_B			(0x0200)
-#define U300_SYSCON_SRCLR_EMIF_1_SLRC_5_A			(0x0100)
-#define U300_SYSCON_SRCLR_EMIF_1_SLRC_4_B			(0x0080)
-#define U300_SYSCON_SRCLR_EMIF_1_SLRC_4_A			(0x0040)
-#define U300_SYSCON_SRCLR_EMIF_1_SLRC_3_B			(0x0020)
-#define U300_SYSCON_SRCLR_EMIF_1_SLRC_3_A			(0x0010)
-#define U300_SYSCON_SRCLR_EMIF_1_SLRC_2_B			(0x0008)
-#define U300_SYSCON_SRCLR_EMIF_1_SLRC_2_A			(0x0004)
-#define U300_SYSCON_SRCLR_EMIF_1_SLRC_1_B			(0x0002)
-#define U300_SYSCON_SRCLR_EMIF_1_SLRC_1_A			(0x0001)
-/* EMIF Clock Control Register 16bit (R/W) */
-#define U300_SYSCON_ECCR					(0x0078)
-#define U300_SYSCON_ECCR_MASK					(0x000F)
-#define U300_SYSCON_ECCR_EMIF_1_STATIC_CLK_EN_N_DISABLE		(0x0008)
-#define U300_SYSCON_ECCR_EMIF_1_RET_OUT_CLK_EN_N_DISABLE	(0x0004)
-#define U300_SYSCON_ECCR_EMIF_MEMCLK_RET_EN_N_DISABLE		(0x0002)
-#define U300_SYSCON_ECCR_EMIF_SDRCLK_RET_EN_N_DISABLE		(0x0001)
-/* MMC/MSPRO frequency divider register 0 16bit (R/W) */
-#define U300_SYSCON_MMF0R					(0x90)
-#define U300_SYSCON_MMF0R_MASK					(0x00FF)
-#define U300_SYSCON_MMF0R_FREQ_0_HIGH_MASK			(0x00F0)
-#define U300_SYSCON_MMF0R_FREQ_0_LOW_MASK			(0x000F)
-/* MMC/MSPRO frequency divider register 1 16bit (R/W) */
-#define U300_SYSCON_MMF1R					(0x94)
-#define U300_SYSCON_MMF1R_MASK					(0x00FF)
-#define U300_SYSCON_MMF1R_FREQ_1_HIGH_MASK			(0x00F0)
-#define U300_SYSCON_MMF1R_FREQ_1_LOW_MASK			(0x000F)
-/* Clock control for the MMC and MSPRO blocks 16bit (R/W) */
-#define U300_SYSCON_MMCR					(0x9C)
-#define U300_SYSCON_MMCR_MASK					(0x0003)
-#define U300_SYSCON_MMCR_MMC_FB_CLK_SEL_ENABLE			(0x0002)
-#define U300_SYSCON_MMCR_MSPRO_FREQSEL_ENABLE			(0x0001)
-/* SYS_0_CLK_CONTROL first clock control 16bit (R/W) */
-#define U300_SYSCON_S0CCR					(0x120)
-#define U300_SYSCON_S0CCR_FIELD_MASK				(0x43FF)
-#define U300_SYSCON_S0CCR_CLOCK_REQ				(0x4000)
-#define U300_SYSCON_S0CCR_CLOCK_REQ_MONITOR			(0x2000)
-#define U300_SYSCON_S0CCR_CLOCK_INV				(0x0200)
-#define U300_SYSCON_S0CCR_CLOCK_FREQ_MASK			(0x01E0)
-#define U300_SYSCON_S0CCR_CLOCK_SELECT_MASK			(0x001E)
-#define U300_SYSCON_S0CCR_CLOCK_ENABLE				(0x0001)
-#define U300_SYSCON_S0CCR_SEL_MCLK				(0x8 << 1)
-#define U300_SYSCON_S0CCR_SEL_ACC_FSM_CLK			(0xA << 1)
-#define U300_SYSCON_S0CCR_SEL_PLL60_48_CLK			(0xC << 1)
-#define U300_SYSCON_S0CCR_SEL_PLL60_60_CLK			(0xD << 1)
-#define U300_SYSCON_S0CCR_SEL_ACC_PLL208_CLK			(0xE << 1)
-#define U300_SYSCON_S0CCR_SEL_APP_PLL13_CLK			(0x0 << 1)
-#define U300_SYSCON_S0CCR_SEL_APP_FSM_CLK			(0x2 << 1)
-#define U300_SYSCON_S0CCR_SEL_RTC_CLK				(0x4 << 1)
-#define U300_SYSCON_S0CCR_SEL_APP_PLL208_CLK			(0x6 << 1)
-/* SYS_1_CLK_CONTROL second clock control 16 bit (R/W) */
-#define U300_SYSCON_S1CCR					(0x124)
-#define U300_SYSCON_S1CCR_FIELD_MASK				(0x43FF)
-#define U300_SYSCON_S1CCR_CLOCK_REQ				(0x4000)
-#define U300_SYSCON_S1CCR_CLOCK_REQ_MONITOR			(0x2000)
-#define U300_SYSCON_S1CCR_CLOCK_INV				(0x0200)
-#define U300_SYSCON_S1CCR_CLOCK_FREQ_MASK			(0x01E0)
-#define U300_SYSCON_S1CCR_CLOCK_SELECT_MASK			(0x001E)
-#define U300_SYSCON_S1CCR_CLOCK_ENABLE				(0x0001)
-#define U300_SYSCON_S1CCR_SEL_MCLK				(0x8 << 1)
-#define U300_SYSCON_S1CCR_SEL_ACC_FSM_CLK			(0xA << 1)
-#define U300_SYSCON_S1CCR_SEL_PLL60_48_CLK			(0xC << 1)
-#define U300_SYSCON_S1CCR_SEL_PLL60_60_CLK			(0xD << 1)
-#define U300_SYSCON_S1CCR_SEL_ACC_PLL208_CLK			(0xE << 1)
-#define U300_SYSCON_S1CCR_SEL_ACC_PLL13_CLK			(0x0 << 1)
-#define U300_SYSCON_S1CCR_SEL_APP_FSM_CLK			(0x2 << 1)
-#define U300_SYSCON_S1CCR_SEL_RTC_CLK				(0x4 << 1)
-#define U300_SYSCON_S1CCR_SEL_APP_PLL208_CLK			(0x6 << 1)
-/* SYS_2_CLK_CONTROL third clock control 16 bit (R/W) */
-#define U300_SYSCON_S2CCR					(0x128)
-#define U300_SYSCON_S2CCR_FIELD_MASK				(0xC3FF)
-#define U300_SYSCON_S2CCR_CLK_STEAL				(0x8000)
-#define U300_SYSCON_S2CCR_CLOCK_REQ				(0x4000)
-#define U300_SYSCON_S2CCR_CLOCK_REQ_MONITOR			(0x2000)
-#define U300_SYSCON_S2CCR_CLOCK_INV				(0x0200)
-#define U300_SYSCON_S2CCR_CLOCK_FREQ_MASK			(0x01E0)
-#define U300_SYSCON_S2CCR_CLOCK_SELECT_MASK			(0x001E)
-#define U300_SYSCON_S2CCR_CLOCK_ENABLE				(0x0001)
-#define U300_SYSCON_S2CCR_SEL_MCLK				(0x8 << 1)
-#define U300_SYSCON_S2CCR_SEL_ACC_FSM_CLK			(0xA << 1)
-#define U300_SYSCON_S2CCR_SEL_PLL60_48_CLK			(0xC << 1)
-#define U300_SYSCON_S2CCR_SEL_PLL60_60_CLK			(0xD << 1)
-#define U300_SYSCON_S2CCR_SEL_ACC_PLL208_CLK			(0xE << 1)
-#define U300_SYSCON_S2CCR_SEL_ACC_PLL13_CLK			(0x0 << 1)
-#define U300_SYSCON_S2CCR_SEL_APP_FSM_CLK			(0x2 << 1)
-#define U300_SYSCON_S2CCR_SEL_RTC_CLK				(0x4 << 1)
-#define U300_SYSCON_S2CCR_SEL_APP_PLL208_CLK			(0x6 << 1)
-/* SC_PLL_IRQ_CONTROL 16bit (R/W) */
-#define U300_SYSCON_PICR					(0x0130)
-#define U300_SYSCON_PICR_MASK					(0x00FF)
-#define U300_SYSCON_PICR_FORCE_PLL208_LOCK_LOW_ENABLE		(0x0080)
-#define U300_SYSCON_PICR_FORCE_PLL208_LOCK_HIGH_ENABLE		(0x0040)
-#define U300_SYSCON_PICR_FORCE_PLL13_LOCK_LOW_ENABLE		(0x0020)
-#define U300_SYSCON_PICR_FORCE_PLL13_LOCK_HIGH_ENABLE		(0x0010)
-#define U300_SYSCON_PICR_IRQMASK_PLL13_UNLOCK_ENABLE		(0x0008)
-#define U300_SYSCON_PICR_IRQMASK_PLL13_LOCK_ENABLE		(0x0004)
-#define U300_SYSCON_PICR_IRQMASK_PLL208_UNLOCK_ENABLE		(0x0002)
-#define U300_SYSCON_PICR_IRQMASK_PLL208_LOCK_ENABLE		(0x0001)
-/* SC_PLL_IRQ_STATUS 16 bit (R/-) */
-#define U300_SYSCON_PISR					(0x0134)
-#define U300_SYSCON_PISR_MASK					(0x000F)
-#define U300_SYSCON_PISR_PLL13_UNLOCK_IND			(0x0008)
-#define U300_SYSCON_PISR_PLL13_LOCK_IND				(0x0004)
-#define U300_SYSCON_PISR_PLL208_UNLOCK_IND			(0x0002)
-#define U300_SYSCON_PISR_PLL208_LOCK_IND			(0x0001)
-/* SC_PLL_IRQ_CLEAR 16 bit (-/W) */
-#define U300_SYSCON_PICLR					(0x0138)
-#define U300_SYSCON_PICLR_MASK					(0x000F)
-#define U300_SYSCON_PICLR_RWMASK				(0x0000)
-#define U300_SYSCON_PICLR_PLL13_UNLOCK_SC			(0x0008)
-#define U300_SYSCON_PICLR_PLL13_LOCK_SC				(0x0004)
-#define U300_SYSCON_PICLR_PLL208_UNLOCK_SC			(0x0002)
-#define U300_SYSCON_PICLR_PLL208_LOCK_SC			(0x0001)
-/* Clock activity observability register 0 */
-#define U300_SYSCON_C0OAR					(0x140)
-#define U300_SYSCON_C0OAR_MASK					(0xFFFF)
-#define U300_SYSCON_C0OAR_VALUE					(0xFFFF)
-#define U300_SYSCON_C0OAR_BT_H_CLK				(0x8000)
-#define U300_SYSCON_C0OAR_ASPB_P_CLK				(0x4000)
-#define U300_SYSCON_C0OAR_APP_SEMI_H_CLK			(0x2000)
-#define U300_SYSCON_C0OAR_APP_SEMI_CLK				(0x1000)
-#define U300_SYSCON_C0OAR_APP_MMC_MSPRO_CLK			(0x0800)
-#define U300_SYSCON_C0OAR_APP_I2S1_CLK				(0x0400)
-#define U300_SYSCON_C0OAR_APP_I2S0_CLK				(0x0200)
-#define U300_SYSCON_C0OAR_APP_CPU_CLK				(0x0100)
-#define U300_SYSCON_C0OAR_APP_52_CLK				(0x0080)
-#define U300_SYSCON_C0OAR_APP_208_CLK				(0x0040)
-#define U300_SYSCON_C0OAR_APP_104_CLK				(0x0020)
-#define U300_SYSCON_C0OAR_APEX_CLK				(0x0010)
-#define U300_SYSCON_C0OAR_AHPB_M_H_CLK				(0x0008)
-#define U300_SYSCON_C0OAR_AHB_CLK				(0x0004)
-#define U300_SYSCON_C0OAR_AFPB_P_CLK				(0x0002)
-#define U300_SYSCON_C0OAR_AAIF_CLK				(0x0001)
-/* Clock activity observability register 1 */
-#define U300_SYSCON_C1OAR					(0x144)
-#define U300_SYSCON_C1OAR_MASK					(0x3FFE)
-#define U300_SYSCON_C1OAR_VALUE					(0x3FFE)
-#define U300_SYSCON_C1OAR_NFIF_F_CLK				(0x2000)
-#define U300_SYSCON_C1OAR_MSPRO_CLK				(0x1000)
-#define U300_SYSCON_C1OAR_MMC_P_CLK				(0x0800)
-#define U300_SYSCON_C1OAR_MMC_CLK				(0x0400)
-#define U300_SYSCON_C1OAR_KP_P_CLK				(0x0200)
-#define U300_SYSCON_C1OAR_I2C1_P_CLK				(0x0100)
-#define U300_SYSCON_C1OAR_I2C0_P_CLK				(0x0080)
-#define U300_SYSCON_C1OAR_GPIO_CLK				(0x0040)
-#define U300_SYSCON_C1OAR_EMIF_MPMC_CLK				(0x0020)
-#define U300_SYSCON_C1OAR_EMIF_H_CLK				(0x0010)
-#define U300_SYSCON_C1OAR_EVHIST_CLK				(0x0008)
-#define U300_SYSCON_C1OAR_PPM_CLK				(0x0004)
-#define U300_SYSCON_C1OAR_DMA_CLK				(0x0002)
-/* Clock activity observability register 2 */
-#define U300_SYSCON_C2OAR					(0x148)
-#define U300_SYSCON_C2OAR_MASK					(0x0FFF)
-#define U300_SYSCON_C2OAR_VALUE					(0x0FFF)
-#define U300_SYSCON_C2OAR_XGAM_CDI_CLK				(0x0800)
-#define U300_SYSCON_C2OAR_XGAM_CLK				(0x0400)
-#define U300_SYSCON_C2OAR_VC_H_CLK				(0x0200)
-#define U300_SYSCON_C2OAR_VC_CLK				(0x0100)
-#define U300_SYSCON_C2OAR_UA_P_CLK				(0x0080)
-#define U300_SYSCON_C2OAR_TMR1_CLK				(0x0040)
-#define U300_SYSCON_C2OAR_TMR0_CLK				(0x0020)
-#define U300_SYSCON_C2OAR_SPI_P_CLK				(0x0010)
-#define U300_SYSCON_C2OAR_PCM_I2S1_CORE_CLK			(0x0008)
-#define U300_SYSCON_C2OAR_PCM_I2S1_CLK				(0x0004)
-#define U300_SYSCON_C2OAR_PCM_I2S0_CORE_CLK			(0x0002)
-#define U300_SYSCON_C2OAR_PCM_I2S0_CLK				(0x0001)
-
-
-/*
- * The clocking hierarchy currently looks like this.
- * NOTE: the idea is NOT to show how the clocks are routed on the chip!
- * The ideas is to show dependencies, so a clock higher up in the
- * hierarchy has to be on in order for another clock to be on. Now,
- * both CPU and DMA can actually be on top of the hierarchy, and that
- * is not modeled currently. Instead we have the backbone AMBA bus on
- * top. This bus cannot be programmed in any way but conceptually it
- * needs to be active for the bridges and devices to transport data.
- *
- * Please be aware that a few clocks are hw controlled, which mean that
- * the hw itself can turn on/off or change the rate of the clock when
- * needed!
- *
- *  AMBA bus
- *  |
- *  +- CPU
- *  +- FSMC NANDIF NAND Flash interface
- *  +- SEMI Shared Memory interface
- *  +- ISP Image Signal Processor (U335 only)
- *  +- CDS (U335 only)
- *  +- DMA Direct Memory Access Controller
- *  +- AAIF APP/ACC Interface (Mobile Scalable Link, MSL)
- *  +- APEX
- *  +- VIDEO_ENC AVE2/3 Video Encoder
- *  +- XGAM Graphics Accelerator Controller
- *  +- AHB
- *  |
- *  +- ahb:0 AHB Bridge
- *  |  |
- *  |  +- ahb:1 INTCON Interrupt controller
- *  |  +- ahb:3 MSPRO  Memory Stick Pro controller
- *  |  +- ahb:4 EMIF   External Memory interface
- *  |
- *  +- fast:0 FAST bridge
- *  |  |
- *  |  +- fast:1 MMCSD MMC/SD card reader controller
- *  |  +- fast:2 I2S0  PCM I2S channel 0 controller
- *  |  +- fast:3 I2S1  PCM I2S channel 1 controller
- *  |  +- fast:4 I2C0  I2C channel 0 controller
- *  |  +- fast:5 I2C1  I2C channel 1 controller
- *  |  +- fast:6 SPI   SPI controller
- *  |  +- fast:7 UART1 Secondary UART (U335 only)
- *  |
- *  +- slow:0 SLOW bridge
- *     |
- *     +- slow:1 SYSCON (not possible to control)
- *     +- slow:2 WDOG Watchdog
- *     +- slow:3 UART0 primary UART
- *     +- slow:4 TIMER_APP Application timer - used in Linux
- *     +- slow:5 KEYPAD controller
- *     +- slow:6 GPIO controller
- *     +- slow:7 RTC controller
- *     +- slow:8 BT Bus Tracer (not used currently)
- *     +- slow:9 EH Event Handler (not used currently)
- *     +- slow:a TIMER_ACC Access style timer (not used currently)
- *     +- slow:b PPM (U335 only, what is that?)
- */
-
-/* Global syscon virtual base */
-static void __iomem *syscon_vbase;
-
-/**
- * struct clk_syscon - U300 syscon clock
- * @hw: corresponding clock hardware entry
- * @hw_ctrld: whether this clock is hardware controlled (for refcount etc)
- *	and does not need any magic pokes to be enabled/disabled
- * @reset: state holder, whether this block's reset line is asserted or not
- * @res_reg: reset line enable/disable flag register
- * @res_bit: bit for resetting or taking this consumer out of reset
- * @en_reg: clock line enable/disable flag register
- * @en_bit: bit for enabling/disabling this consumer clock line
- * @clk_val: magic value to poke in the register to enable/disable
- *	this one clock
- */
-struct clk_syscon {
-	struct clk_hw hw;
-	bool hw_ctrld;
-	bool reset;
-	void __iomem *res_reg;
-	u8 res_bit;
-	void __iomem *en_reg;
-	u8 en_bit;
-	u16 clk_val;
-};
-
-#define to_syscon(_hw) container_of(_hw, struct clk_syscon, hw)
-
-static DEFINE_SPINLOCK(syscon_resetreg_lock);
-
-/*
- * Reset control functions. We remember if a block has been
- * taken out of reset and don't remove the reset assertion again
- * and vice versa. Currently we only remove resets so the
- * enablement function is defined out.
- */
-static void syscon_block_reset_enable(struct clk_syscon *sclk)
-{
-	unsigned long iflags;
-	u16 val;
-
-	/* Not all blocks support resetting */
-	if (!sclk->res_reg)
-		return;
-	spin_lock_irqsave(&syscon_resetreg_lock, iflags);
-	val = readw(sclk->res_reg);
-	val |= BIT(sclk->res_bit);
-	writew(val, sclk->res_reg);
-	spin_unlock_irqrestore(&syscon_resetreg_lock, iflags);
-	sclk->reset = true;
-}
-
-static void syscon_block_reset_disable(struct clk_syscon *sclk)
-{
-	unsigned long iflags;
-	u16 val;
-
-	/* Not all blocks support resetting */
-	if (!sclk->res_reg)
-		return;
-	spin_lock_irqsave(&syscon_resetreg_lock, iflags);
-	val = readw(sclk->res_reg);
-	val &= ~BIT(sclk->res_bit);
-	writew(val, sclk->res_reg);
-	spin_unlock_irqrestore(&syscon_resetreg_lock, iflags);
-	sclk->reset = false;
-}
-
-static int syscon_clk_prepare(struct clk_hw *hw)
-{
-	struct clk_syscon *sclk = to_syscon(hw);
-
-	/* If the block is in reset, bring it out */
-	if (sclk->reset)
-		syscon_block_reset_disable(sclk);
-	return 0;
-}
-
-static void syscon_clk_unprepare(struct clk_hw *hw)
-{
-	struct clk_syscon *sclk = to_syscon(hw);
-
-	/* Please don't force the console into reset */
-	if (sclk->clk_val == U300_SYSCON_SBCER_UART_CLK_EN)
-		return;
-	/* When unpreparing, force block into reset */
-	if (!sclk->reset)
-		syscon_block_reset_enable(sclk);
-}
-
-static int syscon_clk_enable(struct clk_hw *hw)
-{
-	struct clk_syscon *sclk = to_syscon(hw);
-
-	/* Don't touch the hardware controlled clocks */
-	if (sclk->hw_ctrld)
-		return 0;
-	/* These cannot be controlled */
-	if (sclk->clk_val == 0xFFFFU)
-		return 0;
-
-	writew(sclk->clk_val, syscon_vbase + U300_SYSCON_SBCER);
-	return 0;
-}
-
-static void syscon_clk_disable(struct clk_hw *hw)
-{
-	struct clk_syscon *sclk = to_syscon(hw);
-
-	/* Don't touch the hardware controlled clocks */
-	if (sclk->hw_ctrld)
-		return;
-	if (sclk->clk_val == 0xFFFFU)
-		return;
-	/* Please don't disable the console port */
-	if (sclk->clk_val == U300_SYSCON_SBCER_UART_CLK_EN)
-		return;
-
-	writew(sclk->clk_val, syscon_vbase + U300_SYSCON_SBCDR);
-}
-
-static int syscon_clk_is_enabled(struct clk_hw *hw)
-{
-	struct clk_syscon *sclk = to_syscon(hw);
-	u16 val;
-
-	/* If no enable register defined, it's always-on */
-	if (!sclk->en_reg)
-		return 1;
-
-	val = readw(sclk->en_reg);
-	val &= BIT(sclk->en_bit);
-
-	return val ? 1 : 0;
-}
-
-static u16 syscon_get_perf(void)
-{
-	u16 val;
-
-	val = readw(syscon_vbase + U300_SYSCON_CCR);
-	val &= U300_SYSCON_CCR_CLKING_PERFORMANCE_MASK;
-	return val;
-}
-
-static unsigned long
-syscon_clk_recalc_rate(struct clk_hw *hw,
-		       unsigned long parent_rate)
-{
-	struct clk_syscon *sclk = to_syscon(hw);
-	u16 perf = syscon_get_perf();
-
-	switch (sclk->clk_val) {
-	case U300_SYSCON_SBCER_FAST_BRIDGE_CLK_EN:
-	case U300_SYSCON_SBCER_I2C0_CLK_EN:
-	case U300_SYSCON_SBCER_I2C1_CLK_EN:
-	case U300_SYSCON_SBCER_MMC_CLK_EN:
-	case U300_SYSCON_SBCER_SPI_CLK_EN:
-		/* The FAST clocks have one progression */
-		switch (perf) {
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW_POWER:
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW:
-			return 13000000;
-		default:
-			return parent_rate; /* 26 MHz */
-		}
-	case U300_SYSCON_SBCER_DMAC_CLK_EN:
-	case U300_SYSCON_SBCER_NANDIF_CLK_EN:
-	case U300_SYSCON_SBCER_XGAM_CLK_EN:
-		/* AMBA interconnect peripherals */
-		switch (perf) {
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW_POWER:
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW:
-			return 6500000;
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_INTERMEDIATE:
-			return 26000000;
-		default:
-			return parent_rate; /* 52 MHz */
-		}
-	case U300_SYSCON_SBCER_SEMI_CLK_EN:
-	case U300_SYSCON_SBCER_EMIF_CLK_EN:
-		/* EMIF speeds */
-		switch (perf) {
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW_POWER:
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW:
-			return 13000000;
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_INTERMEDIATE:
-			return 52000000;
-		default:
-			return 104000000;
-		}
-	case U300_SYSCON_SBCER_CPU_CLK_EN:
-		/* And the fast CPU clock */
-		switch (perf) {
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW_POWER:
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW:
-			return 13000000;
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_INTERMEDIATE:
-			return 52000000;
-		case U300_SYSCON_CCR_CLKING_PERFORMANCE_HIGH:
-			return 104000000;
-		default:
-			return parent_rate; /* 208 MHz */
-		}
-	default:
-		/*
-		 * The SLOW clocks and default just inherit the rate of
-		 * their parent (typically PLL13 13 MHz).
-		 */
-		return parent_rate;
-	}
-}
-
-static long
-syscon_clk_round_rate(struct clk_hw *hw, unsigned long rate,
-		      unsigned long *prate)
-{
-	struct clk_syscon *sclk = to_syscon(hw);
-
-	if (sclk->clk_val != U300_SYSCON_SBCER_CPU_CLK_EN)
-		return *prate;
-	/* We really only support setting the rate of the CPU clock */
-	if (rate <= 13000000)
-		return 13000000;
-	if (rate <= 52000000)
-		return 52000000;
-	if (rate <= 104000000)
-		return 104000000;
-	return 208000000;
-}
-
-static int syscon_clk_set_rate(struct clk_hw *hw, unsigned long rate,
-			       unsigned long parent_rate)
-{
-	struct clk_syscon *sclk = to_syscon(hw);
-	u16 val;
-
-	/* We only support setting the rate of the CPU clock */
-	if (sclk->clk_val != U300_SYSCON_SBCER_CPU_CLK_EN)
-		return -EINVAL;
-	switch (rate) {
-	case 13000000:
-		val = U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW_POWER;
-		break;
-	case 52000000:
-		val = U300_SYSCON_CCR_CLKING_PERFORMANCE_INTERMEDIATE;
-		break;
-	case 104000000:
-		val = U300_SYSCON_CCR_CLKING_PERFORMANCE_HIGH;
-		break;
-	case 208000000:
-		val = U300_SYSCON_CCR_CLKING_PERFORMANCE_BEST;
-		break;
-	default:
-		return -EINVAL;
-	}
-	val |= readw(syscon_vbase + U300_SYSCON_CCR) &
-		~U300_SYSCON_CCR_CLKING_PERFORMANCE_MASK ;
-	writew(val, syscon_vbase + U300_SYSCON_CCR);
-	return 0;
-}
-
-static const struct clk_ops syscon_clk_ops = {
-	.prepare = syscon_clk_prepare,
-	.unprepare = syscon_clk_unprepare,
-	.enable = syscon_clk_enable,
-	.disable = syscon_clk_disable,
-	.is_enabled = syscon_clk_is_enabled,
-	.recalc_rate = syscon_clk_recalc_rate,
-	.round_rate = syscon_clk_round_rate,
-	.set_rate = syscon_clk_set_rate,
-};
-
-static struct clk_hw * __init
-syscon_clk_register(struct device *dev, const char *name,
-		    const char *parent_name, unsigned long flags,
-		    bool hw_ctrld,
-		    void __iomem *res_reg, u8 res_bit,
-		    void __iomem *en_reg, u8 en_bit,
-		    u16 clk_val)
-{
-	struct clk_hw *hw;
-	struct clk_syscon *sclk;
-	struct clk_init_data init;
-	int ret;
-
-	sclk = kzalloc(sizeof(*sclk), GFP_KERNEL);
-	if (!sclk)
-		return ERR_PTR(-ENOMEM);
-
-	init.name = name;
-	init.ops = &syscon_clk_ops;
-	init.flags = flags;
-	init.parent_names = (parent_name ? &parent_name : NULL);
-	init.num_parents = (parent_name ? 1 : 0);
-	sclk->hw.init = &init;
-	sclk->hw_ctrld = hw_ctrld;
-	/* Assume the block is in reset at registration */
-	sclk->reset = true;
-	sclk->res_reg = res_reg;
-	sclk->res_bit = res_bit;
-	sclk->en_reg = en_reg;
-	sclk->en_bit = en_bit;
-	sclk->clk_val = clk_val;
-
-	hw = &sclk->hw;
-	ret = clk_hw_register(dev, hw);
-	if (ret) {
-		kfree(sclk);
-		hw = ERR_PTR(ret);
-	}
-
-	return hw;
-}
-
-#define U300_CLK_TYPE_SLOW 0
-#define U300_CLK_TYPE_FAST 1
-#define U300_CLK_TYPE_REST 2
-
-/**
- * struct u300_clock - defines the bits and pieces for a certain clock
- * @type: the clock type, slow fast or rest
- * @id: the bit in the slow/fast/rest register for this clock
- * @hw_ctrld: whether the clock is hardware controlled
- * @clk_val: a value to poke in the one-write enable/disable registers
- */
-struct u300_clock {
-	u8 type;
-	u8 id;
-	bool hw_ctrld;
-	u16 clk_val;
-};
-
-static struct u300_clock const u300_clk_lookup[] __initconst = {
-	{
-		.type = U300_CLK_TYPE_REST,
-		.id = 3,
-		.hw_ctrld = true,
-		.clk_val = U300_SYSCON_SBCER_CPU_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_REST,
-		.id = 4,
-		.hw_ctrld = true,
-		.clk_val = U300_SYSCON_SBCER_DMAC_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_REST,
-		.id = 5,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_EMIF_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_REST,
-		.id = 6,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_NANDIF_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_REST,
-		.id = 8,
-		.hw_ctrld = true,
-		.clk_val = U300_SYSCON_SBCER_XGAM_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_REST,
-		.id = 9,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_SEMI_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_REST,
-		.id = 10,
-		.hw_ctrld = true,
-		.clk_val = U300_SYSCON_SBCER_AHB_SUBSYS_BRIDGE_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_REST,
-		.id = 12,
-		.hw_ctrld = false,
-		/* INTCON: cannot be enabled, just taken out of reset */
-		.clk_val = 0xFFFFU,
-	},
-	{
-		.type = U300_CLK_TYPE_FAST,
-		.id = 0,
-		.hw_ctrld = true,
-		.clk_val = U300_SYSCON_SBCER_FAST_BRIDGE_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_FAST,
-		.id = 1,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_I2C0_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_FAST,
-		.id = 2,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_I2C1_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_FAST,
-		.id = 5,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_MMC_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_FAST,
-		.id = 6,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_SPI_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_SLOW,
-		.id = 0,
-		.hw_ctrld = true,
-		.clk_val = U300_SYSCON_SBCER_SLOW_BRIDGE_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_SLOW,
-		.id = 1,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_UART_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_SLOW,
-		.id = 4,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_GPIO_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_SLOW,
-		.id = 6,
-		.hw_ctrld = true,
-		/* No clock enable register bit */
-		.clk_val = 0xFFFFU,
-	},
-	{
-		.type = U300_CLK_TYPE_SLOW,
-		.id = 7,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_APP_TMR_CLK_EN,
-	},
-	{
-		.type = U300_CLK_TYPE_SLOW,
-		.id = 8,
-		.hw_ctrld = false,
-		.clk_val = U300_SYSCON_SBCER_ACC_TMR_CLK_EN,
-	},
-};
-
-static void __init of_u300_syscon_clk_init(struct device_node *np)
-{
-	struct clk_hw *hw = ERR_PTR(-EINVAL);
-	const char *clk_name = np->name;
-	const char *parent_name;
-	void __iomem *res_reg;
-	void __iomem *en_reg;
-	u32 clk_type;
-	u32 clk_id;
-	int i;
-
-	if (of_property_read_u32(np, "clock-type", &clk_type)) {
-		pr_err("%s: syscon clock \"%s\" missing clock-type property\n",
-		       __func__, clk_name);
-		return;
-	}
-	if (of_property_read_u32(np, "clock-id", &clk_id)) {
-		pr_err("%s: syscon clock \"%s\" missing clock-id property\n",
-		       __func__, clk_name);
-		return;
-	}
-	parent_name = of_clk_get_parent_name(np, 0);
-
-	switch (clk_type) {
-	case U300_CLK_TYPE_SLOW:
-		res_reg = syscon_vbase + U300_SYSCON_RSR;
-		en_reg = syscon_vbase + U300_SYSCON_CESR;
-		break;
-	case U300_CLK_TYPE_FAST:
-		res_reg = syscon_vbase + U300_SYSCON_RFR;
-		en_reg = syscon_vbase + U300_SYSCON_CEFR;
-		break;
-	case U300_CLK_TYPE_REST:
-		res_reg = syscon_vbase + U300_SYSCON_RRR;
-		en_reg = syscon_vbase + U300_SYSCON_CERR;
-		break;
-	default:
-		pr_err("unknown clock type %x specified\n", clk_type);
-		return;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(u300_clk_lookup); i++) {
-		const struct u300_clock *u3clk = &u300_clk_lookup[i];
-
-		if (u3clk->type == clk_type && u3clk->id == clk_id)
-			hw = syscon_clk_register(NULL, clk_name, parent_name,
-						 0, u3clk->hw_ctrld,
-						 res_reg, u3clk->id,
-						 en_reg, u3clk->id,
-						 u3clk->clk_val);
-	}
-
-	if (!IS_ERR(hw)) {
-		of_clk_add_hw_provider(np, of_clk_hw_simple_get, hw);
-
-		/*
-		 * Some few system clocks - device tree does not
-		 * represent clocks without a corresponding device node.
-		 * for now we add these three clocks here.
-		 */
-		if (clk_type == U300_CLK_TYPE_REST && clk_id == 5)
-			clk_hw_register_clkdev(hw, NULL, "pl172");
-		if (clk_type == U300_CLK_TYPE_REST && clk_id == 9)
-			clk_hw_register_clkdev(hw, NULL, "semi");
-		if (clk_type == U300_CLK_TYPE_REST && clk_id == 12)
-			clk_hw_register_clkdev(hw, NULL, "intcon");
-	}
-}
-
-/**
- * struct clk_mclk - U300 MCLK clock (MMC/SD clock)
- * @hw: corresponding clock hardware entry
- * @is_mspro: if this is the memory stick clock rather than MMC/SD
- */
-struct clk_mclk {
-	struct clk_hw hw;
-	bool is_mspro;
-};
-
-#define to_mclk(_hw) container_of(_hw, struct clk_mclk, hw)
-
-static int mclk_clk_prepare(struct clk_hw *hw)
-{
-	struct clk_mclk *mclk = to_mclk(hw);
-	u16 val;
-
-	/* The MMC and MSPRO clocks need some special set-up */
-	if (!mclk->is_mspro) {
-		/* Set default MMC clock divisor to 18.9 MHz */
-		writew(0x0054U, syscon_vbase + U300_SYSCON_MMF0R);
-		val = readw(syscon_vbase + U300_SYSCON_MMCR);
-		/* Disable the MMC feedback clock */
-		val &= ~U300_SYSCON_MMCR_MMC_FB_CLK_SEL_ENABLE;
-		/* Disable MSPRO frequency */
-		val &= ~U300_SYSCON_MMCR_MSPRO_FREQSEL_ENABLE;
-		writew(val, syscon_vbase + U300_SYSCON_MMCR);
-	} else {
-		val = readw(syscon_vbase + U300_SYSCON_MMCR);
-		/* Disable the MMC feedback clock */
-		val &= ~U300_SYSCON_MMCR_MMC_FB_CLK_SEL_ENABLE;
-		/* Enable MSPRO frequency */
-		val |= U300_SYSCON_MMCR_MSPRO_FREQSEL_ENABLE;
-		writew(val, syscon_vbase + U300_SYSCON_MMCR);
-	}
-
-	return 0;
-}
-
-static unsigned long
-mclk_clk_recalc_rate(struct clk_hw *hw,
-		     unsigned long parent_rate)
-{
-	u16 perf = syscon_get_perf();
-
-	switch (perf) {
-	case U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW_POWER:
-		/*
-		 * Here, the 208 MHz PLL gets shut down and the always
-		 * on 13 MHz PLL used for RTC etc kicks into use
-		 * instead.
-		 */
-		return 13000000;
-	case U300_SYSCON_CCR_CLKING_PERFORMANCE_LOW:
-	case U300_SYSCON_CCR_CLKING_PERFORMANCE_INTERMEDIATE:
-	case U300_SYSCON_CCR_CLKING_PERFORMANCE_HIGH:
-	case U300_SYSCON_CCR_CLKING_PERFORMANCE_BEST:
-	{
-		/*
-		 * This clock is under program control. The register is
-		 * divided in two nybbles, bit 7-4 gives cycles-1 to count
-		 * high, bit 3-0 gives cycles-1 to count low. Distribute
-		 * these with no more than 1 cycle difference between
-		 * low and high and add low and high to get the actual
-		 * divisor. The base PLL is 208 MHz. Writing 0x00 will
-		 * divide by 1 and 1 so the highest frequency possible
-		 * is 104 MHz.
-		 *
-		 * e.g. 0x54 =>
-		 * f = 208 / ((5+1) + (4+1)) = 208 / 11 = 18.9 MHz
-		 */
-		u16 val = readw(syscon_vbase + U300_SYSCON_MMF0R) &
-			U300_SYSCON_MMF0R_MASK;
-		switch (val) {
-		case 0x0054:
-			return 18900000;
-		case 0x0044:
-			return 20800000;
-		case 0x0043:
-			return 23100000;
-		case 0x0033:
-			return 26000000;
-		case 0x0032:
-			return 29700000;
-		case 0x0022:
-			return 34700000;
-		case 0x0021:
-			return 41600000;
-		case 0x0011:
-			return 52000000;
-		case 0x0000:
-			return 104000000;
-		default:
-			break;
-		}
-	}
-	default:
-		break;
-	}
-	return parent_rate;
-}
-
-static long
-mclk_clk_round_rate(struct clk_hw *hw, unsigned long rate,
-		    unsigned long *prate)
-{
-	if (rate <= 18900000)
-		return 18900000;
-	if (rate <= 20800000)
-		return 20800000;
-	if (rate <= 23100000)
-		return 23100000;
-	if (rate <= 26000000)
-		return 26000000;
-	if (rate <= 29700000)
-		return 29700000;
-	if (rate <= 34700000)
-		return 34700000;
-	if (rate <= 41600000)
-		return 41600000;
-	/* Highest rate */
-	return 52000000;
-}
-
-static int mclk_clk_set_rate(struct clk_hw *hw, unsigned long rate,
-			     unsigned long parent_rate)
-{
-	u16 val;
-	u16 reg;
-
-	switch (rate) {
-	case 18900000:
-		val = 0x0054;
-		break;
-	case 20800000:
-		val = 0x0044;
-		break;
-	case 23100000:
-		val = 0x0043;
-		break;
-	case 26000000:
-		val = 0x0033;
-		break;
-	case 29700000:
-		val = 0x0032;
-		break;
-	case 34700000:
-		val = 0x0022;
-		break;
-	case 41600000:
-		val = 0x0021;
-		break;
-	case 52000000:
-		val = 0x0011;
-		break;
-	case 104000000:
-		val = 0x0000;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	reg = readw(syscon_vbase + U300_SYSCON_MMF0R) &
-		~U300_SYSCON_MMF0R_MASK;
-	writew(reg | val, syscon_vbase + U300_SYSCON_MMF0R);
-	return 0;
-}
-
-static const struct clk_ops mclk_ops = {
-	.prepare = mclk_clk_prepare,
-	.recalc_rate = mclk_clk_recalc_rate,
-	.round_rate = mclk_clk_round_rate,
-	.set_rate = mclk_clk_set_rate,
-};
-
-static struct clk_hw * __init
-mclk_clk_register(struct device *dev, const char *name,
-		  const char *parent_name, bool is_mspro)
-{
-	struct clk_hw *hw;
-	struct clk_mclk *mclk;
-	struct clk_init_data init;
-	int ret;
-
-	mclk = kzalloc(sizeof(*mclk), GFP_KERNEL);
-	if (!mclk)
-		return ERR_PTR(-ENOMEM);
-
-	init.name = "mclk";
-	init.ops = &mclk_ops;
-	init.flags = 0;
-	init.parent_names = (parent_name ? &parent_name : NULL);
-	init.num_parents = (parent_name ? 1 : 0);
-	mclk->hw.init = &init;
-	mclk->is_mspro = is_mspro;
-
-	hw = &mclk->hw;
-	ret = clk_hw_register(dev, hw);
-	if (ret) {
-		kfree(mclk);
-		hw = ERR_PTR(ret);
-	}
-
-	return hw;
-}
-
-static void __init of_u300_syscon_mclk_init(struct device_node *np)
-{
-	struct clk_hw *hw;
-	const char *clk_name = np->name;
-	const char *parent_name;
-
-	parent_name = of_clk_get_parent_name(np, 0);
-	hw = mclk_clk_register(NULL, clk_name, parent_name, false);
-	if (!IS_ERR(hw))
-		of_clk_add_hw_provider(np, of_clk_hw_simple_get, hw);
-}
-
-static const struct of_device_id u300_clk_match[] __initconst = {
-	{
-		.compatible = "fixed-clock",
-		.data = of_fixed_clk_setup,
-	},
-	{
-		.compatible = "fixed-factor-clock",
-		.data = of_fixed_factor_clk_setup,
-	},
-	{
-		.compatible = "stericsson,u300-syscon-clk",
-		.data = of_u300_syscon_clk_init,
-	},
-	{
-		.compatible = "stericsson,u300-syscon-mclk",
-		.data = of_u300_syscon_mclk_init,
-	},
-	{}
-};
-
-
-void __init u300_clk_init(void __iomem *base)
-{
-	u16 val;
-
-	syscon_vbase = base;
-
-	/* Set system to run at PLL208, max performance, a known state. */
-	val = readw(syscon_vbase + U300_SYSCON_CCR);
-	val &= ~U300_SYSCON_CCR_CLKING_PERFORMANCE_MASK;
-	writew(val, syscon_vbase + U300_SYSCON_CCR);
-	/* Wait for the PLL208 to lock if not locked in yet */
-	while (!(readw(syscon_vbase + U300_SYSCON_CSR) &
-		 U300_SYSCON_CSR_PLL208_LOCK_IND));
-
-	/* Power management enable */
-	val = readw(syscon_vbase + U300_SYSCON_PMCR);
-	val |= U300_SYSCON_PMCR_PWR_MGNT_ENABLE;
-	writew(val, syscon_vbase + U300_SYSCON_PMCR);
-
-	of_clk_init(u300_clk_match);
-}
diff --git a/include/linux/platform_data/clk-u300.h b/include/linux/platform_data/clk-u300.h
deleted file mode 100644
index 8429e73911a1..000000000000
--- a/include/linux/platform_data/clk-u300.h
+++ /dev/null
@@ -1 +0,0 @@
-void __init u300_clk_init(void __iomem *base);
-- 
cgit v1.2.3


From bf5c9cc8ad7fffd1f72df3baa5870449e4c16d1b Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <uwe@kleine-koenig.org>
Date: Mon, 8 Feb 2021 08:37:05 +0100
Subject: mei: bus: change remove callback to return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver core ignores the return value of mei_cl_device_remove() so
passing an error value doesn't solve any problem. As most mei drivers'
remove callbacks return 0 unconditionally and returning a different value
doesn't have any effect, change this prototype to return void and return 0
unconditionally in mei_cl_device_remove(). The only driver that could
return an error value is modified to emit an explicit warning in the error
case.

Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Link: https://lore.kernel.org/r/20210208073705.428185-3-uwe@kleine-koenig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/bus.c           | 5 ++---
 drivers/misc/mei/hdcp/mei_hdcp.c | 7 +++++--
 drivers/nfc/microread/mei.c      | 4 +---
 drivers/nfc/pn544/mei.c          | 4 +---
 drivers/watchdog/mei_wdt.c       | 4 +---
 include/linux/mei_cl_bus.h       | 2 +-
 6 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index 393a41330a6b..580074e32599 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -881,17 +881,16 @@ static int mei_cl_device_remove(struct device *dev)
 {
 	struct mei_cl_device *cldev = to_mei_cl_device(dev);
 	struct mei_cl_driver *cldrv = to_mei_cl_driver(dev->driver);
-	int ret = 0;
 
 	if (cldrv->remove)
-		ret = cldrv->remove(cldev);
+		cldrv->remove(cldev);
 
 	mei_cldev_unregister_callbacks(cldev);
 
 	mei_cl_bus_module_put(cldev);
 	module_put(THIS_MODULE);
 
-	return ret;
+	return 0;
 }
 
 static ssize_t name_show(struct device *dev, struct device_attribute *a,
diff --git a/drivers/misc/mei/hdcp/mei_hdcp.c b/drivers/misc/mei/hdcp/mei_hdcp.c
index 9ae9669e46ea..8447ad4b7d47 100644
--- a/drivers/misc/mei/hdcp/mei_hdcp.c
+++ b/drivers/misc/mei/hdcp/mei_hdcp.c
@@ -845,16 +845,19 @@ enable_err_exit:
 	return ret;
 }
 
-static int mei_hdcp_remove(struct mei_cl_device *cldev)
+static void mei_hdcp_remove(struct mei_cl_device *cldev)
 {
 	struct i915_hdcp_comp_master *comp_master =
 						mei_cldev_get_drvdata(cldev);
+	int ret;
 
 	component_master_del(&cldev->dev, &mei_component_master_ops);
 	kfree(comp_master);
 	mei_cldev_set_drvdata(cldev, NULL);
 
-	return mei_cldev_disable(cldev);
+	ret = mei_cldev_disable(cldev);
+	if (ret)
+		dev_warn(&cldev->dev, "mei_cldev_disable() failed\n");
 }
 
 #define MEI_UUID_HDCP GUID_INIT(0xB638AB7E, 0x94E2, 0x4EA2, 0xA5, \
diff --git a/drivers/nfc/microread/mei.c b/drivers/nfc/microread/mei.c
index 5dad8847a9b3..8fa7771085eb 100644
--- a/drivers/nfc/microread/mei.c
+++ b/drivers/nfc/microread/mei.c
@@ -44,15 +44,13 @@ static int microread_mei_probe(struct mei_cl_device *cldev,
 	return 0;
 }
 
-static int microread_mei_remove(struct mei_cl_device *cldev)
+static void microread_mei_remove(struct mei_cl_device *cldev)
 {
 	struct nfc_mei_phy *phy = mei_cldev_get_drvdata(cldev);
 
 	microread_remove(phy->hdev);
 
 	nfc_mei_phy_free(phy);
-
-	return 0;
 }
 
 static struct mei_cl_device_id microread_mei_tbl[] = {
diff --git a/drivers/nfc/pn544/mei.c b/drivers/nfc/pn544/mei.c
index 579bc599f545..5c10aac085a4 100644
--- a/drivers/nfc/pn544/mei.c
+++ b/drivers/nfc/pn544/mei.c
@@ -42,7 +42,7 @@ static int pn544_mei_probe(struct mei_cl_device *cldev,
 	return 0;
 }
 
-static int pn544_mei_remove(struct mei_cl_device *cldev)
+static void pn544_mei_remove(struct mei_cl_device *cldev)
 {
 	struct nfc_mei_phy *phy = mei_cldev_get_drvdata(cldev);
 
@@ -51,8 +51,6 @@ static int pn544_mei_remove(struct mei_cl_device *cldev)
 	pn544_hci_remove(phy->hdev);
 
 	nfc_mei_phy_free(phy);
-
-	return 0;
 }
 
 static struct mei_cl_device_id pn544_mei_tbl[] = {
diff --git a/drivers/watchdog/mei_wdt.c b/drivers/watchdog/mei_wdt.c
index 5391bf3e6b11..53165e49c298 100644
--- a/drivers/watchdog/mei_wdt.c
+++ b/drivers/watchdog/mei_wdt.c
@@ -619,7 +619,7 @@ err_out:
 	return ret;
 }
 
-static int mei_wdt_remove(struct mei_cl_device *cldev)
+static void mei_wdt_remove(struct mei_cl_device *cldev)
 {
 	struct mei_wdt *wdt = mei_cldev_get_drvdata(cldev);
 
@@ -636,8 +636,6 @@ static int mei_wdt_remove(struct mei_cl_device *cldev)
 	dbgfs_unregister(wdt);
 
 	kfree(wdt);
-
-	return 0;
 }
 
 #define MEI_UUID_WD UUID_LE(0x05B79A6F, 0x4628, 0x4D7F, \
diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index 959ad7d850b4..07f5ef8fc456 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -68,7 +68,7 @@ struct mei_cl_driver {
 
 	int (*probe)(struct mei_cl_device *cldev,
 		     const struct mei_cl_device_id *id);
-	int (*remove)(struct mei_cl_device *cldev);
+	void (*remove)(struct mei_cl_device *cldev);
 };
 
 int __mei_cldev_driver_register(struct mei_cl_driver *cldrv,
-- 
cgit v1.2.3


From cae5216387d18c888f9f38a0cf5be341a0af75a6 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@nvidia.com>
Date: Mon, 8 Feb 2021 22:16:03 +0200
Subject: i2c: mux: mlxcpld: Get rid of adapter numbers enforcement

Do not set the argument 'force_nr' of i2c_mux_add_adapter() routine,
instead provide argument 'chan_id'.
Rename mux ids array from 'adap_ids' to 'chan_ids'.

The motivation is to prepare infrastructure to be able to:
- Create only the child adapters which are actually needed - for which
  channel ids are specified.
- To assign 'nrs' to these child adapters dynamically, with no 'nr'
  enforcement.

Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Acked-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/muxes/i2c-mux-mlxcpld.c   | 7 ++-----
 include/linux/platform_data/mlxcpld.h | 4 ++--
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/muxes/i2c-mux-mlxcpld.c b/drivers/i2c/muxes/i2c-mux-mlxcpld.c
index 113ad84cdd94..e99a7ad09886 100644
--- a/drivers/i2c/muxes/i2c-mux-mlxcpld.c
+++ b/drivers/i2c/muxes/i2c-mux-mlxcpld.c
@@ -101,9 +101,8 @@ static int mlxcpld_mux_probe(struct platform_device *pdev)
 	struct mlxcpld_mux_plat_data *pdata = dev_get_platdata(&pdev->dev);
 	struct i2c_client *client = to_i2c_client(pdev->dev.parent);
 	struct i2c_mux_core *muxc;
-	int num, force;
 	struct mlxcpld_mux *data;
-	int err;
+	int num, err;
 
 	if (!pdata)
 		return -EINVAL;
@@ -130,9 +129,7 @@ static int mlxcpld_mux_probe(struct platform_device *pdev)
 			/* discard unconfigured channels */
 			break;
 
-		force = pdata->adap_ids[num];
-
-		err = i2c_mux_add_adapter(muxc, force, num, 0);
+		err = i2c_mux_add_adapter(muxc, 0, pdata->chan_ids[num], 0);
 		if (err)
 			goto virt_reg_failed;
 	}
diff --git a/include/linux/platform_data/mlxcpld.h b/include/linux/platform_data/mlxcpld.h
index e6c18bf017dd..04d93c563c04 100644
--- a/include/linux/platform_data/mlxcpld.h
+++ b/include/linux/platform_data/mlxcpld.h
@@ -11,12 +11,12 @@
 /* Platform data for the CPLD I2C multiplexers */
 
 /* mlxcpld_mux_plat_data - per mux data, used with i2c_register_board_info
- * @adap_ids - adapter array
+ * @chan_ids - channels array
  * @num_adaps - number of adapters
  * @sel_reg_addr - mux select register offset in CPLD space
  */
 struct mlxcpld_mux_plat_data {
-	int *adap_ids;
+	int *chan_ids;
 	int num_adaps;
 	int sel_reg_addr;
 };
-- 
cgit v1.2.3


From c52a1c5f5db55c6a71110c2db9ae26b9f5269d20 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@nvidia.com>
Date: Mon, 8 Feb 2021 22:16:04 +0200
Subject: i2c: mux: mlxcpld: Extend driver to support word address space
 devices

Extend driver to allow I2C routing control through CPLD devices with
word address space. Till now only CPLD devices with byte address space
have been supported.

Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Reviewed-by: Michael Shych <michaelsh@nvidia.com>
Acked-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/muxes/i2c-mux-mlxcpld.c   | 47 +++++++++++++++++++++++++++++------
 include/linux/platform_data/mlxcpld.h |  2 ++
 2 files changed, 41 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/muxes/i2c-mux-mlxcpld.c b/drivers/i2c/muxes/i2c-mux-mlxcpld.c
index e99a7ad09886..10767ad4adb4 100644
--- a/drivers/i2c/muxes/i2c-mux-mlxcpld.c
+++ b/drivers/i2c/muxes/i2c-mux-mlxcpld.c
@@ -63,19 +63,39 @@ static int mlxcpld_mux_reg_write(struct i2c_adapter *adap,
 				 struct mlxcpld_mux *mux, u32 val)
 {
 	struct i2c_client *client = mux->client;
-	union i2c_smbus_data data = { .byte = val };
-
-	return __i2c_smbus_xfer(adap, client->addr, client->flags,
-				I2C_SMBUS_WRITE, mux->pdata.sel_reg_addr,
-				I2C_SMBUS_BYTE_DATA, &data);
+	union i2c_smbus_data data;
+	struct i2c_msg msg;
+	u8 buf[3];
+
+	switch (mux->pdata.reg_size) {
+	case 1:
+		data.byte = val;
+		return __i2c_smbus_xfer(adap, client->addr, client->flags,
+					I2C_SMBUS_WRITE, mux->pdata.sel_reg_addr,
+					I2C_SMBUS_BYTE_DATA, &data);
+	case 2:
+		buf[0] = mux->pdata.sel_reg_addr >> 8;
+		buf[1] = mux->pdata.sel_reg_addr;
+		buf[2] = val;
+		msg.addr = client->addr;
+		msg.buf = buf;
+		msg.len = mux->pdata.reg_size + 1;
+		msg.flags = 0;
+		return __i2c_transfer(adap, &msg, 1);
+	default:
+		return -EINVAL;
+	}
 }
 
 static int mlxcpld_mux_select_chan(struct i2c_mux_core *muxc, u32 chan)
 {
 	struct mlxcpld_mux *mux = i2c_mux_priv(muxc);
-	u32 regval = chan + 1;
+	u32 regval = chan;
 	int err = 0;
 
+	if (mux->pdata.reg_size == 1)
+		regval += 1;
+
 	/* Only select the channel if its different from the last channel */
 	if (mux->last_val != regval) {
 		err = mlxcpld_mux_reg_write(muxc->parent, mux, regval);
@@ -103,12 +123,23 @@ static int mlxcpld_mux_probe(struct platform_device *pdev)
 	struct i2c_mux_core *muxc;
 	struct mlxcpld_mux *data;
 	int num, err;
+	u32 func;
 
 	if (!pdata)
 		return -EINVAL;
 
-	if (!i2c_check_functionality(client->adapter,
-				     I2C_FUNC_SMBUS_WRITE_BYTE_DATA))
+	switch (pdata->reg_size) {
+	case 1:
+		func = I2C_FUNC_SMBUS_WRITE_BYTE_DATA;
+		break;
+	case 2:
+		func = I2C_FUNC_I2C;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!i2c_check_functionality(client->adapter, func))
 		return -ENODEV;
 
 	muxc = i2c_mux_alloc(client->adapter, &pdev->dev, CPLD_MUX_MAX_NCHANS,
diff --git a/include/linux/platform_data/mlxcpld.h b/include/linux/platform_data/mlxcpld.h
index 04d93c563c04..a7bee798d991 100644
--- a/include/linux/platform_data/mlxcpld.h
+++ b/include/linux/platform_data/mlxcpld.h
@@ -14,11 +14,13 @@
  * @chan_ids - channels array
  * @num_adaps - number of adapters
  * @sel_reg_addr - mux select register offset in CPLD space
+ * @reg_size: register size in bytes
  */
 struct mlxcpld_mux_plat_data {
 	int *chan_ids;
 	int num_adaps;
 	int sel_reg_addr;
+	u8 reg_size;
 };
 
 #endif /* _LINUX_I2C_MLXCPLD_H */
-- 
cgit v1.2.3


From a39bd92e92b96d05d676fb5c9493cf1c911d2a0a Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@nvidia.com>
Date: Mon, 8 Feb 2021 22:16:06 +0200
Subject: i2c: mux: mlxcpld: Add callback to notify mux creation completion

Add notification to inform caller that mux objects array has been
created. It allows to user, invoked platform device registration for
"i2c-mux-mlxcpld" driver, to be notified that mux infrastructure is
available, and thus some devices could be connected to this
infrastructure.

Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Acked-by: Peter Rosin <peda@axentia.se>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/muxes/i2c-mux-mlxcpld.c   | 4 ++++
 include/linux/platform_data/mlxcpld.h | 5 +++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i2c/muxes/i2c-mux-mlxcpld.c b/drivers/i2c/muxes/i2c-mux-mlxcpld.c
index 5e0672f9979b..1a879f6a31ef 100644
--- a/drivers/i2c/muxes/i2c-mux-mlxcpld.c
+++ b/drivers/i2c/muxes/i2c-mux-mlxcpld.c
@@ -159,6 +159,10 @@ static int mlxcpld_mux_probe(struct platform_device *pdev)
 			goto virt_reg_failed;
 	}
 
+	/* Notify caller when all channels' adapters are created. */
+	if (pdata->completion_notify)
+		pdata->completion_notify(pdata->handle, muxc->parent, muxc->adapter);
+
 	return 0;
 
 virt_reg_failed:
diff --git a/include/linux/platform_data/mlxcpld.h b/include/linux/platform_data/mlxcpld.h
index a7bee798d991..d7610b528856 100644
--- a/include/linux/platform_data/mlxcpld.h
+++ b/include/linux/platform_data/mlxcpld.h
@@ -15,12 +15,17 @@
  * @num_adaps - number of adapters
  * @sel_reg_addr - mux select register offset in CPLD space
  * @reg_size: register size in bytes
+ * @handle: handle to be passed by callback
+ * @completion_notify: callback to notify when all the adapters are created
  */
 struct mlxcpld_mux_plat_data {
 	int *chan_ids;
 	int num_adaps;
 	int sel_reg_addr;
 	u8 reg_size;
+	void *handle;
+	int (*completion_notify)(void *handle, struct i2c_adapter *parent,
+				 struct i2c_adapter *adapters[]);
 };
 
 #endif /* _LINUX_I2C_MLXCPLD_H */
-- 
cgit v1.2.3


From 3c5960c0559c44c6628341a82167ee0d3e40ee50 Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso@google.com>
Date: Fri, 5 Feb 2021 11:34:09 +0800
Subject: usb: typec: Manage SVDM version

PD Spec Revision 3.0 Version 2.0 + ECNs 2020-12-10
  6.4.4.2.3 Structured VDM Version
  "The Structured VDM Version field of the Discover Identity Command
  sent and received during VDM discovery Shall be used to determine the
  lowest common Structured VDM Version supported by the Port Partners or
  Cable Plug and Shall continue to operate using this Specification
  Revision until they are Detached."

Add a variable in typec_capability to specify the highest SVDM version
supported by the port and another variable in typec_partner to cache the
negotiated SVDM version between the port and the partner.

Also add setter/getter functions for the negotiated SVDM version.

Acked-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Kyle Tso <kyletso@google.com>
Link: https://lore.kernel.org/r/20210205033415.3320439-2-kyletso@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/class.c         | 43 +++++++++++++++++++++++++++++++++++++++
 include/linux/usb/typec.h         | 12 +++++++++++
 include/linux/usb/typec_altmode.h | 10 +++++++++
 3 files changed, 65 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index b4a5d9d4564f..45f0bf65e9ab 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -38,6 +38,7 @@ struct typec_partner {
 	struct ida			mode_ids;
 	int				num_altmodes;
 	u16				pd_revision; /* 0300H = "3.0" */
+	enum usb_pd_svdm_ver		svdm_version;
 };
 
 struct typec_port {
@@ -824,6 +825,20 @@ typec_partner_register_altmode(struct typec_partner *partner,
 }
 EXPORT_SYMBOL_GPL(typec_partner_register_altmode);
 
+/**
+ * typec_partner_set_svdm_version - Set negotiated Structured VDM (SVDM) Version
+ * @partner: USB Type-C Partner that supports SVDM
+ * @svdm_version: Negotiated SVDM Version
+ *
+ * This routine is used to save the negotiated SVDM Version.
+ */
+void typec_partner_set_svdm_version(struct typec_partner *partner,
+				   enum usb_pd_svdm_ver svdm_version)
+{
+	partner->svdm_version = svdm_version;
+}
+EXPORT_SYMBOL_GPL(typec_partner_set_svdm_version);
+
 /**
  * typec_register_partner - Register a USB Type-C Partner
  * @port: The USB Type-C Port the partner is connected to
@@ -848,6 +863,7 @@ struct typec_partner *typec_register_partner(struct typec_port *port,
 	partner->accessory = desc->accessory;
 	partner->num_altmodes = -1;
 	partner->pd_revision = desc->pd_revision;
+	partner->svdm_version = port->cap->svdm_version;
 
 	if (desc->identity) {
 		/*
@@ -1894,6 +1910,33 @@ EXPORT_SYMBOL_GPL(typec_set_mode);
 
 /* --------------------------------------- */
 
+/**
+ * typec_get_negotiated_svdm_version - Get negotiated SVDM Version
+ * @port: USB Type-C Port.
+ *
+ * Get the negotiated SVDM Version. The Version is set to the port default
+ * value stored in typec_capability on partner registration, and updated after
+ * a successful Discover Identity if the negotiated value is less than the
+ * default value.
+ *
+ * Returns usb_pd_svdm_ver if the partner has been registered otherwise -ENODEV.
+ */
+int typec_get_negotiated_svdm_version(struct typec_port *port)
+{
+	enum usb_pd_svdm_ver svdm_version;
+	struct device *partner_dev;
+
+	partner_dev = device_find_child(&port->dev, NULL, partner_match);
+	if (!partner_dev)
+		return -ENODEV;
+
+	svdm_version = to_typec_partner(partner_dev)->svdm_version;
+	put_device(partner_dev);
+
+	return svdm_version;
+}
+EXPORT_SYMBOL_GPL(typec_get_negotiated_svdm_version);
+
 /**
  * typec_get_drvdata - Return private driver data pointer
  * @port: USB Type-C port
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index a94df82ab62f..91b4303ca305 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -217,12 +217,19 @@ struct typec_operations {
 			     enum typec_port_type type);
 };
 
+enum usb_pd_svdm_ver {
+	SVDM_VER_1_0 = 0,
+	SVDM_VER_2_0 = 1,
+	SVDM_VER_MAX = SVDM_VER_2_0,
+};
+
 /*
  * struct typec_capability - USB Type-C Port Capabilities
  * @type: Supported power role of the port
  * @data: Supported data role of the port
  * @revision: USB Type-C Specification release. Binary coded decimal
  * @pd_revision: USB Power Delivery Specification revision if supported
+ * @svdm_version: USB PD Structured VDM version if supported
  * @prefer_role: Initial role preference (DRP ports).
  * @accessory: Supported Accessory Modes
  * @fwnode: Optional fwnode of the port
@@ -236,6 +243,7 @@ struct typec_capability {
 	enum typec_port_data	data;
 	u16			revision; /* 0120H = "1.2" */
 	u16			pd_revision; /* 0300H = "3.0" */
+	enum usb_pd_svdm_ver	svdm_version;
 	int			prefer_role;
 	enum typec_accessory	accessory[TYPEC_MAX_ACCESSORY];
 	unsigned int		orientation_aware:1;
@@ -286,4 +294,8 @@ int typec_find_orientation(const char *name);
 int typec_find_port_power_role(const char *name);
 int typec_find_power_role(const char *name);
 int typec_find_port_data_role(const char *name);
+
+void typec_partner_set_svdm_version(struct typec_partner *partner,
+				    enum usb_pd_svdm_ver svdm_version);
+int typec_get_negotiated_svdm_version(struct typec_port *port);
 #endif /* __LINUX_USB_TYPEC_H */
diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h
index 5e0a7b7647c3..65933cbe9129 100644
--- a/include/linux/usb/typec_altmode.h
+++ b/include/linux/usb/typec_altmode.h
@@ -132,6 +132,16 @@ typec_altmode_get_orientation(struct typec_altmode *altmode)
 	return typec_get_orientation(typec_altmode2port(altmode));
 }
 
+/**
+ * typec_altmode_get_svdm_version - Get negotiated SVDM version
+ * @altmode: Handle to the alternate mode
+ */
+static inline int
+typec_altmode_get_svdm_version(struct typec_altmode *altmode)
+{
+	return typec_get_negotiated_svdm_version(typec_altmode2port(altmode));
+}
+
 /**
  * struct typec_altmode_driver - USB Type-C alternate mode device driver
  * @id_table: Null terminated array of SVIDs
-- 
cgit v1.2.3


From 31737c27d665bb3bc8ad9396c63fae2543dd8818 Mon Sep 17 00:00:00 2001
From: Kyle Tso <kyletso@google.com>
Date: Fri, 5 Feb 2021 11:34:10 +0800
Subject: usb: pd: Make SVDM Version configurable in VDM header

PD Rev 3.0 introduces SVDM Version 2.0. This patch makes the field
configuable in the header in order to be able to be compatible with
older SVDM version.

Acked-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Kyle Tso <kyletso@google.com>
Link: https://lore.kernel.org/r/20210205033415.3320439-3-kyletso@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/altmodes/displayport.c |  2 +-
 drivers/usb/typec/tcpm/tcpm.c            | 16 ++++++++--------
 drivers/usb/typec/ucsi/displayport.c     |  6 +++---
 include/linux/usb/pd_vdo.h               |  7 +++++--
 4 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/altmodes/displayport.c b/drivers/usb/typec/altmodes/displayport.c
index e62e5e3da01e..0abc3121238f 100644
--- a/drivers/usb/typec/altmodes/displayport.c
+++ b/drivers/usb/typec/altmodes/displayport.c
@@ -15,7 +15,7 @@
 #include <linux/usb/typec_dp.h>
 #include "displayport.h"
 
-#define DP_HEADER(_dp, cmd)		(VDO((_dp)->alt->svid, 1, cmd) | \
+#define DP_HEADER(_dp, cmd)		(VDO((_dp)->alt->svid, 1, SVDM_VER_1_0, cmd) | \
 					 VDO_OPOS(USB_TYPEC_DP_MODE))
 
 enum {
diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 8558ab006885..9aadb1e1bec5 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -1544,17 +1544,17 @@ static int tcpm_pd_svdm(struct tcpm_port *port, struct typec_altmode *adev,
 		case CMD_DISCOVER_IDENT:
 			/* 6.4.4.3.1 */
 			svdm_consume_identity(port, p, cnt);
-			response[0] = VDO(USB_SID_PD, 1, CMD_DISCOVER_SVID);
+			response[0] = VDO(USB_SID_PD, 1, SVDM_VER_1_0, CMD_DISCOVER_SVID);
 			rlen = 1;
 			break;
 		case CMD_DISCOVER_SVID:
 			/* 6.4.4.3.2 */
 			if (svdm_consume_svids(port, p, cnt)) {
-				response[0] = VDO(USB_SID_PD, 1,
+				response[0] = VDO(USB_SID_PD, 1, SVDM_VER_1_0,
 						  CMD_DISCOVER_SVID);
 				rlen = 1;
 			} else if (modep->nsvids && supports_modal(port)) {
-				response[0] = VDO(modep->svids[0], 1,
+				response[0] = VDO(modep->svids[0], 1, SVDM_VER_1_0,
 						  CMD_DISCOVER_MODES);
 				rlen = 1;
 			}
@@ -1565,7 +1565,7 @@ static int tcpm_pd_svdm(struct tcpm_port *port, struct typec_altmode *adev,
 			modep->svid_index++;
 			if (modep->svid_index < modep->nsvids) {
 				u16 svid = modep->svids[modep->svid_index];
-				response[0] = VDO(svid, 1, CMD_DISCOVER_MODES);
+				response[0] = VDO(svid, 1, SVDM_VER_1_0, CMD_DISCOVER_MODES);
 				rlen = 1;
 			} else {
 				tcpm_register_partner_altmodes(port);
@@ -1695,7 +1695,7 @@ static void tcpm_handle_vdm_request(struct tcpm_port *port,
 			break;
 		case ADEV_QUEUE_VDM_SEND_EXIT_MODE_ON_FAIL:
 			if (typec_altmode_vdm(adev, p[0], &p[1], cnt)) {
-				response[0] = VDO(adev->svid, 1, CMD_EXIT_MODE);
+				response[0] = VDO(adev->svid, 1, SVDM_VER_1_0, CMD_EXIT_MODE);
 				response[0] |= VDO_OPOS(adev->mode);
 				rlen = 1;
 			}
@@ -1729,7 +1729,7 @@ static void tcpm_send_vdm(struct tcpm_port *port, u32 vid, int cmd,
 
 	/* set VDM header with VID & CMD */
 	header = VDO(vid, ((vid & USB_SID_PD) == USB_SID_PD) ?
-			1 : (PD_VDO_CMD(cmd) <= CMD_ATTENTION), cmd);
+			1 : (PD_VDO_CMD(cmd) <= CMD_ATTENTION), SVDM_VER_1_0, cmd);
 	tcpm_queue_vdm(port, header, data, count);
 }
 
@@ -2024,7 +2024,7 @@ static int tcpm_altmode_enter(struct typec_altmode *altmode, u32 *vdo)
 	struct tcpm_port *port = typec_altmode_get_drvdata(altmode);
 	u32 header;
 
-	header = VDO(altmode->svid, vdo ? 2 : 1, CMD_ENTER_MODE);
+	header = VDO(altmode->svid, vdo ? 2 : 1, SVDM_VER_1_0, CMD_ENTER_MODE);
 	header |= VDO_OPOS(altmode->mode);
 
 	tcpm_queue_vdm_unlocked(port, header, vdo, vdo ? 1 : 0);
@@ -2036,7 +2036,7 @@ static int tcpm_altmode_exit(struct typec_altmode *altmode)
 	struct tcpm_port *port = typec_altmode_get_drvdata(altmode);
 	u32 header;
 
-	header = VDO(altmode->svid, 1, CMD_EXIT_MODE);
+	header = VDO(altmode->svid, 1, SVDM_VER_1_0, CMD_EXIT_MODE);
 	header |= VDO_OPOS(altmode->mode);
 
 	tcpm_queue_vdm_unlocked(port, header, NULL, 0);
diff --git a/drivers/usb/typec/ucsi/displayport.c b/drivers/usb/typec/ucsi/displayport.c
index 261131c9e37c..1d387bddefb9 100644
--- a/drivers/usb/typec/ucsi/displayport.c
+++ b/drivers/usb/typec/ucsi/displayport.c
@@ -83,7 +83,7 @@ static int ucsi_displayport_enter(struct typec_altmode *alt, u32 *vdo)
 	 * mode, and letting the alt mode driver continue.
 	 */
 
-	dp->header = VDO(USB_TYPEC_DP_SID, 1, CMD_ENTER_MODE);
+	dp->header = VDO(USB_TYPEC_DP_SID, 1, SVDM_VER_1_0, CMD_ENTER_MODE);
 	dp->header |= VDO_OPOS(USB_TYPEC_DP_MODE);
 	dp->header |= VDO_CMDT(CMDT_RSP_ACK);
 
@@ -120,7 +120,7 @@ static int ucsi_displayport_exit(struct typec_altmode *alt)
 	if (ret < 0)
 		goto out_unlock;
 
-	dp->header = VDO(USB_TYPEC_DP_SID, 1, CMD_EXIT_MODE);
+	dp->header = VDO(USB_TYPEC_DP_SID, 1, SVDM_VER_1_0, CMD_EXIT_MODE);
 	dp->header |= VDO_OPOS(USB_TYPEC_DP_MODE);
 	dp->header |= VDO_CMDT(CMDT_RSP_ACK);
 
@@ -200,7 +200,7 @@ static int ucsi_displayport_vdm(struct typec_altmode *alt,
 
 	switch (cmd_type) {
 	case CMDT_INIT:
-		dp->header = VDO(USB_TYPEC_DP_SID, 1, cmd);
+		dp->header = VDO(USB_TYPEC_DP_SID, 1, SVDM_VER_1_0, cmd);
 		dp->header |= VDO_OPOS(USB_TYPEC_DP_MODE);
 
 		switch (cmd) {
diff --git a/include/linux/usb/pd_vdo.h b/include/linux/usb/pd_vdo.h
index 5de7f550f93e..b057250704e8 100644
--- a/include/linux/usb/pd_vdo.h
+++ b/include/linux/usb/pd_vdo.h
@@ -21,22 +21,24 @@
  * ----------
  * <31:16>  :: SVID
  * <15>     :: VDM type ( 1b == structured, 0b == unstructured )
- * <14:13>  :: Structured VDM version (can only be 00 == 1.0 currently)
+ * <14:13>  :: Structured VDM version
  * <12:11>  :: reserved
  * <10:8>   :: object position (1-7 valid ... used for enter/exit mode only)
  * <7:6>    :: command type (SVDM only?)
  * <5>      :: reserved (SVDM), command type (UVDM)
  * <4:0>    :: command
  */
-#define VDO(vid, type, custom)				\
+#define VDO(vid, type, ver, custom)			\
 	(((vid) << 16) |				\
 	 ((type) << 15) |				\
+	 ((ver) << 13) |				\
 	 ((custom) & 0x7FFF))
 
 #define VDO_SVDM_TYPE		(1 << 15)
 #define VDO_SVDM_VERS(x)	((x) << 13)
 #define VDO_OPOS(x)		((x) << 8)
 #define VDO_CMDT(x)		((x) << 6)
+#define VDO_SVDM_VERS_MASK	VDO_SVDM_VERS(0x3)
 #define VDO_OPOS_MASK		VDO_OPOS(0x7)
 #define VDO_CMDT_MASK		VDO_CMDT(0x3)
 
@@ -74,6 +76,7 @@
 
 #define PD_VDO_VID(vdo)		((vdo) >> 16)
 #define PD_VDO_SVDM(vdo)	(((vdo) >> 15) & 1)
+#define PD_VDO_SVDM_VER(vdo)	(((vdo) >> 13) & 0x3)
 #define PD_VDO_OPOS(vdo)	(((vdo) >> 8) & 0x7)
 #define PD_VDO_CMD(vdo)		((vdo) & 0x1f)
 #define PD_VDO_CMDT(vdo)	(((vdo) >> 6) & 0x3)
-- 
cgit v1.2.3


From 1077d4367ab3b97f6db2f66c87289af863652215 Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@xilinx.com>
Date: Wed, 3 Feb 2021 15:13:50 +0100
Subject: firmware: xilinx: Use explicit values for all enum values

Based on discussion at
https://lore.kernel.org/r/20200318125003.GA2727094@kroah.com we got
recommendation to use explicit values for all enum values.
The patch is following this recommendation.

Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Link: https://lore.kernel.org/r/daeb67ded45d8a8f6a96717d1fb9c84439dd2ae8.1612361627.git.michal.simek@xilinx.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 338 +++++++++++++++++------------------
 1 file changed, 169 insertions(+), 169 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 0a483249ff30..71177b17eee5 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -64,25 +64,25 @@ enum pm_api_id {
 	PM_GET_API_VERSION = 1,
 	PM_SYSTEM_SHUTDOWN = 12,
 	PM_REQUEST_NODE = 13,
-	PM_RELEASE_NODE,
-	PM_SET_REQUIREMENT,
+	PM_RELEASE_NODE = 14,
+	PM_SET_REQUIREMENT = 15,
 	PM_RESET_ASSERT = 17,
-	PM_RESET_GET_STATUS,
+	PM_RESET_GET_STATUS = 18,
 	PM_PM_INIT_FINALIZE = 21,
-	PM_FPGA_LOAD,
-	PM_FPGA_GET_STATUS,
+	PM_FPGA_LOAD = 22,
+	PM_FPGA_GET_STATUS = 23,
 	PM_GET_CHIPID = 24,
 	PM_IOCTL = 34,
-	PM_QUERY_DATA,
-	PM_CLOCK_ENABLE,
-	PM_CLOCK_DISABLE,
-	PM_CLOCK_GETSTATE,
-	PM_CLOCK_SETDIVIDER,
-	PM_CLOCK_GETDIVIDER,
-	PM_CLOCK_SETRATE,
-	PM_CLOCK_GETRATE,
-	PM_CLOCK_SETPARENT,
-	PM_CLOCK_GETPARENT,
+	PM_QUERY_DATA = 35,
+	PM_CLOCK_ENABLE = 36,
+	PM_CLOCK_DISABLE = 37,
+	PM_CLOCK_GETSTATE = 38,
+	PM_CLOCK_SETDIVIDER = 39,
+	PM_CLOCK_GETDIVIDER = 40,
+	PM_CLOCK_SETRATE = 41,
+	PM_CLOCK_GETRATE = 42,
+	PM_CLOCK_SETPARENT = 43,
+	PM_CLOCK_GETPARENT = 44,
 	PM_SECURE_AES = 47,
 	PM_FEATURE_CHECK = 63,
 };
@@ -92,21 +92,21 @@ enum pm_ret_status {
 	XST_PM_SUCCESS = 0,
 	XST_PM_NO_FEATURE = 19,
 	XST_PM_INTERNAL = 2000,
-	XST_PM_CONFLICT,
-	XST_PM_NO_ACCESS,
-	XST_PM_INVALID_NODE,
-	XST_PM_DOUBLE_REQ,
-	XST_PM_ABORT_SUSPEND,
+	XST_PM_CONFLICT = 2001,
+	XST_PM_NO_ACCESS = 2002,
+	XST_PM_INVALID_NODE = 2003,
+	XST_PM_DOUBLE_REQ = 2004,
+	XST_PM_ABORT_SUSPEND = 2005,
 	XST_PM_MULT_USER = 2008,
 };
 
 enum pm_ioctl_id {
 	IOCTL_SD_DLL_RESET = 6,
-	IOCTL_SET_SD_TAPDELAY,
-	IOCTL_SET_PLL_FRAC_MODE,
-	IOCTL_GET_PLL_FRAC_MODE,
-	IOCTL_SET_PLL_FRAC_DATA,
-	IOCTL_GET_PLL_FRAC_DATA,
+	IOCTL_SET_SD_TAPDELAY = 7,
+	IOCTL_SET_PLL_FRAC_MODE = 8,
+	IOCTL_GET_PLL_FRAC_MODE = 9,
+	IOCTL_SET_PLL_FRAC_DATA = 10,
+	IOCTL_GET_PLL_FRAC_DATA = 11,
 	IOCTL_WRITE_GGS = 12,
 	IOCTL_READ_GGS = 13,
 	IOCTL_WRITE_PGGS = 14,
@@ -116,185 +116,185 @@ enum pm_ioctl_id {
 };
 
 enum pm_query_id {
-	PM_QID_INVALID,
-	PM_QID_CLOCK_GET_NAME,
-	PM_QID_CLOCK_GET_TOPOLOGY,
-	PM_QID_CLOCK_GET_FIXEDFACTOR_PARAMS,
-	PM_QID_CLOCK_GET_PARENTS,
-	PM_QID_CLOCK_GET_ATTRIBUTES,
+	PM_QID_INVALID = 0,
+	PM_QID_CLOCK_GET_NAME = 1,
+	PM_QID_CLOCK_GET_TOPOLOGY = 2,
+	PM_QID_CLOCK_GET_FIXEDFACTOR_PARAMS = 3,
+	PM_QID_CLOCK_GET_PARENTS = 4,
+	PM_QID_CLOCK_GET_ATTRIBUTES = 5,
 	PM_QID_CLOCK_GET_NUM_CLOCKS = 12,
-	PM_QID_CLOCK_GET_MAX_DIVISOR,
+	PM_QID_CLOCK_GET_MAX_DIVISOR = 13,
 };
 
 enum zynqmp_pm_reset_action {
-	PM_RESET_ACTION_RELEASE,
-	PM_RESET_ACTION_ASSERT,
-	PM_RESET_ACTION_PULSE,
+	PM_RESET_ACTION_RELEASE = 0,
+	PM_RESET_ACTION_ASSERT = 1,
+	PM_RESET_ACTION_PULSE = 2,
 };
 
 enum zynqmp_pm_reset {
 	ZYNQMP_PM_RESET_START = 1000,
 	ZYNQMP_PM_RESET_PCIE_CFG = ZYNQMP_PM_RESET_START,
-	ZYNQMP_PM_RESET_PCIE_BRIDGE,
-	ZYNQMP_PM_RESET_PCIE_CTRL,
-	ZYNQMP_PM_RESET_DP,
-	ZYNQMP_PM_RESET_SWDT_CRF,
-	ZYNQMP_PM_RESET_AFI_FM5,
-	ZYNQMP_PM_RESET_AFI_FM4,
-	ZYNQMP_PM_RESET_AFI_FM3,
-	ZYNQMP_PM_RESET_AFI_FM2,
-	ZYNQMP_PM_RESET_AFI_FM1,
-	ZYNQMP_PM_RESET_AFI_FM0,
-	ZYNQMP_PM_RESET_GDMA,
-	ZYNQMP_PM_RESET_GPU_PP1,
-	ZYNQMP_PM_RESET_GPU_PP0,
-	ZYNQMP_PM_RESET_GPU,
-	ZYNQMP_PM_RESET_GT,
-	ZYNQMP_PM_RESET_SATA,
-	ZYNQMP_PM_RESET_ACPU3_PWRON,
-	ZYNQMP_PM_RESET_ACPU2_PWRON,
-	ZYNQMP_PM_RESET_ACPU1_PWRON,
-	ZYNQMP_PM_RESET_ACPU0_PWRON,
-	ZYNQMP_PM_RESET_APU_L2,
-	ZYNQMP_PM_RESET_ACPU3,
-	ZYNQMP_PM_RESET_ACPU2,
-	ZYNQMP_PM_RESET_ACPU1,
-	ZYNQMP_PM_RESET_ACPU0,
-	ZYNQMP_PM_RESET_DDR,
-	ZYNQMP_PM_RESET_APM_FPD,
-	ZYNQMP_PM_RESET_SOFT,
-	ZYNQMP_PM_RESET_GEM0,
-	ZYNQMP_PM_RESET_GEM1,
-	ZYNQMP_PM_RESET_GEM2,
-	ZYNQMP_PM_RESET_GEM3,
-	ZYNQMP_PM_RESET_QSPI,
-	ZYNQMP_PM_RESET_UART0,
-	ZYNQMP_PM_RESET_UART1,
-	ZYNQMP_PM_RESET_SPI0,
-	ZYNQMP_PM_RESET_SPI1,
-	ZYNQMP_PM_RESET_SDIO0,
-	ZYNQMP_PM_RESET_SDIO1,
-	ZYNQMP_PM_RESET_CAN0,
-	ZYNQMP_PM_RESET_CAN1,
-	ZYNQMP_PM_RESET_I2C0,
-	ZYNQMP_PM_RESET_I2C1,
-	ZYNQMP_PM_RESET_TTC0,
-	ZYNQMP_PM_RESET_TTC1,
-	ZYNQMP_PM_RESET_TTC2,
-	ZYNQMP_PM_RESET_TTC3,
-	ZYNQMP_PM_RESET_SWDT_CRL,
-	ZYNQMP_PM_RESET_NAND,
-	ZYNQMP_PM_RESET_ADMA,
-	ZYNQMP_PM_RESET_GPIO,
-	ZYNQMP_PM_RESET_IOU_CC,
-	ZYNQMP_PM_RESET_TIMESTAMP,
-	ZYNQMP_PM_RESET_RPU_R50,
-	ZYNQMP_PM_RESET_RPU_R51,
-	ZYNQMP_PM_RESET_RPU_AMBA,
-	ZYNQMP_PM_RESET_OCM,
-	ZYNQMP_PM_RESET_RPU_PGE,
-	ZYNQMP_PM_RESET_USB0_CORERESET,
-	ZYNQMP_PM_RESET_USB1_CORERESET,
-	ZYNQMP_PM_RESET_USB0_HIBERRESET,
-	ZYNQMP_PM_RESET_USB1_HIBERRESET,
-	ZYNQMP_PM_RESET_USB0_APB,
-	ZYNQMP_PM_RESET_USB1_APB,
-	ZYNQMP_PM_RESET_IPI,
-	ZYNQMP_PM_RESET_APM_LPD,
-	ZYNQMP_PM_RESET_RTC,
-	ZYNQMP_PM_RESET_SYSMON,
-	ZYNQMP_PM_RESET_AFI_FM6,
-	ZYNQMP_PM_RESET_LPD_SWDT,
-	ZYNQMP_PM_RESET_FPD,
-	ZYNQMP_PM_RESET_RPU_DBG1,
-	ZYNQMP_PM_RESET_RPU_DBG0,
-	ZYNQMP_PM_RESET_DBG_LPD,
-	ZYNQMP_PM_RESET_DBG_FPD,
-	ZYNQMP_PM_RESET_APLL,
-	ZYNQMP_PM_RESET_DPLL,
-	ZYNQMP_PM_RESET_VPLL,
-	ZYNQMP_PM_RESET_IOPLL,
-	ZYNQMP_PM_RESET_RPLL,
-	ZYNQMP_PM_RESET_GPO3_PL_0,
-	ZYNQMP_PM_RESET_GPO3_PL_1,
-	ZYNQMP_PM_RESET_GPO3_PL_2,
-	ZYNQMP_PM_RESET_GPO3_PL_3,
-	ZYNQMP_PM_RESET_GPO3_PL_4,
-	ZYNQMP_PM_RESET_GPO3_PL_5,
-	ZYNQMP_PM_RESET_GPO3_PL_6,
-	ZYNQMP_PM_RESET_GPO3_PL_7,
-	ZYNQMP_PM_RESET_GPO3_PL_8,
-	ZYNQMP_PM_RESET_GPO3_PL_9,
-	ZYNQMP_PM_RESET_GPO3_PL_10,
-	ZYNQMP_PM_RESET_GPO3_PL_11,
-	ZYNQMP_PM_RESET_GPO3_PL_12,
-	ZYNQMP_PM_RESET_GPO3_PL_13,
-	ZYNQMP_PM_RESET_GPO3_PL_14,
-	ZYNQMP_PM_RESET_GPO3_PL_15,
-	ZYNQMP_PM_RESET_GPO3_PL_16,
-	ZYNQMP_PM_RESET_GPO3_PL_17,
-	ZYNQMP_PM_RESET_GPO3_PL_18,
-	ZYNQMP_PM_RESET_GPO3_PL_19,
-	ZYNQMP_PM_RESET_GPO3_PL_20,
-	ZYNQMP_PM_RESET_GPO3_PL_21,
-	ZYNQMP_PM_RESET_GPO3_PL_22,
-	ZYNQMP_PM_RESET_GPO3_PL_23,
-	ZYNQMP_PM_RESET_GPO3_PL_24,
-	ZYNQMP_PM_RESET_GPO3_PL_25,
-	ZYNQMP_PM_RESET_GPO3_PL_26,
-	ZYNQMP_PM_RESET_GPO3_PL_27,
-	ZYNQMP_PM_RESET_GPO3_PL_28,
-	ZYNQMP_PM_RESET_GPO3_PL_29,
-	ZYNQMP_PM_RESET_GPO3_PL_30,
-	ZYNQMP_PM_RESET_GPO3_PL_31,
-	ZYNQMP_PM_RESET_RPU_LS,
-	ZYNQMP_PM_RESET_PS_ONLY,
-	ZYNQMP_PM_RESET_PL,
-	ZYNQMP_PM_RESET_PS_PL0,
-	ZYNQMP_PM_RESET_PS_PL1,
-	ZYNQMP_PM_RESET_PS_PL2,
-	ZYNQMP_PM_RESET_PS_PL3,
+	ZYNQMP_PM_RESET_PCIE_BRIDGE = 1001,
+	ZYNQMP_PM_RESET_PCIE_CTRL = 1002,
+	ZYNQMP_PM_RESET_DP = 1003,
+	ZYNQMP_PM_RESET_SWDT_CRF = 1004,
+	ZYNQMP_PM_RESET_AFI_FM5 = 1005,
+	ZYNQMP_PM_RESET_AFI_FM4 = 1006,
+	ZYNQMP_PM_RESET_AFI_FM3 = 1007,
+	ZYNQMP_PM_RESET_AFI_FM2 = 1008,
+	ZYNQMP_PM_RESET_AFI_FM1 = 1009,
+	ZYNQMP_PM_RESET_AFI_FM0 = 1010,
+	ZYNQMP_PM_RESET_GDMA = 1011,
+	ZYNQMP_PM_RESET_GPU_PP1 = 1012,
+	ZYNQMP_PM_RESET_GPU_PP0 = 1013,
+	ZYNQMP_PM_RESET_GPU = 1014,
+	ZYNQMP_PM_RESET_GT = 1015,
+	ZYNQMP_PM_RESET_SATA = 1016,
+	ZYNQMP_PM_RESET_ACPU3_PWRON = 1017,
+	ZYNQMP_PM_RESET_ACPU2_PWRON = 1018,
+	ZYNQMP_PM_RESET_ACPU1_PWRON = 1019,
+	ZYNQMP_PM_RESET_ACPU0_PWRON = 1020,
+	ZYNQMP_PM_RESET_APU_L2 = 1021,
+	ZYNQMP_PM_RESET_ACPU3 = 1022,
+	ZYNQMP_PM_RESET_ACPU2 = 1023,
+	ZYNQMP_PM_RESET_ACPU1 = 1024,
+	ZYNQMP_PM_RESET_ACPU0 = 1025,
+	ZYNQMP_PM_RESET_DDR = 1026,
+	ZYNQMP_PM_RESET_APM_FPD = 1027,
+	ZYNQMP_PM_RESET_SOFT = 1028,
+	ZYNQMP_PM_RESET_GEM0 = 1029,
+	ZYNQMP_PM_RESET_GEM1 = 1030,
+	ZYNQMP_PM_RESET_GEM2 = 1031,
+	ZYNQMP_PM_RESET_GEM3 = 1032,
+	ZYNQMP_PM_RESET_QSPI = 1033,
+	ZYNQMP_PM_RESET_UART0 = 1034,
+	ZYNQMP_PM_RESET_UART1 = 1035,
+	ZYNQMP_PM_RESET_SPI0 = 1036,
+	ZYNQMP_PM_RESET_SPI1 = 1037,
+	ZYNQMP_PM_RESET_SDIO0 = 1038,
+	ZYNQMP_PM_RESET_SDIO1 = 1039,
+	ZYNQMP_PM_RESET_CAN0 = 1040,
+	ZYNQMP_PM_RESET_CAN1 = 1041,
+	ZYNQMP_PM_RESET_I2C0 = 1042,
+	ZYNQMP_PM_RESET_I2C1 = 1043,
+	ZYNQMP_PM_RESET_TTC0 = 1044,
+	ZYNQMP_PM_RESET_TTC1 = 1045,
+	ZYNQMP_PM_RESET_TTC2 = 1046,
+	ZYNQMP_PM_RESET_TTC3 = 1047,
+	ZYNQMP_PM_RESET_SWDT_CRL = 1048,
+	ZYNQMP_PM_RESET_NAND = 1049,
+	ZYNQMP_PM_RESET_ADMA = 1050,
+	ZYNQMP_PM_RESET_GPIO = 1051,
+	ZYNQMP_PM_RESET_IOU_CC = 1052,
+	ZYNQMP_PM_RESET_TIMESTAMP = 1053,
+	ZYNQMP_PM_RESET_RPU_R50 = 1054,
+	ZYNQMP_PM_RESET_RPU_R51 = 1055,
+	ZYNQMP_PM_RESET_RPU_AMBA = 1056,
+	ZYNQMP_PM_RESET_OCM = 1057,
+	ZYNQMP_PM_RESET_RPU_PGE = 1058,
+	ZYNQMP_PM_RESET_USB0_CORERESET = 1059,
+	ZYNQMP_PM_RESET_USB1_CORERESET = 1060,
+	ZYNQMP_PM_RESET_USB0_HIBERRESET = 1061,
+	ZYNQMP_PM_RESET_USB1_HIBERRESET = 1062,
+	ZYNQMP_PM_RESET_USB0_APB = 1063,
+	ZYNQMP_PM_RESET_USB1_APB = 1064,
+	ZYNQMP_PM_RESET_IPI = 1065,
+	ZYNQMP_PM_RESET_APM_LPD = 1066,
+	ZYNQMP_PM_RESET_RTC = 1067,
+	ZYNQMP_PM_RESET_SYSMON = 1068,
+	ZYNQMP_PM_RESET_AFI_FM6 = 1069,
+	ZYNQMP_PM_RESET_LPD_SWDT = 1070,
+	ZYNQMP_PM_RESET_FPD = 1071,
+	ZYNQMP_PM_RESET_RPU_DBG1 = 1072,
+	ZYNQMP_PM_RESET_RPU_DBG0 = 1073,
+	ZYNQMP_PM_RESET_DBG_LPD = 1074,
+	ZYNQMP_PM_RESET_DBG_FPD = 1075,
+	ZYNQMP_PM_RESET_APLL = 1076,
+	ZYNQMP_PM_RESET_DPLL = 1077,
+	ZYNQMP_PM_RESET_VPLL = 1078,
+	ZYNQMP_PM_RESET_IOPLL = 1079,
+	ZYNQMP_PM_RESET_RPLL = 1080,
+	ZYNQMP_PM_RESET_GPO3_PL_0 = 1081,
+	ZYNQMP_PM_RESET_GPO3_PL_1 = 1082,
+	ZYNQMP_PM_RESET_GPO3_PL_2 = 1083,
+	ZYNQMP_PM_RESET_GPO3_PL_3 = 1084,
+	ZYNQMP_PM_RESET_GPO3_PL_4 = 1085,
+	ZYNQMP_PM_RESET_GPO3_PL_5 = 1086,
+	ZYNQMP_PM_RESET_GPO3_PL_6 = 1087,
+	ZYNQMP_PM_RESET_GPO3_PL_7 = 1088,
+	ZYNQMP_PM_RESET_GPO3_PL_8 = 1089,
+	ZYNQMP_PM_RESET_GPO3_PL_9 = 1090,
+	ZYNQMP_PM_RESET_GPO3_PL_10 = 1091,
+	ZYNQMP_PM_RESET_GPO3_PL_11 = 1092,
+	ZYNQMP_PM_RESET_GPO3_PL_12 = 1093,
+	ZYNQMP_PM_RESET_GPO3_PL_13 = 1094,
+	ZYNQMP_PM_RESET_GPO3_PL_14 = 1095,
+	ZYNQMP_PM_RESET_GPO3_PL_15 = 1096,
+	ZYNQMP_PM_RESET_GPO3_PL_16 = 1097,
+	ZYNQMP_PM_RESET_GPO3_PL_17 = 1098,
+	ZYNQMP_PM_RESET_GPO3_PL_18 = 1099,
+	ZYNQMP_PM_RESET_GPO3_PL_19 = 1100,
+	ZYNQMP_PM_RESET_GPO3_PL_20 = 1101,
+	ZYNQMP_PM_RESET_GPO3_PL_21 = 1102,
+	ZYNQMP_PM_RESET_GPO3_PL_22 = 1103,
+	ZYNQMP_PM_RESET_GPO3_PL_23 = 1104,
+	ZYNQMP_PM_RESET_GPO3_PL_24 = 1105,
+	ZYNQMP_PM_RESET_GPO3_PL_25 = 1106,
+	ZYNQMP_PM_RESET_GPO3_PL_26 = 1107,
+	ZYNQMP_PM_RESET_GPO3_PL_27 = 1108,
+	ZYNQMP_PM_RESET_GPO3_PL_28 = 1109,
+	ZYNQMP_PM_RESET_GPO3_PL_29 = 1110,
+	ZYNQMP_PM_RESET_GPO3_PL_30 = 1111,
+	ZYNQMP_PM_RESET_GPO3_PL_31 = 1112,
+	ZYNQMP_PM_RESET_RPU_LS = 1113,
+	ZYNQMP_PM_RESET_PS_ONLY = 1114,
+	ZYNQMP_PM_RESET_PL = 1115,
+	ZYNQMP_PM_RESET_PS_PL0 = 1116,
+	ZYNQMP_PM_RESET_PS_PL1 = 1117,
+	ZYNQMP_PM_RESET_PS_PL2 = 1118,
+	ZYNQMP_PM_RESET_PS_PL3 = 1119,
 	ZYNQMP_PM_RESET_END = ZYNQMP_PM_RESET_PS_PL3
 };
 
 enum zynqmp_pm_suspend_reason {
 	SUSPEND_POWER_REQUEST = 201,
-	SUSPEND_ALERT,
-	SUSPEND_SYSTEM_SHUTDOWN,
+	SUSPEND_ALERT = 202,
+	SUSPEND_SYSTEM_SHUTDOWN = 203,
 };
 
 enum zynqmp_pm_request_ack {
 	ZYNQMP_PM_REQUEST_ACK_NO = 1,
-	ZYNQMP_PM_REQUEST_ACK_BLOCKING,
-	ZYNQMP_PM_REQUEST_ACK_NON_BLOCKING,
+	ZYNQMP_PM_REQUEST_ACK_BLOCKING = 2,
+	ZYNQMP_PM_REQUEST_ACK_NON_BLOCKING = 3,
 };
 
 enum pm_node_id {
 	NODE_SD_0 = 39,
-	NODE_SD_1,
+	NODE_SD_1 = 40,
 };
 
 enum tap_delay_type {
 	PM_TAPDELAY_INPUT = 0,
-	PM_TAPDELAY_OUTPUT,
+	PM_TAPDELAY_OUTPUT = 1,
 };
 
 enum dll_reset_type {
-	PM_DLL_RESET_ASSERT,
-	PM_DLL_RESET_RELEASE,
-	PM_DLL_RESET_PULSE,
+	PM_DLL_RESET_ASSERT = 0,
+	PM_DLL_RESET_RELEASE = 1,
+	PM_DLL_RESET_PULSE = 2,
 };
 
 enum zynqmp_pm_shutdown_type {
-	ZYNQMP_PM_SHUTDOWN_TYPE_SHUTDOWN,
-	ZYNQMP_PM_SHUTDOWN_TYPE_RESET,
-	ZYNQMP_PM_SHUTDOWN_TYPE_SETSCOPE_ONLY,
+	ZYNQMP_PM_SHUTDOWN_TYPE_SHUTDOWN = 0,
+	ZYNQMP_PM_SHUTDOWN_TYPE_RESET = 1,
+	ZYNQMP_PM_SHUTDOWN_TYPE_SETSCOPE_ONLY = 2,
 };
 
 enum zynqmp_pm_shutdown_subtype {
-	ZYNQMP_PM_SHUTDOWN_SUBTYPE_SUBSYSTEM,
-	ZYNQMP_PM_SHUTDOWN_SUBTYPE_PS_ONLY,
-	ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM,
+	ZYNQMP_PM_SHUTDOWN_SUBTYPE_SUBSYSTEM = 0,
+	ZYNQMP_PM_SHUTDOWN_SUBTYPE_PS_ONLY = 1,
+	ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM = 2,
 };
 
 /**
-- 
cgit v1.2.3


From 2adc75fba3289455b9c4349dd6b95cfb7167b7ea Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <uwe@kleine-koenig.org>
Date: Wed, 27 Jan 2021 22:23:29 +0100
Subject: vme: make remove callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver core ignores the return value of struct bus_type::remove()
because there is only little that can be done. To simplify the quest to
make this function return void, let struct vme_driver::remove return void,
too. There is only a single vme driver and it already returns 0
unconditionally in .remove().

Also fix the bus remove function to always return 0.

Signed-off-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Link: https://lore.kernel.org/r/20210127212329.98517-1-uwe@kleine-koenig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/vme/devices/vme_user.c | 4 +---
 drivers/vme/vme.c                      | 4 ++--
 include/linux/vme.h                    | 2 +-
 3 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/vme/devices/vme_user.c b/drivers/staging/vme/devices/vme_user.c
index fd0ea4dbcb91..35d7260e2271 100644
--- a/drivers/staging/vme/devices/vme_user.c
+++ b/drivers/staging/vme/devices/vme_user.c
@@ -689,7 +689,7 @@ err_dev:
 	return err;
 }
 
-static int vme_user_remove(struct vme_dev *dev)
+static void vme_user_remove(struct vme_dev *dev)
 {
 	int i;
 
@@ -717,8 +717,6 @@ static int vme_user_remove(struct vme_dev *dev)
 
 	/* Unregister the major and minor device numbers */
 	unregister_chrdev_region(MKDEV(VME_MAJOR, 0), VME_DEVS);
-
-	return 0;
 }
 
 static struct vme_driver vme_user_driver = {
diff --git a/drivers/vme/vme.c b/drivers/vme/vme.c
index 54d7963c1078..1b15afea28ee 100644
--- a/drivers/vme/vme.c
+++ b/drivers/vme/vme.c
@@ -1997,9 +1997,9 @@ static int vme_bus_remove(struct device *dev)
 
 	driver = dev->platform_data;
 	if (driver->remove)
-		return driver->remove(vdev);
+		driver->remove(vdev);
 
-	return -ENODEV;
+	return 0;
 }
 
 struct bus_type vme_bus_type = {
diff --git a/include/linux/vme.h b/include/linux/vme.h
index 7e82bf500f01..b204a9b4be1b 100644
--- a/include/linux/vme.h
+++ b/include/linux/vme.h
@@ -122,7 +122,7 @@ struct vme_driver {
 	const char *name;
 	int (*match)(struct vme_dev *);
 	int (*probe)(struct vme_dev *);
-	int (*remove)(struct vme_dev *);
+	void (*remove)(struct vme_dev *);
 	struct device_driver driver;
 	struct list_head devices;
 };
-- 
cgit v1.2.3


From 5c3db63abdb08d8f0ec2c609f7789a199d5c476f Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Fri, 5 Feb 2021 18:06:08 +0100
Subject: device.h: Remove bogus "the" in kerneldoc

Remove the bogus word "the" from "...once the it is..." in the
documentation describing the "dev_groups" member of the device_driver
structure.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20210205170608.1956223-1-geert@linux-m68k.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index ee7ba5b5417e..a498ebcf4993 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -75,7 +75,7 @@ enum probe_type {
  * @resume:	Called to bring a device from sleep mode.
  * @groups:	Default attributes that get created by the driver core
  *		automatically.
- * @dev_groups:	Additional attributes attached to device instance once the
+ * @dev_groups:	Additional attributes attached to device instance once
  *		it is bound to the driver.
  * @pm:		Power management operations of the device which matched
  *		this driver.
-- 
cgit v1.2.3


From 9fd6dad1261a541b3f5fa7dc5b152222306e6702 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 5 Feb 2021 05:07:11 -0500
Subject: mm: provide a saner PTE walking API for modules

Currently, the follow_pfn function is exported for modules but
follow_pte is not.  However, follow_pfn is very easy to misuse,
because it does not provide protections (so most of its callers
assume the page is writable!) and because it returns after having
already unlocked the page table lock.

Provide instead a simplified version of follow_pte that does
not have the pmdpp and range arguments.  The older version
survives as follow_invalidate_pte() for use by fs/dax.c.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/pci/pci_mmio.c |  4 ++--
 fs/dax.c                 |  5 +++--
 include/linux/mm.h       |  6 ++++--
 mm/memory.c              | 41 ++++++++++++++++++++++++++++++++++++-----
 virt/kvm/kvm_main.c      |  4 ++--
 5 files changed, 47 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index 18f2d10c3176..474617b88648 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -170,7 +170,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
 	if (!(vma->vm_flags & VM_WRITE))
 		goto out_unlock_mmap;
 
-	ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl);
+	ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
 	if (ret)
 		goto out_unlock_mmap;
 
@@ -311,7 +311,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
 	if (!(vma->vm_flags & VM_WRITE))
 		goto out_unlock_mmap;
 
-	ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl);
+	ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
 	if (ret)
 		goto out_unlock_mmap;
 
diff --git a/fs/dax.c b/fs/dax.c
index 26d5dcd2d69e..b3d27fdc6775 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -810,11 +810,12 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
 		address = pgoff_address(index, vma);
 
 		/*
-		 * Note because we provide range to follow_pte it will call
+		 * follow_invalidate_pte() will use the range to call
 		 * mmu_notifier_invalidate_range_start() on our behalf before
 		 * taking any lock.
 		 */
-		if (follow_pte(vma->vm_mm, address, &range, &ptep, &pmdp, &ptl))
+		if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
+					  &pmdp, &ptl))
 			continue;
 
 		/*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecdf8a8cd6ae..24b292fce8e5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1658,9 +1658,11 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+			  struct mmu_notifier_range *range, pte_t **ptepp,
+			  pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pte(struct mm_struct *mm, unsigned long address,
-		struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
-		spinlock_t **ptlp);
+	       pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
 	unsigned long *pfn);
 int follow_phys(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index feff48e1465a..985dac0958dc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4709,9 +4709,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int follow_pte(struct mm_struct *mm, unsigned long address,
-	       struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
-	       spinlock_t **ptlp)
+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+			  struct mmu_notifier_range *range, pte_t **ptepp,
+			  pmd_t **pmdpp, spinlock_t **ptlp)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -4776,6 +4776,34 @@ out:
 	return -EINVAL;
 }
 
+/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+	       pte_t **ptepp, spinlock_t **ptlp)
+{
+	return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
+}
+EXPORT_SYMBOL_GPL(follow_pte);
+
 /**
  * follow_pfn - look up PFN at a user virtual address
  * @vma: memory mapping
@@ -4784,6 +4812,9 @@ out:
  *
  * Only IO mappings and raw PFN mappings are allowed.
  *
+ * This function does not allow the caller to read the permissions
+ * of the PTE.  Do not use it.
+ *
  * Return: zero and the pfn at @pfn on success, -ve otherwise.
  */
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
@@ -4796,7 +4827,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
 	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
 		return ret;
 
-	ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl);
+	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
 	if (ret)
 		return ret;
 	*pfn = pte_pfn(*ptep);
@@ -4817,7 +4848,7 @@ int follow_phys(struct vm_area_struct *vma,
 	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
 		goto out;
 
-	if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl))
+	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
 		goto out;
 	pte = *ptep;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 48ccdf4e3d04..ee4ac2618ec5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1911,7 +1911,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	int r;
 
-	r = follow_pte(vma->vm_mm, addr, NULL, &ptep, NULL, &ptl);
+	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
 	if (r) {
 		/*
 		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
@@ -1926,7 +1926,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 		if (r)
 			return r;
 
-		r = follow_pte(vma->vm_mm, addr, NULL, &ptep, NULL, &ptl);
+		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
 		if (r)
 			return r;
 	}
-- 
cgit v1.2.3


From c5d1448fa353242635fa3e1fed6ab4558e0e7d9a Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <uwe@kleine-koenig.org>
Date: Mon, 8 Feb 2021 15:31:49 +0100
Subject: USB: serial: make remove callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All usb_serial drivers return 0 in their remove callbacks and driver
core ignores the value returned by usb_serial_device_remove(). So change
the remove callback to return void and return 0 unconditionally in
usb_serial_device_remove().

Signed-off-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Link: https://lore.kernel.org/r/20210208143149.963644-2-uwe@kleine-koenig.org
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/ark3116.c          | 4 +---
 drivers/usb/serial/belkin_sa.c        | 6 ++----
 drivers/usb/serial/bus.c              | 5 ++---
 drivers/usb/serial/ch341.c            | 4 +---
 drivers/usb/serial/cp210x.c           | 6 ++----
 drivers/usb/serial/cyberjack.c        | 6 ++----
 drivers/usb/serial/cypress_m8.c       | 6 ++----
 drivers/usb/serial/digi_acceleport.c  | 6 ++----
 drivers/usb/serial/f81534.c           | 3 +--
 drivers/usb/serial/ftdi_sio.c         | 6 ++----
 drivers/usb/serial/garmin_gps.c       | 3 +--
 drivers/usb/serial/io_edgeport.c      | 6 ++----
 drivers/usb/serial/io_ti.c            | 4 +---
 drivers/usb/serial/iuu_phoenix.c      | 4 +---
 drivers/usb/serial/keyspan.c          | 6 ++----
 drivers/usb/serial/keyspan_pda.c      | 4 +---
 drivers/usb/serial/kl5kusb105.c       | 6 ++----
 drivers/usb/serial/kobil_sct.c        | 6 ++----
 drivers/usb/serial/mct_u232.c         | 6 ++----
 drivers/usb/serial/metro-usb.c        | 4 +---
 drivers/usb/serial/mos7720.c          | 4 +---
 drivers/usb/serial/mos7840.c          | 4 +---
 drivers/usb/serial/omninet.c          | 6 ++----
 drivers/usb/serial/opticon.c          | 4 +---
 drivers/usb/serial/oti6858.c          | 6 ++----
 drivers/usb/serial/pl2303.c           | 4 +---
 drivers/usb/serial/quatech2.c         | 4 +---
 drivers/usb/serial/sierra.c           | 4 +---
 drivers/usb/serial/spcp8x5.c          | 4 +---
 drivers/usb/serial/ssu100.c           | 4 +---
 drivers/usb/serial/symbolserial.c     | 4 +---
 drivers/usb/serial/ti_usb_3410_5052.c | 6 ++----
 drivers/usb/serial/upd78f0730.c       | 4 +---
 drivers/usb/serial/usb-wwan.h         | 2 +-
 drivers/usb/serial/usb_wwan.c         | 4 +---
 drivers/usb/serial/whiteheat.c        | 6 ++----
 include/linux/usb/serial.h            | 2 +-
 37 files changed, 53 insertions(+), 120 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/serial/ark3116.c b/drivers/usb/serial/ark3116.c
index 71a9206ea1e2..f0ac7bb07ac1 100644
--- a/drivers/usb/serial/ark3116.c
+++ b/drivers/usb/serial/ark3116.c
@@ -178,15 +178,13 @@ static int ark3116_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int ark3116_port_remove(struct usb_serial_port *port)
+static void ark3116_port_remove(struct usb_serial_port *port)
 {
 	struct ark3116_private *priv = usb_get_serial_port_data(port);
 
 	/* device is closed, so URBs and DMA should be down */
 	mutex_destroy(&priv->hw_lock);
 	kfree(priv);
-
-	return 0;
 }
 
 static void ark3116_set_termios(struct tty_struct *tty,
diff --git a/drivers/usb/serial/belkin_sa.c b/drivers/usb/serial/belkin_sa.c
index 9bb123ab9bc9..ed9193f3bb1a 100644
--- a/drivers/usb/serial/belkin_sa.c
+++ b/drivers/usb/serial/belkin_sa.c
@@ -37,7 +37,7 @@
 
 /* function prototypes for a Belkin USB Serial Adapter F5U103 */
 static int belkin_sa_port_probe(struct usb_serial_port *port);
-static int belkin_sa_port_remove(struct usb_serial_port *port);
+static void belkin_sa_port_remove(struct usb_serial_port *port);
 static int  belkin_sa_open(struct tty_struct *tty,
 			struct usb_serial_port *port);
 static void belkin_sa_close(struct usb_serial_port *port);
@@ -134,14 +134,12 @@ static int belkin_sa_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int belkin_sa_port_remove(struct usb_serial_port *port)
+static void belkin_sa_port_remove(struct usb_serial_port *port)
 {
 	struct belkin_sa_private *priv;
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static int belkin_sa_open(struct tty_struct *tty,
diff --git a/drivers/usb/serial/bus.c b/drivers/usb/serial/bus.c
index 58206ef0a780..1a8c7821d00a 100644
--- a/drivers/usb/serial/bus.c
+++ b/drivers/usb/serial/bus.c
@@ -88,7 +88,6 @@ static int usb_serial_device_remove(struct device *dev)
 {
 	struct usb_serial_port *port = to_usb_serial_port(dev);
 	struct usb_serial_driver *driver;
-	int retval = 0;
 	int minor;
 	int autopm_err;
 
@@ -105,7 +104,7 @@ static int usb_serial_device_remove(struct device *dev)
 
 	driver = port->serial->type;
 	if (driver->port_remove)
-		retval = driver->port_remove(port);
+		driver->port_remove(port);
 
 	dev_info(dev, "%s converter now disconnected from ttyUSB%d\n",
 		 driver->description, minor);
@@ -113,7 +112,7 @@ static int usb_serial_device_remove(struct device *dev)
 	if (!autopm_err)
 		usb_autopm_put_interface(port->serial->interface);
 
-	return retval;
+	return 0;
 }
 
 static ssize_t new_id_store(struct device_driver *driver,
diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c
index 28deaaec581f..8d997b71056f 100644
--- a/drivers/usb/serial/ch341.c
+++ b/drivers/usb/serial/ch341.c
@@ -419,14 +419,12 @@ error:	kfree(priv);
 	return r;
 }
 
-static int ch341_port_remove(struct usb_serial_port *port)
+static void ch341_port_remove(struct usb_serial_port *port)
 {
 	struct ch341_private *priv;
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static int ch341_carrier_raised(struct usb_serial_port *port)
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index cc4f63a06f9e..665798043d3c 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -44,7 +44,7 @@ static int cp210x_attach(struct usb_serial *);
 static void cp210x_disconnect(struct usb_serial *);
 static void cp210x_release(struct usb_serial *);
 static int cp210x_port_probe(struct usb_serial_port *);
-static int cp210x_port_remove(struct usb_serial_port *);
+static void cp210x_port_remove(struct usb_serial_port *);
 static void cp210x_dtr_rts(struct usb_serial_port *port, int on);
 static void cp210x_process_read_urb(struct urb *urb);
 static void cp210x_enable_event_mode(struct usb_serial_port *port);
@@ -1841,14 +1841,12 @@ static int cp210x_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int cp210x_port_remove(struct usb_serial_port *port)
+static void cp210x_port_remove(struct usb_serial_port *port)
 {
 	struct cp210x_port_private *port_priv;
 
 	port_priv = usb_get_serial_port_data(port);
 	kfree(port_priv);
-
-	return 0;
 }
 
 static void cp210x_init_max_speed(struct usb_serial *serial)
diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c
index 2e40908963da..cf389224d528 100644
--- a/drivers/usb/serial/cyberjack.c
+++ b/drivers/usb/serial/cyberjack.c
@@ -47,7 +47,7 @@
 
 /* Function prototypes */
 static int cyberjack_port_probe(struct usb_serial_port *port);
-static int cyberjack_port_remove(struct usb_serial_port *port);
+static void cyberjack_port_remove(struct usb_serial_port *port);
 static int  cyberjack_open(struct tty_struct *tty,
 	struct usb_serial_port *port);
 static void cyberjack_close(struct usb_serial_port *port);
@@ -120,7 +120,7 @@ static int cyberjack_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int cyberjack_port_remove(struct usb_serial_port *port)
+static void cyberjack_port_remove(struct usb_serial_port *port)
 {
 	struct cyberjack_private *priv;
 
@@ -128,8 +128,6 @@ static int cyberjack_port_remove(struct usb_serial_port *port)
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static int  cyberjack_open(struct tty_struct *tty,
diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index cc028601c388..166ee2286fda 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c
@@ -115,7 +115,7 @@ struct cypress_private {
 static int  cypress_earthmate_port_probe(struct usb_serial_port *port);
 static int  cypress_hidcom_port_probe(struct usb_serial_port *port);
 static int  cypress_ca42v2_port_probe(struct usb_serial_port *port);
-static int  cypress_port_remove(struct usb_serial_port *port);
+static void cypress_port_remove(struct usb_serial_port *port);
 static int  cypress_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void cypress_close(struct usb_serial_port *port);
 static void cypress_dtr_rts(struct usb_serial_port *port, int on);
@@ -564,7 +564,7 @@ static int cypress_ca42v2_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int cypress_port_remove(struct usb_serial_port *port)
+static void cypress_port_remove(struct usb_serial_port *port)
 {
 	struct cypress_private *priv;
 
@@ -572,8 +572,6 @@ static int cypress_port_remove(struct usb_serial_port *port)
 
 	kfifo_free(&priv->write_fifo);
 	kfree(priv);
-
-	return 0;
 }
 
 static int cypress_open(struct tty_struct *tty, struct usb_serial_port *port)
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index 0ecd5316d85f..8b2f06539f2c 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -233,7 +233,7 @@ static int digi_startup(struct usb_serial *serial);
 static void digi_disconnect(struct usb_serial *serial);
 static void digi_release(struct usb_serial *serial);
 static int digi_port_probe(struct usb_serial_port *port);
-static int digi_port_remove(struct usb_serial_port *port);
+static void digi_port_remove(struct usb_serial_port *port);
 static void digi_read_bulk_callback(struct urb *urb);
 static int digi_read_inb_callback(struct urb *urb);
 static int digi_read_oob_callback(struct urb *urb);
@@ -1281,14 +1281,12 @@ static int digi_port_probe(struct usb_serial_port *port)
 	return digi_port_init(port, port->port_number);
 }
 
-static int digi_port_remove(struct usb_serial_port *port)
+static void digi_port_remove(struct usb_serial_port *port)
 {
 	struct digi_port *priv;
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static void digi_read_bulk_callback(struct urb *urb)
diff --git a/drivers/usb/serial/f81534.c b/drivers/usb/serial/f81534.c
index dd7e55e822ef..a763b362f081 100644
--- a/drivers/usb/serial/f81534.c
+++ b/drivers/usb/serial/f81534.c
@@ -1430,12 +1430,11 @@ static int f81534_port_probe(struct usb_serial_port *port)
 	return f81534_set_port_output_pin(port);
 }
 
-static int f81534_port_remove(struct usb_serial_port *port)
+static void f81534_port_remove(struct usb_serial_port *port)
 {
 	struct f81534_port_private *port_priv = usb_get_serial_port_data(port);
 
 	flush_work(&port_priv->lsr_work);
-	return 0;
 }
 
 static int f81534_tiocmget(struct tty_struct *tty)
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index d61703d858a1..c867592477c9 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -1069,7 +1069,7 @@ static const char *ftdi_chip_name[] = {
 static int  ftdi_sio_probe(struct usb_serial *serial,
 					const struct usb_device_id *id);
 static int  ftdi_sio_port_probe(struct usb_serial_port *port);
-static int  ftdi_sio_port_remove(struct usb_serial_port *port);
+static void ftdi_sio_port_remove(struct usb_serial_port *port);
 static int  ftdi_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void ftdi_dtr_rts(struct usb_serial_port *port, int on);
 static void ftdi_process_read_urb(struct urb *urb);
@@ -2400,7 +2400,7 @@ static int ftdi_stmclite_probe(struct usb_serial *serial)
 	return 0;
 }
 
-static int ftdi_sio_port_remove(struct usb_serial_port *port)
+static void ftdi_sio_port_remove(struct usb_serial_port *port)
 {
 	struct ftdi_private *priv = usb_get_serial_port_data(port);
 
@@ -2409,8 +2409,6 @@ static int ftdi_sio_port_remove(struct usb_serial_port *port)
 	remove_sysfs_attrs(port);
 
 	kfree(priv);
-
-	return 0;
 }
 
 static int ftdi_open(struct tty_struct *tty, struct usb_serial_port *port)
diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c
index c02c19bb1183..50e8bdc77e71 100644
--- a/drivers/usb/serial/garmin_gps.c
+++ b/drivers/usb/serial/garmin_gps.c
@@ -1401,7 +1401,7 @@ err_free:
 }
 
 
-static int garmin_port_remove(struct usb_serial_port *port)
+static void garmin_port_remove(struct usb_serial_port *port)
 {
 	struct garmin_data *garmin_data_p = usb_get_serial_port_data(port);
 
@@ -1409,7 +1409,6 @@ static int garmin_port_remove(struct usb_serial_port *port)
 	usb_kill_urb(port->interrupt_in_urb);
 	del_timer_sync(&garmin_data_p->timer);
 	kfree(garmin_data_p);
-	return 0;
 }
 
 
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index ba5d8df69518..a493670c06e6 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -293,7 +293,7 @@ static int  edge_startup(struct usb_serial *serial);
 static void edge_disconnect(struct usb_serial *serial);
 static void edge_release(struct usb_serial *serial);
 static int edge_port_probe(struct usb_serial_port *port);
-static int edge_port_remove(struct usb_serial_port *port);
+static void edge_port_remove(struct usb_serial_port *port);
 
 /* function prototypes for all of our local functions */
 
@@ -3078,14 +3078,12 @@ static int edge_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int edge_port_remove(struct usb_serial_port *port)
+static void edge_port_remove(struct usb_serial_port *port)
 {
 	struct edgeport_port *edge_port;
 
 	edge_port = usb_get_serial_port_data(port);
 	kfree(edge_port);
-
-	return 0;
 }
 
 static struct usb_serial_driver edgeport_2port_device = {
diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index 0bbfa47e04b7..e800547be9e0 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c
@@ -2625,15 +2625,13 @@ err:
 	return ret;
 }
 
-static int edge_port_remove(struct usb_serial_port *port)
+static void edge_port_remove(struct usb_serial_port *port)
 {
 	struct edgeport_port *edge_port;
 
 	edge_port = usb_get_serial_port_data(port);
 	edge_remove_sysfs_attrs(port);
 	kfree(edge_port);
-
-	return 0;
 }
 
 /* Sysfs Attributes */
diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index e8f06b41a503..093afd67a664 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -100,7 +100,7 @@ static int iuu_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int iuu_port_remove(struct usb_serial_port *port)
+static void iuu_port_remove(struct usb_serial_port *port)
 {
 	struct iuu_private *priv = usb_get_serial_port_data(port);
 
@@ -108,8 +108,6 @@ static int iuu_port_remove(struct usb_serial_port *port)
 	kfree(priv->writebuf);
 	kfree(priv->buf);
 	kfree(priv);
-
-	return 0;
 }
 
 static int iuu_tiocmset(struct tty_struct *tty,
diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c
index aa3dbce22cfb..622077dcc344 100644
--- a/drivers/usb/serial/keyspan.c
+++ b/drivers/usb/serial/keyspan.c
@@ -49,7 +49,7 @@ static int keyspan_startup(struct usb_serial *serial);
 static void keyspan_disconnect(struct usb_serial *serial);
 static void keyspan_release(struct usb_serial *serial);
 static int keyspan_port_probe(struct usb_serial_port *port);
-static int keyspan_port_remove(struct usb_serial_port *port);
+static void keyspan_port_remove(struct usb_serial_port *port);
 static int keyspan_write_room(struct tty_struct *tty);
 static int keyspan_write(struct tty_struct *tty, struct usb_serial_port *port,
 			 const unsigned char *buf, int count);
@@ -2985,7 +2985,7 @@ err_in_buffer:
 	return -ENOMEM;
 }
 
-static int keyspan_port_remove(struct usb_serial_port *port)
+static void keyspan_port_remove(struct usb_serial_port *port)
 {
 	struct keyspan_port_private *p_priv;
 	int i;
@@ -3014,8 +3014,6 @@ static int keyspan_port_remove(struct usb_serial_port *port)
 		kfree(p_priv->in_buffer[i]);
 
 	kfree(p_priv);
-
-	return 0;
 }
 
 /* Structs for the devices, pre and post renumeration. */
diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c
index e6f933e8d25f..39b0f5f344c2 100644
--- a/drivers/usb/serial/keyspan_pda.c
+++ b/drivers/usb/serial/keyspan_pda.c
@@ -672,14 +672,12 @@ static int keyspan_pda_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int keyspan_pda_port_remove(struct usb_serial_port *port)
+static void keyspan_pda_port_remove(struct usb_serial_port *port)
 {
 	struct keyspan_pda_private *priv;
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static struct usb_serial_driver keyspan_pda_fake_device = {
diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c
index 5f6b82ebccc5..f1e9628a9907 100644
--- a/drivers/usb/serial/kl5kusb105.c
+++ b/drivers/usb/serial/kl5kusb105.c
@@ -52,7 +52,7 @@
  * Function prototypes
  */
 static int klsi_105_port_probe(struct usb_serial_port *port);
-static int klsi_105_port_remove(struct usb_serial_port *port);
+static void klsi_105_port_remove(struct usb_serial_port *port);
 static int  klsi_105_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void klsi_105_close(struct usb_serial_port *port);
 static void klsi_105_set_termios(struct tty_struct *tty,
@@ -231,14 +231,12 @@ static int klsi_105_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int klsi_105_port_remove(struct usb_serial_port *port)
+static void klsi_105_port_remove(struct usb_serial_port *port)
 {
 	struct klsi_105_private *priv;
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static int  klsi_105_open(struct tty_struct *tty, struct usb_serial_port *port)
diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c
index 49aacb0a327c..a9bc546626ab 100644
--- a/drivers/usb/serial/kobil_sct.c
+++ b/drivers/usb/serial/kobil_sct.c
@@ -48,7 +48,7 @@
 
 /* Function prototypes */
 static int kobil_port_probe(struct usb_serial_port *probe);
-static int kobil_port_remove(struct usb_serial_port *probe);
+static void kobil_port_remove(struct usb_serial_port *probe);
 static int  kobil_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void kobil_close(struct usb_serial_port *port);
 static int  kobil_write(struct tty_struct *tty, struct usb_serial_port *port,
@@ -143,14 +143,12 @@ static int kobil_port_probe(struct usb_serial_port *port)
 }
 
 
-static int kobil_port_remove(struct usb_serial_port *port)
+static void kobil_port_remove(struct usb_serial_port *port)
 {
 	struct kobil_private *priv;
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static void kobil_init_termios(struct tty_struct *tty)
diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c
index 7887c312d9a9..ecd5b921e374 100644
--- a/drivers/usb/serial/mct_u232.c
+++ b/drivers/usb/serial/mct_u232.c
@@ -39,7 +39,7 @@
  * Function prototypes
  */
 static int  mct_u232_port_probe(struct usb_serial_port *port);
-static int  mct_u232_port_remove(struct usb_serial_port *remove);
+static void mct_u232_port_remove(struct usb_serial_port *remove);
 static int  mct_u232_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void mct_u232_close(struct usb_serial_port *port);
 static void mct_u232_dtr_rts(struct usb_serial_port *port, int on);
@@ -400,14 +400,12 @@ static int mct_u232_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int mct_u232_port_remove(struct usb_serial_port *port)
+static void mct_u232_port_remove(struct usb_serial_port *port)
 {
 	struct mct_u232_private *priv;
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static int  mct_u232_open(struct tty_struct *tty, struct usb_serial_port *port)
diff --git a/drivers/usb/serial/metro-usb.c b/drivers/usb/serial/metro-usb.c
index e63cea02cfd8..0bfe4459c37f 100644
--- a/drivers/usb/serial/metro-usb.c
+++ b/drivers/usb/serial/metro-usb.c
@@ -256,14 +256,12 @@ static int metrousb_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int metrousb_port_remove(struct usb_serial_port *port)
+static void metrousb_port_remove(struct usb_serial_port *port)
 {
 	struct metrousb_private *metro_priv;
 
 	metro_priv = usb_get_serial_port_data(port);
 	kfree(metro_priv);
-
-	return 0;
 }
 
 static void metrousb_throttle(struct tty_struct *tty)
diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index aa55169796a3..701dfb32b129 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -1760,14 +1760,12 @@ static int mos7720_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int mos7720_port_remove(struct usb_serial_port *port)
+static void mos7720_port_remove(struct usb_serial_port *port)
 {
 	struct moschip_port *mos7720_port;
 
 	mos7720_port = usb_get_serial_port_data(port);
 	kfree(mos7720_port);
-
-	return 0;
 }
 
 static struct usb_serial_driver moschip7720_2port_driver = {
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index 30c25ef0dacd..1bf0d066f55a 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -1745,7 +1745,7 @@ error:
 	return status;
 }
 
-static int mos7840_port_remove(struct usb_serial_port *port)
+static void mos7840_port_remove(struct usb_serial_port *port)
 {
 	struct moschip_port *mos7840_port = usb_get_serial_port_data(port);
 
@@ -1762,8 +1762,6 @@ static int mos7840_port_remove(struct usb_serial_port *port)
 	}
 
 	kfree(mos7840_port);
-
-	return 0;
 }
 
 static struct usb_serial_driver moschip7840_4port_device = {
diff --git a/drivers/usb/serial/omninet.c b/drivers/usb/serial/omninet.c
index 5b6e982a9376..83c62f920c50 100644
--- a/drivers/usb/serial/omninet.c
+++ b/drivers/usb/serial/omninet.c
@@ -36,7 +36,7 @@ static int omninet_prepare_write_buffer(struct usb_serial_port *port,
 static int omninet_calc_num_ports(struct usb_serial *serial,
 				struct usb_serial_endpoints *epds);
 static int omninet_port_probe(struct usb_serial_port *port);
-static int omninet_port_remove(struct usb_serial_port *port);
+static void omninet_port_remove(struct usb_serial_port *port);
 
 static const struct usb_device_id id_table[] = {
 	{ USB_DEVICE(ZYXEL_VENDOR_ID, ZYXEL_OMNINET_ID) },
@@ -121,14 +121,12 @@ static int omninet_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int omninet_port_remove(struct usb_serial_port *port)
+static void omninet_port_remove(struct usb_serial_port *port)
 {
 	struct omninet_data *od;
 
 	od = usb_get_serial_port_data(port);
 	kfree(od);
-
-	return 0;
 }
 
 #define OMNINET_HEADERLEN	4
diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c
index 0af76800bd78..eecb72aef83e 100644
--- a/drivers/usb/serial/opticon.c
+++ b/drivers/usb/serial/opticon.c
@@ -385,13 +385,11 @@ static int opticon_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int opticon_port_remove(struct usb_serial_port *port)
+static void opticon_port_remove(struct usb_serial_port *port)
 {
 	struct opticon_private *priv = usb_get_serial_port_data(port);
 
 	kfree(priv);
-
-	return 0;
 }
 
 static struct usb_serial_driver opticon_device = {
diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c
index 8151dd7a45e8..65cd0341fa78 100644
--- a/drivers/usb/serial/oti6858.c
+++ b/drivers/usb/serial/oti6858.c
@@ -132,7 +132,7 @@ static int oti6858_tiocmget(struct tty_struct *tty);
 static int oti6858_tiocmset(struct tty_struct *tty,
 				unsigned int set, unsigned int clear);
 static int oti6858_port_probe(struct usb_serial_port *port);
-static int oti6858_port_remove(struct usb_serial_port *port);
+static void oti6858_port_remove(struct usb_serial_port *port);
 
 /* device info */
 static struct usb_serial_driver oti6858_device = {
@@ -344,14 +344,12 @@ static int oti6858_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int oti6858_port_remove(struct usb_serial_port *port)
+static void oti6858_port_remove(struct usb_serial_port *port)
 {
 	struct oti6858_private *priv;
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static int oti6858_write(struct tty_struct *tty, struct usb_serial_port *port,
diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 29dda60e3bcd..eed9acd1ae08 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -450,13 +450,11 @@ static int pl2303_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int pl2303_port_remove(struct usb_serial_port *port)
+static void pl2303_port_remove(struct usb_serial_port *port)
 {
 	struct pl2303_private *priv = usb_get_serial_port_data(port);
 
 	kfree(priv);
-
-	return 0;
 }
 
 static int pl2303_set_control_lines(struct usb_serial_port *port, u8 value)
diff --git a/drivers/usb/serial/quatech2.c b/drivers/usb/serial/quatech2.c
index 872d1bc86ab4..599dcb2e374d 100644
--- a/drivers/usb/serial/quatech2.c
+++ b/drivers/usb/serial/quatech2.c
@@ -727,7 +727,7 @@ err_buf:
 	return -ENOMEM;
 }
 
-static int qt2_port_remove(struct usb_serial_port *port)
+static void qt2_port_remove(struct usb_serial_port *port)
 {
 	struct qt2_port_private *port_priv;
 
@@ -735,8 +735,6 @@ static int qt2_port_remove(struct usb_serial_port *port)
 	usb_free_urb(port_priv->write_urb);
 	kfree(port_priv->write_buffer);
 	kfree(port_priv);
-
-	return 0;
 }
 
 static int qt2_tiocmget(struct tty_struct *tty)
diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 57fc3c31712e..54e16ffc30a0 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -901,15 +901,13 @@ static int sierra_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int sierra_port_remove(struct usb_serial_port *port)
+static void sierra_port_remove(struct usb_serial_port *port)
 {
 	struct sierra_port_private *portdata;
 
 	portdata = usb_get_serial_port_data(port);
 	usb_set_serial_port_data(port, NULL);
 	kfree(portdata);
-
-	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c
index 3bac55bd9bd9..7039dc918827 100644
--- a/drivers/usb/serial/spcp8x5.c
+++ b/drivers/usb/serial/spcp8x5.c
@@ -169,14 +169,12 @@ static int spcp8x5_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int spcp8x5_port_remove(struct usb_serial_port *port)
+static void spcp8x5_port_remove(struct usb_serial_port *port)
 {
 	struct spcp8x5_private *priv;
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static int spcp8x5_set_ctrl_line(struct usb_serial_port *port, u8 mcr)
diff --git a/drivers/usb/serial/ssu100.c b/drivers/usb/serial/ssu100.c
index 7d39d35e52a1..89fdc5c19285 100644
--- a/drivers/usb/serial/ssu100.c
+++ b/drivers/usb/serial/ssu100.c
@@ -366,14 +366,12 @@ static int ssu100_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int ssu100_port_remove(struct usb_serial_port *port)
+static void ssu100_port_remove(struct usb_serial_port *port)
 {
 	struct ssu100_port_private *priv;
 
 	priv = usb_get_serial_port_data(port);
 	kfree(priv);
-
-	return 0;
 }
 
 static int ssu100_tiocmget(struct tty_struct *tty)
diff --git a/drivers/usb/serial/symbolserial.c b/drivers/usb/serial/symbolserial.c
index 6ca24e86f686..d7f73ad6e778 100644
--- a/drivers/usb/serial/symbolserial.c
+++ b/drivers/usb/serial/symbolserial.c
@@ -160,13 +160,11 @@ static int symbol_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int symbol_port_remove(struct usb_serial_port *port)
+static void symbol_port_remove(struct usb_serial_port *port)
 {
 	struct symbol_private *priv = usb_get_serial_port_data(port);
 
 	kfree(priv);
-
-	return 0;
 }
 
 static struct usb_serial_driver symbol_device = {
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index 73075b9351c5..7252b0ce75a6 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -303,7 +303,7 @@ struct ti_device {
 static int ti_startup(struct usb_serial *serial);
 static void ti_release(struct usb_serial *serial);
 static int ti_port_probe(struct usb_serial_port *port);
-static int ti_port_remove(struct usb_serial_port *port);
+static void ti_port_remove(struct usb_serial_port *port);
 static int ti_open(struct tty_struct *tty, struct usb_serial_port *port);
 static void ti_close(struct usb_serial_port *port);
 static int ti_write(struct tty_struct *tty, struct usb_serial_port *port,
@@ -629,14 +629,12 @@ static int ti_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int ti_port_remove(struct usb_serial_port *port)
+static void ti_port_remove(struct usb_serial_port *port)
 {
 	struct ti_port *tport;
 
 	tport = usb_get_serial_port_data(port);
 	kfree(tport);
-
-	return 0;
 }
 
 static int ti_open(struct tty_struct *tty, struct usb_serial_port *port)
diff --git a/drivers/usb/serial/upd78f0730.c b/drivers/usb/serial/upd78f0730.c
index 1ca9c1881621..26d7b003b7e3 100644
--- a/drivers/usb/serial/upd78f0730.c
+++ b/drivers/usb/serial/upd78f0730.c
@@ -171,15 +171,13 @@ static int upd78f0730_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int upd78f0730_port_remove(struct usb_serial_port *port)
+static void upd78f0730_port_remove(struct usb_serial_port *port)
 {
 	struct upd78f0730_port_private *private;
 
 	private = usb_get_serial_port_data(port);
 	mutex_destroy(&private->lock);
 	kfree(private);
-
-	return 0;
 }
 
 static int upd78f0730_tiocmget(struct tty_struct *tty)
diff --git a/drivers/usb/serial/usb-wwan.h b/drivers/usb/serial/usb-wwan.h
index 934e9361cf6b..79dafd98e0a1 100644
--- a/drivers/usb/serial/usb-wwan.h
+++ b/drivers/usb/serial/usb-wwan.h
@@ -10,7 +10,7 @@ extern void usb_wwan_dtr_rts(struct usb_serial_port *port, int on);
 extern int usb_wwan_open(struct tty_struct *tty, struct usb_serial_port *port);
 extern void usb_wwan_close(struct usb_serial_port *port);
 extern int usb_wwan_port_probe(struct usb_serial_port *port);
-extern int usb_wwan_port_remove(struct usb_serial_port *port);
+extern void usb_wwan_port_remove(struct usb_serial_port *port);
 extern int usb_wwan_write_room(struct tty_struct *tty);
 extern int usb_wwan_tiocmget(struct tty_struct *tty);
 extern int usb_wwan_tiocmset(struct tty_struct *tty,
diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c
index 4b9845807bee..46d46a4f99c9 100644
--- a/drivers/usb/serial/usb_wwan.c
+++ b/drivers/usb/serial/usb_wwan.c
@@ -544,7 +544,7 @@ bail_out_error:
 }
 EXPORT_SYMBOL_GPL(usb_wwan_port_probe);
 
-int usb_wwan_port_remove(struct usb_serial_port *port)
+void usb_wwan_port_remove(struct usb_serial_port *port)
 {
 	int i;
 	struct usb_wwan_port_private *portdata;
@@ -562,8 +562,6 @@ int usb_wwan_port_remove(struct usb_serial_port *port)
 	}
 
 	kfree(portdata);
-
-	return 0;
 }
 EXPORT_SYMBOL(usb_wwan_port_remove);
 
diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index ca3bd58f2025..ccfd5ed652cd 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c
@@ -79,7 +79,7 @@ static int  whiteheat_firmware_attach(struct usb_serial *serial);
 static int  whiteheat_attach(struct usb_serial *serial);
 static void whiteheat_release(struct usb_serial *serial);
 static int  whiteheat_port_probe(struct usb_serial_port *port);
-static int  whiteheat_port_remove(struct usb_serial_port *port);
+static void whiteheat_port_remove(struct usb_serial_port *port);
 static int  whiteheat_open(struct tty_struct *tty,
 			struct usb_serial_port *port);
 static void whiteheat_close(struct usb_serial_port *port);
@@ -345,14 +345,12 @@ static int whiteheat_port_probe(struct usb_serial_port *port)
 	return 0;
 }
 
-static int whiteheat_port_remove(struct usb_serial_port *port)
+static void whiteheat_port_remove(struct usb_serial_port *port)
 {
 	struct whiteheat_private *info;
 
 	info = usb_get_serial_port_data(port);
 	kfree(info);
-
-	return 0;
 }
 
 static int whiteheat_open(struct tty_struct *tty, struct usb_serial_port *port)
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 1c09b922f8b0..952272002e48 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -260,7 +260,7 @@ struct usb_serial_driver {
 	void (*release)(struct usb_serial *serial);
 
 	int (*port_probe)(struct usb_serial_port *port);
-	int (*port_remove)(struct usb_serial_port *port);
+	void (*port_remove)(struct usb_serial_port *port);
 
 	int (*suspend)(struct usb_serial *serial, pm_message_t message);
 	int (*resume)(struct usb_serial *serial);
-- 
cgit v1.2.3


From 4fc096a99e01dd06dc55bef76ade7f8d76653245 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 28 Jan 2021 13:01:31 -0500
Subject: KVM: Raise the maximum number of user memslots

Current KVM_USER_MEM_SLOTS limits are arch specific (512 on Power, 509 on x86,
32 on s390, 16 on MIPS) but they don't really need to be. Memory slots are
allocated dynamically in KVM when added so the only real limitation is
'id_to_index' array which is 'short'. We don't have any other
KVM_MEM_SLOTS_NUM/KVM_USER_MEM_SLOTS-sized statically defined structures.

Low KVM_USER_MEM_SLOTS can be a limiting factor for some configurations.
In particular, when QEMU tries to start a Windows guest with Hyper-V SynIC
enabled and e.g. 256 vCPUs the limit is hit as SynIC requires two pages per
vCPU and the guest is free to pick any GFN for each of them, this fragments
memslots as QEMU wants to have a separate memslot for each of these pages
(which are supposed to act as 'overlay' pages).

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20210127175731.2020089-3-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/include/asm/kvm_host.h   | 1 -
 arch/mips/include/asm/kvm_host.h    | 1 -
 arch/powerpc/include/asm/kvm_host.h | 1 -
 arch/s390/include/asm/kvm_host.h    | 1 -
 arch/x86/include/asm/kvm_host.h     | 2 --
 include/linux/kvm_host.h            | 5 ++---
 6 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 8fcfab0c2567..1b8a3d825276 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -30,7 +30,6 @@
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
-#define KVM_USER_MEM_SLOTS 512
 #define KVM_HALT_POLL_NS_DEFAULT 500000
 
 #include <kvm/arm_vgic.h>
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 24f3d0f9996b..3a5612e7304c 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -83,7 +83,6 @@
 
 
 #define KVM_MAX_VCPUS		16
-#define KVM_USER_MEM_SLOTS	16
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS	0
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d67a470e95a3..2b9b6855ec86 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -28,7 +28,6 @@
 
 #define KVM_MAX_VCPUS		NR_CPUS
 #define KVM_MAX_VCORES		NR_CPUS
-#define KVM_USER_MEM_SLOTS	512
 
 #include <asm/cputhreads.h>
 
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 74f9a036bab2..6bcfc5614bbc 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -28,7 +28,6 @@
 #define KVM_S390_BSCA_CPU_SLOTS 64
 #define KVM_S390_ESCA_CPU_SLOTS 248
 #define KVM_MAX_VCPUS 255
-#define KVM_USER_MEM_SLOTS 32
 
 /*
  * These seem to be used for allocating ->chip in the routing table, which we
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c451412a6230..b6de568e7476 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -40,10 +40,8 @@
 #define KVM_MAX_VCPUS 288
 #define KVM_SOFT_MAX_VCPUS 240
 #define KVM_MAX_VCPU_ID 1023
-#define KVM_USER_MEM_SLOTS 509
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 3
-#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 
 #define KVM_HALT_POLL_NS_DEFAULT 200000
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f417447129b9..e126ebda36d0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -425,9 +425,8 @@ struct kvm_irq_routing_table {
 #define KVM_PRIVATE_MEM_SLOTS 0
 #endif
 
-#ifndef KVM_MEM_SLOTS_NUM
-#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
-#endif
+#define KVM_MEM_SLOTS_NUM SHRT_MAX
+#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_PRIVATE_MEM_SLOTS)
 
 #ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 9528e0d9c10027ae80e2aab36e30a1f730b1bbf9 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Fri, 5 Feb 2021 14:26:37 -0800
Subject: driver core: fw_devlink: Detect supplier devices that will never be
 added

During the initial parsing of firmware by fw_devlink, fw_devlink might
infer that some supplier firmware nodes would get populated as devices.
But the inference is not always correct. This patch tries to logically
detect and fix such mistakes as boot progresses or more devices probe.

fw_devlink makes a fundamental assumption that once a device binds to a
driver, it will populate (i.e: add as struct devices) all the child
firmware nodes that could be populated as devices (if they aren't
populated already).

So, whenever a device probes, we check all its child firmware nodes. If
a child firmware node has a corresponding device populated, we don't
modify the child node or its descendants. However, if a child firmware
node has not been populated as a device, we delete all the fwnode links
where the child node or its descendants are suppliers. This ensures that
no other device is blocked on a firmware node that will never be
populated as a device. We also mark such fwnodes as NOT_DEVICE, so that
no new fwnode links are created with these nodes as suppliers.

Fixes: e590474768f1 ("driver core: Set fw_devlink=on by default")
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210205222644.2357303-2-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 31 ++++++++++++++++++++++++++++---
 include/linux/fwnode.h |  2 ++
 2 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 484a942884ba..c95b1daabac7 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -148,6 +148,21 @@ void fwnode_links_purge(struct fwnode_handle *fwnode)
 	fwnode_links_purge_consumers(fwnode);
 }
 
+static void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode)
+{
+	struct fwnode_handle *child;
+
+	/* Don't purge consumer links of an added child */
+	if (fwnode->dev)
+		return;
+
+	fwnode->flags |= FWNODE_FLAG_NOT_DEVICE;
+	fwnode_links_purge_consumers(fwnode);
+
+	fwnode_for_each_available_child_node(fwnode, child)
+		fw_devlink_purge_absent_suppliers(child);
+}
+
 #ifdef CONFIG_SRCU
 static DEFINE_MUTEX(device_links_lock);
 DEFINE_STATIC_SRCU(device_links_srcu);
@@ -1154,12 +1169,22 @@ void device_links_driver_bound(struct device *dev)
 	LIST_HEAD(sync_list);
 
 	/*
-	 * If a device probes successfully, it's expected to have created all
+	 * If a device binds successfully, it's expected to have created all
 	 * the device links it needs to or make new device links as it needs
-	 * them. So, it no longer needs to wait on any suppliers.
+	 * them. So, fw_devlink no longer needs to create device links to any
+	 * of the device's suppliers.
+	 *
+	 * Also, if a child firmware node of this bound device is not added as
+	 * a device by now, assume it is never going to be added and make sure
+	 * other devices don't defer probe indefinitely by waiting for such a
+	 * child device.
 	 */
-	if (dev->fwnode && dev->fwnode->dev == dev)
+	if (dev->fwnode && dev->fwnode->dev == dev) {
+		struct fwnode_handle *child;
 		fwnode_links_purge_suppliers(dev->fwnode);
+		fwnode_for_each_available_child_node(dev->fwnode, child)
+			fw_devlink_purge_absent_suppliers(child);
+	}
 	device_remove_file(dev, &dev_attr_waiting_for_supplier);
 
 	device_links_write_lock();
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index fde4ad97564c..21082f11473f 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -19,8 +19,10 @@ struct device;
  * fwnode link flags
  *
  * LINKS_ADDED: The fwnode has already be parsed to add fwnode links.
+ * NOT_DEVICE: The fwnode will never be populated as a struct device.
  */
 #define FWNODE_FLAG_LINKS_ADDED		BIT(0)
+#define FWNODE_FLAG_NOT_DEVICE		BIT(1)
 
 struct fwnode_handle {
 	struct fwnode_handle *secondary;
-- 
cgit v1.2.3


From 19d0f5f6bff878277783fd98fef4ae2441d6a1d8 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Fri, 5 Feb 2021 14:26:39 -0800
Subject: driver core: Add fw_devlink.strict kernel param

This param allows forcing all dependencies to be treated as mandatory.
This will be useful for boards in which all optional dependencies like
IOMMUs and DMAs need to be treated as mandatory dependencies.

Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210205222644.2357303-4-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  5 +++++
 drivers/base/core.c                             | 12 ++++++++++++
 include/linux/fwnode.h                          |  1 +
 3 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a10b545c2070..692b63644133 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1433,6 +1433,11 @@
 				to enforce probe and suspend/resume ordering.
 			rpm --	Like "on", but also use to order runtime PM.
 
+	fw_devlink.strict=<bool>
+			[KNL] Treat all inferred dependencies as mandatory
+			dependencies. This only applies for fw_devlink=on|rpm.
+			Format: <bool>
+
 	gamecon.map[2|3]=
 			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
 			support via parallel port (up to 5 devices per port)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index c95b1daabac7..f466ab4f1c35 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1521,6 +1521,13 @@ static int __init fw_devlink_setup(char *arg)
 }
 early_param("fw_devlink", fw_devlink_setup);
 
+static bool fw_devlink_strict;
+static int __init fw_devlink_strict_setup(char *arg)
+{
+	return strtobool(arg, &fw_devlink_strict);
+}
+early_param("fw_devlink.strict", fw_devlink_strict_setup);
+
 u32 fw_devlink_get_flags(void)
 {
 	return fw_devlink_flags;
@@ -1531,6 +1538,11 @@ static bool fw_devlink_is_permissive(void)
 	return fw_devlink_flags == FW_DEVLINK_FLAGS_PERMISSIVE;
 }
 
+bool fw_devlink_is_strict(void)
+{
+	return fw_devlink_strict && !fw_devlink_is_permissive();
+}
+
 static void fw_devlink_parse_fwnode(struct fwnode_handle *fwnode)
 {
 	if (fwnode->flags & FWNODE_FLAG_LINKS_ADDED)
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 21082f11473f..d5caefe39d93 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -162,6 +162,7 @@ static inline void fwnode_init(struct fwnode_handle *fwnode,
 }
 
 extern u32 fw_devlink_get_flags(void);
+extern bool fw_devlink_is_strict(void);
 int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup);
 void fwnode_links_purge(struct fwnode_handle *fwnode);
 
-- 
cgit v1.2.3


From 74c782cff77b3533290148df1fa6f8c7db5e60d5 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Fri, 5 Feb 2021 14:26:41 -0800
Subject: driver core: fw_devlink: Handle suppliers that don't use driver core

Device links only work between devices that use the driver core to match
and bind a driver to a device. So, add an API for frameworks to let the
driver core know that a fwnode has been initialized by a driver without
using the driver core.

Then use this information to make sure that fw_devlink doesn't make the
consumers wait indefinitely on suppliers that'll never bind to a driver.

Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210205222644.2357303-6-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 15 +++++++++++++++
 include/linux/fwnode.h | 19 +++++++++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index f466ab4f1c35..ea710b33bda6 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1636,6 +1636,17 @@ static int fw_devlink_create_devlink(struct device *con,
 
 	sup_dev = get_dev_from_fwnode(sup_handle);
 	if (sup_dev) {
+		/*
+		 * If it's one of those drivers that don't actually bind to
+		 * their device using driver core, then don't wait on this
+		 * supplier device indefinitely.
+		 */
+		if (sup_dev->links.status == DL_DEV_NO_DRIVER &&
+		    sup_handle->flags & FWNODE_FLAG_INITIALIZED) {
+			ret = -EINVAL;
+			goto out;
+		}
+
 		/*
 		 * If this fails, it is due to cycles in device links.  Just
 		 * give up on this link and treat it as invalid.
@@ -1655,6 +1666,10 @@ static int fw_devlink_create_devlink(struct device *con,
 		goto out;
 	}
 
+	/* Supplier that's already initialized without a struct device. */
+	if (sup_handle->flags & FWNODE_FLAG_INITIALIZED)
+		return -EINVAL;
+
 	/*
 	 * DL_FLAG_SYNC_STATE_ONLY doesn't block probing and supports
 	 * cycles. So cycle detection isn't necessary and shouldn't be
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index d5caefe39d93..dfefd43a737c 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -11,6 +11,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/err.h>
 
 struct fwnode_operations;
 struct device;
@@ -18,11 +19,13 @@ struct device;
 /*
  * fwnode link flags
  *
- * LINKS_ADDED: The fwnode has already be parsed to add fwnode links.
- * NOT_DEVICE: The fwnode will never be populated as a struct device.
+ * LINKS_ADDED:	The fwnode has already be parsed to add fwnode links.
+ * NOT_DEVICE:	The fwnode will never be populated as a struct device.
+ * INITIALIZED: The hardware corresponding to fwnode has been initialized.
  */
 #define FWNODE_FLAG_LINKS_ADDED		BIT(0)
 #define FWNODE_FLAG_NOT_DEVICE		BIT(1)
+#define FWNODE_FLAG_INITIALIZED		BIT(2)
 
 struct fwnode_handle {
 	struct fwnode_handle *secondary;
@@ -161,6 +164,18 @@ static inline void fwnode_init(struct fwnode_handle *fwnode,
 	INIT_LIST_HEAD(&fwnode->suppliers);
 }
 
+static inline void fwnode_dev_initialized(struct fwnode_handle *fwnode,
+					  bool initialized)
+{
+	if (IS_ERR_OR_NULL(fwnode))
+		return;
+
+	if (initialized)
+		fwnode->flags |= FWNODE_FLAG_INITIALIZED;
+	else
+		fwnode->flags &= ~FWNODE_FLAG_INITIALIZED;
+}
+
 extern u32 fw_devlink_get_flags(void);
 extern bool fw_devlink_is_strict(void);
 int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup);
-- 
cgit v1.2.3


From ef3c67b6454b8f542f50387ad481633ae30874ac Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 25 Jan 2021 21:39:48 +0200
Subject: mfd: intel_msic: Remove driver for deprecated platform

Intel Moorestown and Medfield are quite old Intel Atom based
32-bit platforms, which were in limited use in some Android phones,
tablets and consumer electronics more than eight years ago.

There are no bugs or problems ever reported outside from Intel
for breaking any of that platforms for years. It seems no real
users exists who run more or less fresh kernel on it. Commit
05f4434bc130 ("ASoC: Intel: remove mfld_machine") is also in align
with this theory.

Due to above and to reduce a burden of supporting outdated drivers,
remove the support for outdated platforms completely.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS                                 |   2 -
 arch/x86/include/asm/intel_scu_ipc_legacy.h |  12 -
 drivers/mfd/Kconfig                         |   9 -
 drivers/mfd/Makefile                        |   1 -
 drivers/mfd/intel_msic.c                    | 425 --------------------------
 include/linux/mfd/intel_msic.h              | 453 ----------------------------
 6 files changed, 902 deletions(-)
 delete mode 100644 drivers/mfd/intel_msic.c
 delete mode 100644 include/linux/mfd/intel_msic.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 904eaa52f5f8..8fdb7fe27537 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9098,9 +9098,7 @@ F:	drivers/gpio/gpio-*cove.c
 INTEL PMIC MULTIFUNCTION DEVICE DRIVERS
 M:	Andy Shevchenko <andy@kernel.org>
 S:	Maintained
-F:	drivers/mfd/intel_msic.c
 F:	drivers/mfd/intel_soc_pmic*
-F:	include/linux/mfd/intel_msic.h
 F:	include/linux/mfd/intel_soc_pmic*
 
 INTEL PMT DRIVER
diff --git a/arch/x86/include/asm/intel_scu_ipc_legacy.h b/arch/x86/include/asm/intel_scu_ipc_legacy.h
index 2232197c24f8..f8b87d8face7 100644
--- a/arch/x86/include/asm/intel_scu_ipc_legacy.h
+++ b/arch/x86/include/asm/intel_scu_ipc_legacy.h
@@ -16,24 +16,12 @@
 
 /* Don't call these in new code - they will be removed eventually */
 
-/* Read single register */
-static inline int intel_scu_ipc_ioread8(u16 addr, u8 *data)
-{
-	return intel_scu_ipc_dev_ioread8(NULL, addr, data);
-}
-
 /* Read a vector */
 static inline int intel_scu_ipc_readv(u16 *addr, u8 *data, int len)
 {
 	return intel_scu_ipc_dev_readv(NULL, addr, data, len);
 }
 
-/* Write single register */
-static inline int intel_scu_ipc_iowrite8(u16 addr, u8 data)
-{
-	return intel_scu_ipc_dev_iowrite8(NULL, addr, data);
-}
-
 /* Write a vector */
 static inline int intel_scu_ipc_writev(u16 *addr, u8 *data, int len)
 {
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index bdfce7b15621..2dc58a92251b 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -659,15 +659,6 @@ config MFD_INTEL_LPSS_PCI
 	  I2C, SPI and HS-UART starting from Intel Sunrisepoint (Intel Skylake
 	  PCH) in PCI mode.
 
-config MFD_INTEL_MSIC
-	bool "Intel MSIC"
-	depends on INTEL_SCU
-	select MFD_CORE
-	help
-	  Select this option to enable access to Intel MSIC (Avatele
-	  Passage) chip. This chip embeds audio, battery, GPIO, etc.
-	  devices used in Intel Medfield platforms.
-
 config MFD_INTEL_PMC_BXT
 	tristate "Intel PMC Driver for Broxton"
 	depends on X86
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 14fdb188af02..be19bc799073 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -214,7 +214,6 @@ obj-$(CONFIG_MFD_ATMEL_SMC)	+= atmel-smc.o
 obj-$(CONFIG_MFD_INTEL_LPSS)	+= intel-lpss.o
 obj-$(CONFIG_MFD_INTEL_LPSS_PCI)	+= intel-lpss-pci.o
 obj-$(CONFIG_MFD_INTEL_LPSS_ACPI)	+= intel-lpss-acpi.o
-obj-$(CONFIG_MFD_INTEL_MSIC)	+= intel_msic.o
 obj-$(CONFIG_MFD_INTEL_PMC_BXT)	+= intel_pmc_bxt.o
 obj-$(CONFIG_MFD_INTEL_PMT)	+= intel_pmt.o
 obj-$(CONFIG_MFD_PALMAS)	+= palmas.o
diff --git a/drivers/mfd/intel_msic.c b/drivers/mfd/intel_msic.c
deleted file mode 100644
index daa772f8146b..000000000000
--- a/drivers/mfd/intel_msic.c
+++ /dev/null
@@ -1,425 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Driver for Intel MSIC
- *
- * Copyright (C) 2011, Intel Corporation
- * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- */
-
-#include <linux/err.h>
-#include <linux/gpio.h>
-#include <linux/io.h>
-#include <linux/init.h>
-#include <linux/mfd/core.h>
-#include <linux/mfd/intel_msic.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-
-#include <asm/intel_scu_ipc.h>
-
-#define MSIC_VENDOR(id)		((id >> 6) & 3)
-#define MSIC_VERSION(id)	(id & 0x3f)
-#define MSIC_MAJOR(id)		('A' + ((id >> 3) & 7))
-#define MSIC_MINOR(id)		(id & 7)
-
-/*
- * MSIC interrupt tree is readable from SRAM at INTEL_MSIC_IRQ_PHYS_BASE.
- * Since IRQ block starts from address 0x002 we need to subtract that from
- * the actual IRQ status register address.
- */
-#define MSIC_IRQ_STATUS(x)	(INTEL_MSIC_IRQ_PHYS_BASE + ((x) - 2))
-#define MSIC_IRQ_STATUS_ACCDET	MSIC_IRQ_STATUS(INTEL_MSIC_ACCDET)
-
-/*
- * The SCU hardware has limitation of 16 bytes per read/write buffer on
- * Medfield.
- */
-#define SCU_IPC_RWBUF_LIMIT	16
-
-/**
- * struct intel_msic - an MSIC MFD instance
- * @pdev: pointer to the platform device
- * @vendor: vendor ID
- * @version: chip version
- * @irq_base: base address of the mapped MSIC SRAM interrupt tree
- */
-struct intel_msic {
-	struct platform_device		*pdev;
-	unsigned			vendor;
-	unsigned			version;
-	void __iomem			*irq_base;
-};
-
-static const struct resource msic_touch_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_adc_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_battery_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_gpio_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_audio_resources[] = {
-	DEFINE_RES_IRQ_NAMED(0, "IRQ"),
-	/*
-	 * We will pass IRQ_BASE to the driver now but this can be removed
-	 * when/if the driver starts to use intel_msic_irq_read().
-	 */
-	DEFINE_RES_MEM_NAMED(MSIC_IRQ_STATUS_ACCDET, 1, "IRQ_BASE"),
-};
-
-static const struct resource msic_hdmi_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_thermal_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_power_btn_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-static const struct resource msic_ocd_resources[] = {
-	DEFINE_RES_IRQ(0),
-};
-
-/*
- * Devices that are part of the MSIC and are available via firmware
- * populated SFI DEVS table.
- */
-static struct mfd_cell msic_devs[] = {
-	[INTEL_MSIC_BLOCK_TOUCH]	= {
-		.name			= "msic_touch",
-		.num_resources		= ARRAY_SIZE(msic_touch_resources),
-		.resources		= msic_touch_resources,
-	},
-	[INTEL_MSIC_BLOCK_ADC]		= {
-		.name			= "msic_adc",
-		.num_resources		= ARRAY_SIZE(msic_adc_resources),
-		.resources		= msic_adc_resources,
-	},
-	[INTEL_MSIC_BLOCK_BATTERY]	= {
-		.name			= "msic_battery",
-		.num_resources		= ARRAY_SIZE(msic_battery_resources),
-		.resources		= msic_battery_resources,
-	},
-	[INTEL_MSIC_BLOCK_GPIO]		= {
-		.name			= "msic_gpio",
-		.num_resources		= ARRAY_SIZE(msic_gpio_resources),
-		.resources		= msic_gpio_resources,
-	},
-	[INTEL_MSIC_BLOCK_AUDIO]	= {
-		.name			= "msic_audio",
-		.num_resources		= ARRAY_SIZE(msic_audio_resources),
-		.resources		= msic_audio_resources,
-	},
-	[INTEL_MSIC_BLOCK_HDMI]		= {
-		.name			= "msic_hdmi",
-		.num_resources		= ARRAY_SIZE(msic_hdmi_resources),
-		.resources		= msic_hdmi_resources,
-	},
-	[INTEL_MSIC_BLOCK_THERMAL]	= {
-		.name			= "msic_thermal",
-		.num_resources		= ARRAY_SIZE(msic_thermal_resources),
-		.resources		= msic_thermal_resources,
-	},
-	[INTEL_MSIC_BLOCK_POWER_BTN]	= {
-		.name			= "msic_power_btn",
-		.num_resources		= ARRAY_SIZE(msic_power_btn_resources),
-		.resources		= msic_power_btn_resources,
-	},
-	[INTEL_MSIC_BLOCK_OCD]		= {
-		.name			= "msic_ocd",
-		.num_resources		= ARRAY_SIZE(msic_ocd_resources),
-		.resources		= msic_ocd_resources,
-	},
-};
-
-/*
- * Other MSIC related devices which are not directly available via SFI DEVS
- * table. These can be pseudo devices, regulators etc. which are needed for
- * different purposes.
- *
- * These devices appear only after the MSIC driver itself is initialized so
- * we can guarantee that the SCU IPC interface is ready.
- */
-static const struct mfd_cell msic_other_devs[] = {
-	/* Audio codec in the MSIC */
-	{
-		.id			= -1,
-		.name			= "sn95031",
-	},
-};
-
-/**
- * intel_msic_reg_read - read a single MSIC register
- * @reg: register to read
- * @val: register value is placed here
- *
- * Read a single register from MSIC. Returns %0 on success and negative
- * errno in case of failure.
- *
- * Function may sleep.
- */
-int intel_msic_reg_read(unsigned short reg, u8 *val)
-{
-	return intel_scu_ipc_ioread8(reg, val);
-}
-EXPORT_SYMBOL_GPL(intel_msic_reg_read);
-
-/**
- * intel_msic_reg_write - write a single MSIC register
- * @reg: register to write
- * @val: value to write to that register
- *
- * Write a single MSIC register. Returns 0 on success and negative
- * errno in case of failure.
- *
- * Function may sleep.
- */
-int intel_msic_reg_write(unsigned short reg, u8 val)
-{
-	return intel_scu_ipc_iowrite8(reg, val);
-}
-EXPORT_SYMBOL_GPL(intel_msic_reg_write);
-
-/**
- * intel_msic_reg_update - update a single MSIC register
- * @reg: register to update
- * @val: value to write to the register
- * @mask: specifies which of the bits are updated (%0 = don't update,
- *        %1 = update)
- *
- * Perform an update to a register @reg. @mask is used to specify which
- * bits are updated. Returns %0 in case of success and negative errno in
- * case of failure.
- *
- * Function may sleep.
- */
-int intel_msic_reg_update(unsigned short reg, u8 val, u8 mask)
-{
-	return intel_scu_ipc_update_register(reg, val, mask);
-}
-EXPORT_SYMBOL_GPL(intel_msic_reg_update);
-
-/**
- * intel_msic_bulk_read - read an array of registers
- * @reg: array of register addresses to read
- * @buf: array where the read values are placed
- * @count: number of registers to read
- *
- * Function reads @count registers from the MSIC using addresses passed in
- * @reg. Read values are placed in @buf. Reads are performed atomically
- * wrt. MSIC.
- *
- * Returns %0 in case of success and negative errno in case of failure.
- *
- * Function may sleep.
- */
-int intel_msic_bulk_read(unsigned short *reg, u8 *buf, size_t count)
-{
-	if (WARN_ON(count > SCU_IPC_RWBUF_LIMIT))
-		return -EINVAL;
-
-	return intel_scu_ipc_readv(reg, buf, count);
-}
-EXPORT_SYMBOL_GPL(intel_msic_bulk_read);
-
-/**
- * intel_msic_bulk_write - write an array of values to the MSIC registers
- * @reg: array of registers to write
- * @buf: values to write to each register
- * @count: number of registers to write
- *
- * Function writes @count registers in @buf to MSIC. Writes are performed
- * atomically wrt MSIC. Returns %0 in case of success and negative errno in
- * case of failure.
- *
- * Function may sleep.
- */
-int intel_msic_bulk_write(unsigned short *reg, u8 *buf, size_t count)
-{
-	if (WARN_ON(count > SCU_IPC_RWBUF_LIMIT))
-		return -EINVAL;
-
-	return intel_scu_ipc_writev(reg, buf, count);
-}
-EXPORT_SYMBOL_GPL(intel_msic_bulk_write);
-
-/**
- * intel_msic_irq_read - read a register from an MSIC interrupt tree
- * @msic: MSIC instance
- * @reg: interrupt register (between %INTEL_MSIC_IRQLVL1 and
- *	 %INTEL_MSIC_RESETIRQ2)
- * @val: value of the register is placed here
- *
- * This function can be used by an MSIC subdevice interrupt handler to read
- * a register value from the MSIC interrupt tree. In this way subdevice
- * drivers don't have to map in the interrupt tree themselves but can just
- * call this function instead.
- *
- * Function doesn't sleep and is callable from interrupt context.
- *
- * Returns %-EINVAL if @reg is outside of the allowed register region.
- */
-int intel_msic_irq_read(struct intel_msic *msic, unsigned short reg, u8 *val)
-{
-	if (WARN_ON(reg < INTEL_MSIC_IRQLVL1 || reg > INTEL_MSIC_RESETIRQ2))
-		return -EINVAL;
-
-	*val = readb(msic->irq_base + (reg - INTEL_MSIC_IRQLVL1));
-	return 0;
-}
-EXPORT_SYMBOL_GPL(intel_msic_irq_read);
-
-static int intel_msic_init_devices(struct intel_msic *msic)
-{
-	struct platform_device *pdev = msic->pdev;
-	struct intel_msic_platform_data *pdata = dev_get_platdata(&pdev->dev);
-	int ret, i;
-
-	if (pdata->gpio) {
-		struct mfd_cell *cell = &msic_devs[INTEL_MSIC_BLOCK_GPIO];
-
-		cell->platform_data = pdata->gpio;
-		cell->pdata_size = sizeof(*pdata->gpio);
-	}
-
-	if (pdata->ocd) {
-		unsigned gpio = pdata->ocd->gpio;
-
-		ret = devm_gpio_request_one(&pdev->dev, gpio,
-					GPIOF_IN, "ocd_gpio");
-		if (ret) {
-			dev_err(&pdev->dev, "failed to register OCD GPIO\n");
-			return ret;
-		}
-
-		ret = gpio_to_irq(gpio);
-		if (ret < 0) {
-			dev_err(&pdev->dev, "no IRQ number for OCD GPIO\n");
-			return ret;
-		}
-
-		/* Update the IRQ number for the OCD */
-		pdata->irq[INTEL_MSIC_BLOCK_OCD] = ret;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(msic_devs); i++) {
-		if (!pdata->irq[i])
-			continue;
-
-		ret = mfd_add_devices(&pdev->dev, -1, &msic_devs[i], 1, NULL,
-				      pdata->irq[i], NULL);
-		if (ret)
-			goto fail;
-	}
-
-	ret = mfd_add_devices(&pdev->dev, 0, msic_other_devs,
-			      ARRAY_SIZE(msic_other_devs), NULL, 0, NULL);
-	if (ret)
-		goto fail;
-
-	return 0;
-
-fail:
-	mfd_remove_devices(&pdev->dev);
-
-	return ret;
-}
-
-static void intel_msic_remove_devices(struct intel_msic *msic)
-{
-	struct platform_device *pdev = msic->pdev;
-
-	mfd_remove_devices(&pdev->dev);
-}
-
-static int intel_msic_probe(struct platform_device *pdev)
-{
-	struct intel_msic_platform_data *pdata = dev_get_platdata(&pdev->dev);
-	struct intel_msic *msic;
-	struct resource *res;
-	u8 id0, id1;
-	int ret;
-
-	if (!pdata) {
-		dev_err(&pdev->dev, "no platform data passed\n");
-		return -EINVAL;
-	}
-
-	/* First validate that we have an MSIC in place */
-	ret = intel_scu_ipc_ioread8(INTEL_MSIC_ID0, &id0);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to identify the MSIC chip (ID0)\n");
-		return -ENXIO;
-	}
-
-	ret = intel_scu_ipc_ioread8(INTEL_MSIC_ID1, &id1);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to identify the MSIC chip (ID1)\n");
-		return -ENXIO;
-	}
-
-	if (MSIC_VENDOR(id0) != MSIC_VENDOR(id1)) {
-		dev_err(&pdev->dev, "invalid vendor ID: %x, %x\n", id0, id1);
-		return -ENXIO;
-	}
-
-	msic = devm_kzalloc(&pdev->dev, sizeof(*msic), GFP_KERNEL);
-	if (!msic)
-		return -ENOMEM;
-
-	msic->vendor = MSIC_VENDOR(id0);
-	msic->version = MSIC_VERSION(id0);
-	msic->pdev = pdev;
-
-	/*
-	 * Map in the MSIC interrupt tree area in SRAM. This is exposed to
-	 * the clients via intel_msic_irq_read().
-	 */
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	msic->irq_base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(msic->irq_base))
-		return PTR_ERR(msic->irq_base);
-
-	platform_set_drvdata(pdev, msic);
-
-	ret = intel_msic_init_devices(msic);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize MSIC devices\n");
-		return ret;
-	}
-
-	dev_info(&pdev->dev, "Intel MSIC version %c%d (vendor %#x)\n",
-		 MSIC_MAJOR(msic->version), MSIC_MINOR(msic->version),
-		 msic->vendor);
-
-	return 0;
-}
-
-static int intel_msic_remove(struct platform_device *pdev)
-{
-	struct intel_msic *msic = platform_get_drvdata(pdev);
-
-	intel_msic_remove_devices(msic);
-
-	return 0;
-}
-
-static struct platform_driver intel_msic_driver = {
-	.probe		= intel_msic_probe,
-	.remove		= intel_msic_remove,
-	.driver		= {
-		.name	= "intel_msic",
-	},
-};
-builtin_platform_driver(intel_msic_driver);
diff --git a/include/linux/mfd/intel_msic.h b/include/linux/mfd/intel_msic.h
deleted file mode 100644
index 317e8608cf41..000000000000
--- a/include/linux/mfd/intel_msic.h
+++ /dev/null
@@ -1,453 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Core interface for Intel MSIC
- *
- * Copyright (C) 2011, Intel Corporation
- * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- */
-
-#ifndef __LINUX_MFD_INTEL_MSIC_H__
-#define __LINUX_MFD_INTEL_MSIC_H__
-
-/* ID */
-#define INTEL_MSIC_ID0			0x000	/* RO */
-#define INTEL_MSIC_ID1			0x001	/* RO */
-
-/* IRQ */
-#define INTEL_MSIC_IRQLVL1		0x002
-#define INTEL_MSIC_ADC1INT		0x003
-#define INTEL_MSIC_CCINT		0x004
-#define INTEL_MSIC_PWRSRCINT		0x005
-#define INTEL_MSIC_PWRSRCINT1		0x006
-#define INTEL_MSIC_CHRINT		0x007
-#define INTEL_MSIC_CHRINT1		0x008
-#define INTEL_MSIC_RTCIRQ		0x009
-#define INTEL_MSIC_GPIO0LVIRQ		0x00a
-#define INTEL_MSIC_GPIO1LVIRQ		0x00b
-#define INTEL_MSIC_GPIOHVIRQ		0x00c
-#define INTEL_MSIC_VRINT		0x00d
-#define INTEL_MSIC_OCAUDIO		0x00e
-#define INTEL_MSIC_ACCDET		0x00f
-#define INTEL_MSIC_RESETIRQ1		0x010
-#define INTEL_MSIC_RESETIRQ2		0x011
-#define INTEL_MSIC_MADC1INT		0x012
-#define INTEL_MSIC_MCCINT		0x013
-#define INTEL_MSIC_MPWRSRCINT		0x014
-#define INTEL_MSIC_MPWRSRCINT1		0x015
-#define INTEL_MSIC_MCHRINT		0x016
-#define INTEL_MSIC_MCHRINT1		0x017
-#define INTEL_MSIC_RTCIRQMASK		0x018
-#define INTEL_MSIC_GPIO0LVIRQMASK	0x019
-#define INTEL_MSIC_GPIO1LVIRQMASK	0x01a
-#define INTEL_MSIC_GPIOHVIRQMASK	0x01b
-#define INTEL_MSIC_VRINTMASK		0x01c
-#define INTEL_MSIC_OCAUDIOMASK		0x01d
-#define INTEL_MSIC_ACCDETMASK		0x01e
-#define INTEL_MSIC_RESETIRQ1MASK	0x01f
-#define INTEL_MSIC_RESETIRQ2MASK	0x020
-#define INTEL_MSIC_IRQLVL1MSK		0x021
-#define INTEL_MSIC_PBCONFIG		0x03e
-#define INTEL_MSIC_PBSTATUS		0x03f	/* RO */
-
-/* GPIO */
-#define INTEL_MSIC_GPIO0LV7CTLO		0x040
-#define INTEL_MSIC_GPIO0LV6CTLO		0x041
-#define INTEL_MSIC_GPIO0LV5CTLO		0x042
-#define INTEL_MSIC_GPIO0LV4CTLO		0x043
-#define INTEL_MSIC_GPIO0LV3CTLO		0x044
-#define INTEL_MSIC_GPIO0LV2CTLO		0x045
-#define INTEL_MSIC_GPIO0LV1CTLO		0x046
-#define INTEL_MSIC_GPIO0LV0CTLO		0x047
-#define INTEL_MSIC_GPIO1LV7CTLOS	0x048
-#define INTEL_MSIC_GPIO1LV6CTLO		0x049
-#define INTEL_MSIC_GPIO1LV5CTLO		0x04a
-#define INTEL_MSIC_GPIO1LV4CTLO		0x04b
-#define INTEL_MSIC_GPIO1LV3CTLO		0x04c
-#define INTEL_MSIC_GPIO1LV2CTLO		0x04d
-#define INTEL_MSIC_GPIO1LV1CTLO		0x04e
-#define INTEL_MSIC_GPIO1LV0CTLO		0x04f
-#define INTEL_MSIC_GPIO0LV7CTLI		0x050
-#define INTEL_MSIC_GPIO0LV6CTLI		0x051
-#define INTEL_MSIC_GPIO0LV5CTLI		0x052
-#define INTEL_MSIC_GPIO0LV4CTLI		0x053
-#define INTEL_MSIC_GPIO0LV3CTLI		0x054
-#define INTEL_MSIC_GPIO0LV2CTLI		0x055
-#define INTEL_MSIC_GPIO0LV1CTLI		0x056
-#define INTEL_MSIC_GPIO0LV0CTLI		0x057
-#define INTEL_MSIC_GPIO1LV7CTLIS	0x058
-#define INTEL_MSIC_GPIO1LV6CTLI		0x059
-#define INTEL_MSIC_GPIO1LV5CTLI		0x05a
-#define INTEL_MSIC_GPIO1LV4CTLI		0x05b
-#define INTEL_MSIC_GPIO1LV3CTLI		0x05c
-#define INTEL_MSIC_GPIO1LV2CTLI		0x05d
-#define INTEL_MSIC_GPIO1LV1CTLI		0x05e
-#define INTEL_MSIC_GPIO1LV0CTLI		0x05f
-#define INTEL_MSIC_PWM0CLKDIV1		0x061
-#define INTEL_MSIC_PWM0CLKDIV0		0x062
-#define INTEL_MSIC_PWM1CLKDIV1		0x063
-#define INTEL_MSIC_PWM1CLKDIV0		0x064
-#define INTEL_MSIC_PWM2CLKDIV1		0x065
-#define INTEL_MSIC_PWM2CLKDIV0		0x066
-#define INTEL_MSIC_PWM0DUTYCYCLE	0x067
-#define INTEL_MSIC_PWM1DUTYCYCLE	0x068
-#define INTEL_MSIC_PWM2DUTYCYCLE	0x069
-#define INTEL_MSIC_GPIO0HV3CTLO		0x06d
-#define INTEL_MSIC_GPIO0HV2CTLO		0x06e
-#define INTEL_MSIC_GPIO0HV1CTLO		0x06f
-#define INTEL_MSIC_GPIO0HV0CTLO		0x070
-#define INTEL_MSIC_GPIO1HV3CTLO		0x071
-#define INTEL_MSIC_GPIO1HV2CTLO		0x072
-#define INTEL_MSIC_GPIO1HV1CTLO		0x073
-#define INTEL_MSIC_GPIO1HV0CTLO		0x074
-#define INTEL_MSIC_GPIO0HV3CTLI		0x075
-#define INTEL_MSIC_GPIO0HV2CTLI		0x076
-#define INTEL_MSIC_GPIO0HV1CTLI		0x077
-#define INTEL_MSIC_GPIO0HV0CTLI		0x078
-#define INTEL_MSIC_GPIO1HV3CTLI		0x079
-#define INTEL_MSIC_GPIO1HV2CTLI		0x07a
-#define INTEL_MSIC_GPIO1HV1CTLI		0x07b
-#define INTEL_MSIC_GPIO1HV0CTLI		0x07c
-
-/* SVID */
-#define INTEL_MSIC_SVIDCTRL0		0x080
-#define INTEL_MSIC_SVIDCTRL1		0x081
-#define INTEL_MSIC_SVIDCTRL2		0x082
-#define INTEL_MSIC_SVIDTXLASTPKT3	0x083	/* RO */
-#define INTEL_MSIC_SVIDTXLASTPKT2	0x084	/* RO */
-#define INTEL_MSIC_SVIDTXLASTPKT1	0x085	/* RO */
-#define INTEL_MSIC_SVIDTXLASTPKT0	0x086	/* RO */
-#define INTEL_MSIC_SVIDPKTOUTBYTE3	0x087
-#define INTEL_MSIC_SVIDPKTOUTBYTE2	0x088
-#define INTEL_MSIC_SVIDPKTOUTBYTE1	0x089
-#define INTEL_MSIC_SVIDPKTOUTBYTE0	0x08a
-#define INTEL_MSIC_SVIDRXVPDEBUG1	0x08b
-#define INTEL_MSIC_SVIDRXVPDEBUG0	0x08c
-#define INTEL_MSIC_SVIDRXLASTPKT3	0x08d	/* RO */
-#define INTEL_MSIC_SVIDRXLASTPKT2	0x08e	/* RO */
-#define INTEL_MSIC_SVIDRXLASTPKT1	0x08f	/* RO */
-#define INTEL_MSIC_SVIDRXLASTPKT0	0x090	/* RO */
-#define INTEL_MSIC_SVIDRXCHKSTATUS3	0x091	/* RO */
-#define INTEL_MSIC_SVIDRXCHKSTATUS2	0x092	/* RO */
-#define INTEL_MSIC_SVIDRXCHKSTATUS1	0x093	/* RO */
-#define INTEL_MSIC_SVIDRXCHKSTATUS0	0x094	/* RO */
-
-/* VREG */
-#define INTEL_MSIC_VCCLATCH		0x0c0
-#define INTEL_MSIC_VNNLATCH		0x0c1
-#define INTEL_MSIC_VCCCNT		0x0c2
-#define INTEL_MSIC_SMPSRAMP		0x0c3
-#define INTEL_MSIC_VNNCNT		0x0c4
-#define INTEL_MSIC_VNNAONCNT		0x0c5
-#define INTEL_MSIC_VCC122AONCNT		0x0c6
-#define INTEL_MSIC_V180AONCNT		0x0c7
-#define INTEL_MSIC_V500CNT		0x0c8
-#define INTEL_MSIC_VIHFCNT		0x0c9
-#define INTEL_MSIC_LDORAMP1		0x0ca
-#define INTEL_MSIC_LDORAMP2		0x0cb
-#define INTEL_MSIC_VCC108AONCNT		0x0cc
-#define INTEL_MSIC_VCC108ASCNT		0x0cd
-#define INTEL_MSIC_VCC108CNT		0x0ce
-#define INTEL_MSIC_VCCA100ASCNT		0x0cf
-#define INTEL_MSIC_VCCA100CNT		0x0d0
-#define INTEL_MSIC_VCC180AONCNT		0x0d1
-#define INTEL_MSIC_VCC180CNT		0x0d2
-#define INTEL_MSIC_VCC330CNT		0x0d3
-#define INTEL_MSIC_VUSB330CNT		0x0d4
-#define INTEL_MSIC_VCCSDIOCNT		0x0d5
-#define INTEL_MSIC_VPROG1CNT		0x0d6
-#define INTEL_MSIC_VPROG2CNT		0x0d7
-#define INTEL_MSIC_VEMMCSCNT		0x0d8
-#define INTEL_MSIC_VEMMC1CNT		0x0d9
-#define INTEL_MSIC_VEMMC2CNT		0x0da
-#define INTEL_MSIC_VAUDACNT		0x0db
-#define INTEL_MSIC_VHSPCNT		0x0dc
-#define INTEL_MSIC_VHSNCNT		0x0dd
-#define INTEL_MSIC_VHDMICNT		0x0de
-#define INTEL_MSIC_VOTGCNT		0x0df
-#define INTEL_MSIC_V1P35CNT		0x0e0
-#define INTEL_MSIC_V330AONCNT		0x0e1
-
-/* RESET */
-#define INTEL_MSIC_CHIPCNTRL		0x100	/* WO */
-#define INTEL_MSIC_ERCONFIG		0x101
-
-/* BURST */
-#define INTEL_MSIC_BATCURRENTLIMIT12	0x102
-#define INTEL_MSIC_BATTIMELIMIT12	0x103
-#define INTEL_MSIC_BATTIMELIMIT3	0x104
-#define INTEL_MSIC_BATTIMEDB		0x105
-#define INTEL_MSIC_BRSTCONFIGOUTPUTS	0x106
-#define INTEL_MSIC_BRSTCONFIGACTIONS	0x107
-#define INTEL_MSIC_BURSTCONTROLSTATUS	0x108
-
-/* RTC */
-#define INTEL_MSIC_RTCB1		0x140	/* RO */
-#define INTEL_MSIC_RTCB2		0x141	/* RO */
-#define INTEL_MSIC_RTCB3		0x142	/* RO */
-#define INTEL_MSIC_RTCB4		0x143	/* RO */
-#define INTEL_MSIC_RTCOB1		0x144
-#define INTEL_MSIC_RTCOB2		0x145
-#define INTEL_MSIC_RTCOB3		0x146
-#define INTEL_MSIC_RTCOB4		0x147
-#define INTEL_MSIC_RTCAB1		0x148
-#define INTEL_MSIC_RTCAB2		0x149
-#define INTEL_MSIC_RTCAB3		0x14a
-#define INTEL_MSIC_RTCAB4		0x14b
-#define INTEL_MSIC_RTCWAB1		0x14c
-#define INTEL_MSIC_RTCWAB2		0x14d
-#define INTEL_MSIC_RTCWAB3		0x14e
-#define INTEL_MSIC_RTCWAB4		0x14f
-#define INTEL_MSIC_RTCSC1		0x150
-#define INTEL_MSIC_RTCSC2		0x151
-#define INTEL_MSIC_RTCSC3		0x152
-#define INTEL_MSIC_RTCSC4		0x153
-#define INTEL_MSIC_RTCSTATUS		0x154	/* RO */
-#define INTEL_MSIC_RTCCONFIG1		0x155
-#define INTEL_MSIC_RTCCONFIG2		0x156
-
-/* CHARGER */
-#define INTEL_MSIC_BDTIMER		0x180
-#define INTEL_MSIC_BATTRMV		0x181
-#define INTEL_MSIC_VBUSDET		0x182
-#define INTEL_MSIC_VBUSDET1		0x183
-#define INTEL_MSIC_ADPHVDET		0x184
-#define INTEL_MSIC_ADPLVDET		0x185
-#define INTEL_MSIC_ADPDETDBDM		0x186
-#define INTEL_MSIC_LOWBATTDET		0x187
-#define INTEL_MSIC_CHRCTRL		0x188
-#define INTEL_MSIC_CHRCVOLTAGE		0x189
-#define INTEL_MSIC_CHRCCURRENT		0x18a
-#define INTEL_MSIC_SPCHARGER		0x18b
-#define INTEL_MSIC_CHRTTIME		0x18c
-#define INTEL_MSIC_CHRCTRL1		0x18d
-#define INTEL_MSIC_PWRSRCLMT		0x18e
-#define INTEL_MSIC_CHRSTWDT		0x18f
-#define INTEL_MSIC_WDTWRITE		0x190	/* WO */
-#define INTEL_MSIC_CHRSAFELMT		0x191
-#define INTEL_MSIC_SPWRSRCINT		0x192	/* RO */
-#define INTEL_MSIC_SPWRSRCINT1		0x193	/* RO */
-#define INTEL_MSIC_CHRLEDPWM		0x194
-#define INTEL_MSIC_CHRLEDCTRL		0x195
-
-/* ADC */
-#define INTEL_MSIC_ADC1CNTL1		0x1c0
-#define INTEL_MSIC_ADC1CNTL2		0x1c1
-#define INTEL_MSIC_ADC1CNTL3		0x1c2
-#define INTEL_MSIC_ADC1OFFSETH		0x1c3	/* RO */
-#define INTEL_MSIC_ADC1OFFSETL		0x1c4	/* RO */
-#define INTEL_MSIC_ADC1ADDR0		0x1c5
-#define INTEL_MSIC_ADC1ADDR1		0x1c6
-#define INTEL_MSIC_ADC1ADDR2		0x1c7
-#define INTEL_MSIC_ADC1ADDR3		0x1c8
-#define INTEL_MSIC_ADC1ADDR4		0x1c9
-#define INTEL_MSIC_ADC1ADDR5		0x1ca
-#define INTEL_MSIC_ADC1ADDR6		0x1cb
-#define INTEL_MSIC_ADC1ADDR7		0x1cc
-#define INTEL_MSIC_ADC1ADDR8		0x1cd
-#define INTEL_MSIC_ADC1ADDR9		0x1ce
-#define INTEL_MSIC_ADC1ADDR10		0x1cf
-#define INTEL_MSIC_ADC1ADDR11		0x1d0
-#define INTEL_MSIC_ADC1ADDR12		0x1d1
-#define INTEL_MSIC_ADC1ADDR13		0x1d2
-#define INTEL_MSIC_ADC1ADDR14		0x1d3
-#define INTEL_MSIC_ADC1SNS0H		0x1d4	/* RO */
-#define INTEL_MSIC_ADC1SNS0L		0x1d5	/* RO */
-#define INTEL_MSIC_ADC1SNS1H		0x1d6	/* RO */
-#define INTEL_MSIC_ADC1SNS1L		0x1d7	/* RO */
-#define INTEL_MSIC_ADC1SNS2H		0x1d8	/* RO */
-#define INTEL_MSIC_ADC1SNS2L		0x1d9	/* RO */
-#define INTEL_MSIC_ADC1SNS3H		0x1da	/* RO */
-#define INTEL_MSIC_ADC1SNS3L		0x1db	/* RO */
-#define INTEL_MSIC_ADC1SNS4H		0x1dc	/* RO */
-#define INTEL_MSIC_ADC1SNS4L		0x1dd	/* RO */
-#define INTEL_MSIC_ADC1SNS5H		0x1de	/* RO */
-#define INTEL_MSIC_ADC1SNS5L		0x1df	/* RO */
-#define INTEL_MSIC_ADC1SNS6H		0x1e0	/* RO */
-#define INTEL_MSIC_ADC1SNS6L		0x1e1	/* RO */
-#define INTEL_MSIC_ADC1SNS7H		0x1e2	/* RO */
-#define INTEL_MSIC_ADC1SNS7L		0x1e3	/* RO */
-#define INTEL_MSIC_ADC1SNS8H		0x1e4	/* RO */
-#define INTEL_MSIC_ADC1SNS8L		0x1e5	/* RO */
-#define INTEL_MSIC_ADC1SNS9H		0x1e6	/* RO */
-#define INTEL_MSIC_ADC1SNS9L		0x1e7	/* RO */
-#define INTEL_MSIC_ADC1SNS10H		0x1e8	/* RO */
-#define INTEL_MSIC_ADC1SNS10L		0x1e9	/* RO */
-#define INTEL_MSIC_ADC1SNS11H		0x1ea	/* RO */
-#define INTEL_MSIC_ADC1SNS11L		0x1eb	/* RO */
-#define INTEL_MSIC_ADC1SNS12H		0x1ec	/* RO */
-#define INTEL_MSIC_ADC1SNS12L		0x1ed	/* RO */
-#define INTEL_MSIC_ADC1SNS13H		0x1ee	/* RO */
-#define INTEL_MSIC_ADC1SNS13L		0x1ef	/* RO */
-#define INTEL_MSIC_ADC1SNS14H		0x1f0	/* RO */
-#define INTEL_MSIC_ADC1SNS14L		0x1f1	/* RO */
-#define INTEL_MSIC_ADC1BV0H		0x1f2	/* RO */
-#define INTEL_MSIC_ADC1BV0L		0x1f3	/* RO */
-#define INTEL_MSIC_ADC1BV1H		0x1f4	/* RO */
-#define INTEL_MSIC_ADC1BV1L		0x1f5	/* RO */
-#define INTEL_MSIC_ADC1BV2H		0x1f6	/* RO */
-#define INTEL_MSIC_ADC1BV2L		0x1f7	/* RO */
-#define INTEL_MSIC_ADC1BV3H		0x1f8	/* RO */
-#define INTEL_MSIC_ADC1BV3L		0x1f9	/* RO */
-#define INTEL_MSIC_ADC1BI0H		0x1fa	/* RO */
-#define INTEL_MSIC_ADC1BI0L		0x1fb	/* RO */
-#define INTEL_MSIC_ADC1BI1H		0x1fc	/* RO */
-#define INTEL_MSIC_ADC1BI1L		0x1fd	/* RO */
-#define INTEL_MSIC_ADC1BI2H		0x1fe	/* RO */
-#define INTEL_MSIC_ADC1BI2L		0x1ff	/* RO */
-#define INTEL_MSIC_ADC1BI3H		0x200	/* RO */
-#define INTEL_MSIC_ADC1BI3L		0x201	/* RO */
-#define INTEL_MSIC_CCCNTL		0x202
-#define INTEL_MSIC_CCOFFSETH		0x203	/* RO */
-#define INTEL_MSIC_CCOFFSETL		0x204	/* RO */
-#define INTEL_MSIC_CCADCHA		0x205	/* RO */
-#define INTEL_MSIC_CCADCLA		0x206	/* RO */
-
-/* AUDIO */
-#define INTEL_MSIC_AUDPLLCTRL		0x240
-#define INTEL_MSIC_DMICBUF0123		0x241
-#define INTEL_MSIC_DMICBUF45		0x242
-#define INTEL_MSIC_DMICGPO		0x244
-#define INTEL_MSIC_DMICMUX		0x245
-#define INTEL_MSIC_DMICCLK		0x246
-#define INTEL_MSIC_MICBIAS		0x247
-#define INTEL_MSIC_ADCCONFIG		0x248
-#define INTEL_MSIC_MICAMP1		0x249
-#define INTEL_MSIC_MICAMP2		0x24a
-#define INTEL_MSIC_NOISEMUX		0x24b
-#define INTEL_MSIC_AUDIOMUX12		0x24c
-#define INTEL_MSIC_AUDIOMUX34		0x24d
-#define INTEL_MSIC_AUDIOSINC		0x24e
-#define INTEL_MSIC_AUDIOTXEN		0x24f
-#define INTEL_MSIC_HSEPRXCTRL		0x250
-#define INTEL_MSIC_IHFRXCTRL		0x251
-#define INTEL_MSIC_VOICETXVOL		0x252
-#define INTEL_MSIC_SIDETONEVOL		0x253
-#define INTEL_MSIC_MUSICSHARVOL		0x254
-#define INTEL_MSIC_VOICETXCTRL		0x255
-#define INTEL_MSIC_HSMIXER		0x256
-#define INTEL_MSIC_DACCONFIG		0x257
-#define INTEL_MSIC_SOFTMUTE		0x258
-#define INTEL_MSIC_HSLVOLCTRL		0x259
-#define INTEL_MSIC_HSRVOLCTRL		0x25a
-#define INTEL_MSIC_IHFLVOLCTRL		0x25b
-#define INTEL_MSIC_IHFRVOLCTRL		0x25c
-#define INTEL_MSIC_DRIVEREN		0x25d
-#define INTEL_MSIC_LINEOUTCTRL		0x25e
-#define INTEL_MSIC_VIB1CTRL1		0x25f
-#define INTEL_MSIC_VIB1CTRL2		0x260
-#define INTEL_MSIC_VIB1CTRL3		0x261
-#define INTEL_MSIC_VIB1SPIPCM_1		0x262
-#define INTEL_MSIC_VIB1SPIPCM_2		0x263
-#define INTEL_MSIC_VIB1CTRL5		0x264
-#define INTEL_MSIC_VIB2CTRL1		0x265
-#define INTEL_MSIC_VIB2CTRL2		0x266
-#define INTEL_MSIC_VIB2CTRL3		0x267
-#define INTEL_MSIC_VIB2SPIPCM_1		0x268
-#define INTEL_MSIC_VIB2SPIPCM_2		0x269
-#define INTEL_MSIC_VIB2CTRL5		0x26a
-#define INTEL_MSIC_BTNCTRL1		0x26b
-#define INTEL_MSIC_BTNCTRL2		0x26c
-#define INTEL_MSIC_PCM1TXSLOT01		0x26d
-#define INTEL_MSIC_PCM1TXSLOT23		0x26e
-#define INTEL_MSIC_PCM1TXSLOT45		0x26f
-#define INTEL_MSIC_PCM1RXSLOT0123	0x270
-#define INTEL_MSIC_PCM1RXSLOT045	0x271
-#define INTEL_MSIC_PCM2TXSLOT01		0x272
-#define INTEL_MSIC_PCM2TXSLOT23		0x273
-#define INTEL_MSIC_PCM2TXSLOT45		0x274
-#define INTEL_MSIC_PCM2RXSLOT01		0x275
-#define INTEL_MSIC_PCM2RXSLOT23		0x276
-#define INTEL_MSIC_PCM2RXSLOT45		0x277
-#define INTEL_MSIC_PCM1CTRL1		0x278
-#define INTEL_MSIC_PCM1CTRL2		0x279
-#define INTEL_MSIC_PCM1CTRL3		0x27a
-#define INTEL_MSIC_PCM2CTRL1		0x27b
-#define INTEL_MSIC_PCM2CTRL2		0x27c
-
-/* HDMI */
-#define INTEL_MSIC_HDMIPUEN		0x280
-#define INTEL_MSIC_HDMISTATUS		0x281	/* RO */
-
-/* Physical address of the start of the MSIC interrupt tree in SRAM */
-#define INTEL_MSIC_IRQ_PHYS_BASE	0xffff7fc0
-
-/**
- * struct intel_msic_gpio_pdata - platform data for the MSIC GPIO driver
- * @gpio_base: base number for the GPIOs
- */
-struct intel_msic_gpio_pdata {
-	unsigned	gpio_base;
-};
-
-/**
- * struct intel_msic_ocd_pdata - platform data for the MSIC OCD driver
- * @gpio: GPIO number used for OCD interrupts
- *
- * The MSIC MFD driver converts @gpio into an IRQ number and passes it to
- * the OCD driver as %IORESOURCE_IRQ.
- */
-struct intel_msic_ocd_pdata {
-	unsigned	gpio;
-};
-
-/* MSIC embedded blocks (subdevices) */
-enum intel_msic_block {
-	INTEL_MSIC_BLOCK_TOUCH,
-	INTEL_MSIC_BLOCK_ADC,
-	INTEL_MSIC_BLOCK_BATTERY,
-	INTEL_MSIC_BLOCK_GPIO,
-	INTEL_MSIC_BLOCK_AUDIO,
-	INTEL_MSIC_BLOCK_HDMI,
-	INTEL_MSIC_BLOCK_THERMAL,
-	INTEL_MSIC_BLOCK_POWER_BTN,
-	INTEL_MSIC_BLOCK_OCD,
-
-	INTEL_MSIC_BLOCK_LAST,
-};
-
-/**
- * struct intel_msic_platform_data - platform data for the MSIC driver
- * @irq: array of interrupt numbers, one per device. If @irq is set to %0
- *	 for a given block, the corresponding platform device is not
- *	 created. For devices which don't have an interrupt, use %0xff
- *	 (this is same as in SFI spec).
- * @gpio: platform data for the MSIC GPIO driver
- * @ocd: platform data for the MSIC OCD driver
- *
- * Once the MSIC driver is initialized, the register interface is ready to
- * use. All the platform devices for subdevices are created after the
- * register interface is ready so that we can guarantee its availability to
- * the subdevice drivers.
- *
- * Interrupt numbers are passed to the subdevices via %IORESOURCE_IRQ
- * resources of the created platform device.
- */
-struct intel_msic_platform_data {
-	int				irq[INTEL_MSIC_BLOCK_LAST];
-	struct intel_msic_gpio_pdata	*gpio;
-	struct intel_msic_ocd_pdata	*ocd;
-};
-
-struct intel_msic;
-
-extern int intel_msic_reg_read(unsigned short reg, u8 *val);
-extern int intel_msic_reg_write(unsigned short reg, u8 val);
-extern int intel_msic_reg_update(unsigned short reg, u8 val, u8 mask);
-extern int intel_msic_bulk_read(unsigned short *reg, u8 *buf, size_t count);
-extern int intel_msic_bulk_write(unsigned short *reg, u8 *buf, size_t count);
-
-/*
- * pdev_to_intel_msic - gets an MSIC instance from the platform device
- * @pdev: platform device pointer
- *
- * The client drivers need to have pointer to the MSIC instance if they
- * want to call intel_msic_irq_read(). This macro can be used for
- * convenience to get the MSIC pointer from @pdev where needed. This is
- * _only_ valid for devices which are managed by the MSIC.
- */
-#define pdev_to_intel_msic(pdev)	(dev_get_drvdata(pdev->dev.parent))
-
-extern int intel_msic_irq_read(struct intel_msic *msic, unsigned short reg,
-			       u8 *val);
-
-#endif /* __LINUX_MFD_INTEL_MSIC_H__ */
-- 
cgit v1.2.3


From 81d88ce55092edf1a1f928efb373f289c6b90efd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Nov 2020 16:38:40 +0100
Subject: dma-mapping: remove the {alloc,free}_noncoherent methods

It turns out allowing non-contigous allocations here was a rather bad
idea, as we'll now need to define ways to get the pages for mmaping
or dma_buf sharing.  Revert this change and stick to the original
concept.  A different API for the use case of non-contigous allocations
will be added back later.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Tested-by: Ricardo Ribalda <ribalda@chromium.org>:wq
---
 Documentation/core-api/dma-api.rst | 64 +++++++++++++-------------------------
 drivers/iommu/dma-iommu.c          | 30 ------------------
 include/linux/dma-map-ops.h        |  5 ---
 include/linux/dma-mapping.h        | 17 +++++++---
 kernel/dma/mapping.c               | 40 ------------------------
 5 files changed, 35 insertions(+), 121 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index 75cb757bbff0..e6d23f117308 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -528,16 +528,14 @@ an I/O device, you should not be using this part of the API.
 
 ::
 
-	void *
-	dma_alloc_noncoherent(struct device *dev, size_t size,
-			dma_addr_t *dma_handle, enum dma_data_direction dir,
-			gfp_t gfp)
+	struct page *
+	dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle,
+			enum dma_data_direction dir, gfp_t gfp)
 
-This routine allocates a region of <size> bytes of consistent memory.  It
-returns a pointer to the allocated region (in the processor's virtual address
-space) or NULL if the allocation failed.  The returned memory may or may not
-be in the kernel direct mapping.  Drivers must not call virt_to_page on
-the returned memory region.
+This routine allocates a region of <size> bytes of non-coherent memory.  It
+returns a pointer to first struct page for the region, or NULL if the
+allocation failed. The resulting struct page can be used for everything a
+struct page is suitable for.
 
 It also returns a <dma_handle> which may be cast to an unsigned integer the
 same width as the bus and given to the device as the DMA address base of
@@ -558,51 +556,33 @@ reused.
 ::
 
 	void
-	dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
+	dma_free_pages(struct device *dev, size_t size, struct page *page,
 			dma_addr_t dma_handle, enum dma_data_direction dir)
 
-Free a region of memory previously allocated using dma_alloc_noncoherent().
-dev, size and dma_handle and dir must all be the same as those passed into
-dma_alloc_noncoherent().  cpu_addr must be the virtual address returned by
-dma_alloc_noncoherent().
+Free a region of memory previously allocated using dma_alloc_pages().
+dev, size, dma_handle and dir must all be the same as those passed into
+dma_alloc_pages().  page must be the pointer returned by dma_alloc_pages().
 
 ::
 
-	struct page *
-	dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle,
-			enum dma_data_direction dir, gfp_t gfp)
-
-This routine allocates a region of <size> bytes of non-coherent memory.  It
-returns a pointer to first struct page for the region, or NULL if the
-allocation failed. The resulting struct page can be used for everything a
-struct page is suitable for.
-
-It also returns a <dma_handle> which may be cast to an unsigned integer the
-same width as the bus and given to the device as the DMA address base of
-the region.
-
-The dir parameter specified if data is read and/or written by the device,
-see dma_map_single() for details.
-
-The gfp parameter allows the caller to specify the ``GFP_`` flags (see
-kmalloc()) for the allocation, but rejects flags used to specify a memory
-zone such as GFP_DMA or GFP_HIGHMEM.
+	void *
+	dma_alloc_noncoherent(struct device *dev, size_t size,
+			dma_addr_t *dma_handle, enum dma_data_direction dir,
+			gfp_t gfp)
 
-Before giving the memory to the device, dma_sync_single_for_device() needs
-to be called, and before reading memory written by the device,
-dma_sync_single_for_cpu(), just like for streaming DMA mappings that are
-reused.
+This routine is a convenient wrapper around dma_alloc_pages that returns the
+kernel virtual address for the allocated memory instead of the page structure.
 
 ::
 
 	void
-	dma_free_pages(struct device *dev, size_t size, struct page *page,
+	dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
 			dma_addr_t dma_handle, enum dma_data_direction dir)
 
-Free a region of memory previously allocated using dma_alloc_pages().
-dev, size and dma_handle and dir must all be the same as those passed into
-dma_alloc_noncoherent().  page must be the pointer returned by
-dma_alloc_pages().
+Free a region of memory previously allocated using dma_alloc_noncoherent().
+dev, size, dma_handle and dir must all be the same as those passed into
+dma_alloc_noncoherent().  cpu_addr must be the virtual address returned by
+dma_alloc_noncoherent().
 
 ::
 
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 4078358ed66e..255533faf905 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1197,34 +1197,6 @@ static void *iommu_dma_alloc(struct device *dev, size_t size,
 	return cpu_addr;
 }
 
-#ifdef CONFIG_DMA_REMAP
-static void *iommu_dma_alloc_noncoherent(struct device *dev, size_t size,
-		dma_addr_t *handle, enum dma_data_direction dir, gfp_t gfp)
-{
-	if (!gfpflags_allow_blocking(gfp)) {
-		struct page *page;
-
-		page = dma_common_alloc_pages(dev, size, handle, dir, gfp);
-		if (!page)
-			return NULL;
-		return page_address(page);
-	}
-
-	return iommu_dma_alloc_remap(dev, size, handle, gfp | __GFP_ZERO,
-				     PAGE_KERNEL, 0);
-}
-
-static void iommu_dma_free_noncoherent(struct device *dev, size_t size,
-		void *cpu_addr, dma_addr_t handle, enum dma_data_direction dir)
-{
-	__iommu_dma_unmap(dev, handle, size);
-	__iommu_dma_free(dev, size, cpu_addr);
-}
-#else
-#define iommu_dma_alloc_noncoherent		NULL
-#define iommu_dma_free_noncoherent		NULL
-#endif /* CONFIG_DMA_REMAP */
-
 static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 		void *cpu_addr, dma_addr_t dma_addr, size_t size,
 		unsigned long attrs)
@@ -1295,8 +1267,6 @@ static const struct dma_map_ops iommu_dma_ops = {
 	.free			= iommu_dma_free,
 	.alloc_pages		= dma_common_alloc_pages,
 	.free_pages		= dma_common_free_pages,
-	.alloc_noncoherent	= iommu_dma_alloc_noncoherent,
-	.free_noncoherent	= iommu_dma_free_noncoherent,
 	.mmap			= iommu_dma_mmap,
 	.get_sgtable		= iommu_dma_get_sgtable,
 	.map_page		= iommu_dma_map_page,
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 70fcd0f610ea..11e02537b9e0 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -22,11 +22,6 @@ struct dma_map_ops {
 			gfp_t gfp);
 	void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
 			dma_addr_t dma_handle, enum dma_data_direction dir);
-	void *(*alloc_noncoherent)(struct device *dev, size_t size,
-			dma_addr_t *dma_handle, enum dma_data_direction dir,
-			gfp_t gfp);
-	void (*free_noncoherent)(struct device *dev, size_t size, void *vaddr,
-			dma_addr_t dma_handle, enum dma_data_direction dir);
 	int (*mmap)(struct device *, struct vm_area_struct *,
 			void *, dma_addr_t, size_t, unsigned long attrs);
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2e49996a8f39..fbfa3f5abd94 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -263,10 +263,19 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
 void dma_free_pages(struct device *dev, size_t size, struct page *page,
 		dma_addr_t dma_handle, enum dma_data_direction dir);
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
-		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
-void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
-		dma_addr_t dma_handle, enum dma_data_direction dir);
+
+static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+	struct page *page = dma_alloc_pages(dev, size, dma_handle, dir, gfp);
+	return page ? page_address(page) : NULL;
+}
+
+static inline void dma_free_noncoherent(struct device *dev, size_t size,
+		void *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+	dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
+}
 
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f87a89d08654..68992e35c8c3 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -515,46 +515,6 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page,
 }
 EXPORT_SYMBOL_GPL(dma_free_pages);
 
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
-		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-	void *vaddr;
-
-	if (!ops || !ops->alloc_noncoherent) {
-		struct page *page;
-
-		page = dma_alloc_pages(dev, size, dma_handle, dir, gfp);
-		if (!page)
-			return NULL;
-		return page_address(page);
-	}
-
-	size = PAGE_ALIGN(size);
-	vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp);
-	if (vaddr)
-		debug_dma_map_page(dev, virt_to_page(vaddr), 0, size, dir,
-				   *dma_handle);
-	return vaddr;
-}
-EXPORT_SYMBOL_GPL(dma_alloc_noncoherent);
-
-void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
-		dma_addr_t dma_handle, enum dma_data_direction dir)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (!ops || !ops->free_noncoherent) {
-		dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
-		return;
-	}
-
-	size = PAGE_ALIGN(size);
-	debug_dma_unmap_page(dev, dma_handle, size, dir);
-	ops->free_noncoherent(dev, size, vaddr, dma_handle, dir);
-}
-EXPORT_SYMBOL_GPL(dma_free_noncoherent);
-
 int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-- 
cgit v1.2.3


From 1746fd4416ed5510fe9fdd6a93e49a436187b680 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 8 Feb 2021 15:09:23 -0500
Subject: tracepoints: Remove unnecessary "data_args" macro parameter

While working on a clean up that would restructure the difference between
architectures that have static calls vs those that do not, I was stumbling
over the "data_args" parameter that includes "__data" in the arguments. The
issue was that one version didn't even need it, while the other one did.
Instead of injecting a "__data = NULL;" into the macro for the unneeded
version, just remove it completely.

The original idea behind data_args is that there may be a case of a
tracepoint with no arguments. But this is considered bad practice, and all
tracepoints should pass something to that location (that's what tracepoints
were created for).

Link: https://lkml.kernel.org/r/20210208201050.768074128@goodmis.org

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 966ed8980327..42bb5b753b33 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -160,13 +160,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 /*
  * it_func[0] is never NULL because there is at least one element in the array
  * when the array itself is non NULL.
- *
- * Note, the proto and args passed in includes "__data" as the first parameter.
- * The reason for this is to handle the "void" prototype. If a tracepoint
- * has a "void" prototype, then it is invalid to declare a function
- * as "(void *, void)".
  */
-#define __DO_TRACE(name, proto, args, cond, rcuidle)			\
+#define __DO_TRACE(name, args, cond, rcuidle)				\
 	do {								\
 		struct tracepoint_func *it_func_ptr;			\
 		int __maybe_unused __idx = 0;				\
@@ -194,7 +189,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			rcu_dereference_raw((&__tracepoint_##name)->funcs); \
 		if (it_func_ptr) {					\
 			__data = (it_func_ptr)->data;			\
-			__DO_TRACE_CALL(name)(args);			\
+			__DO_TRACE_CALL(name)(__data, args);		\
 		}							\
 									\
 		if (rcuidle) {						\
@@ -206,17 +201,16 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	} while (0)
 
 #ifndef MODULE
-#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \
+#define __DECLARE_TRACE_RCU(name, proto, args, cond)			\
 	static inline void trace_##name##_rcuidle(proto)		\
 	{								\
 		if (static_key_false(&__tracepoint_##name.key))		\
 			__DO_TRACE(name,				\
-				TP_PROTO(data_proto),			\
-				TP_ARGS(data_args),			\
+				TP_ARGS(args),				\
 				TP_CONDITION(cond), 1);			\
 	}
 #else
-#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args)
+#define __DECLARE_TRACE_RCU(name, proto, args, cond)
 #endif
 
 /*
@@ -231,7 +225,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  * even when this tracepoint is off. This code has no purpose other than
  * poking RCU a bit.
  */
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
 	extern int __traceiter_##name(data_proto);			\
 	DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name);	\
 	extern struct tracepoint __tracepoint_##name;			\
@@ -239,8 +233,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	{								\
 		if (static_key_false(&__tracepoint_##name.key))		\
 			__DO_TRACE(name,				\
-				TP_PROTO(data_proto),			\
-				TP_ARGS(data_args),			\
+				TP_ARGS(args),				\
 				TP_CONDITION(cond), 0);			\
 		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
 			rcu_read_lock_sched_notrace();			\
@@ -249,7 +242,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		}							\
 	}								\
 	__DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args),		\
-		PARAMS(cond), PARAMS(data_proto), PARAMS(data_args))	\
+			    PARAMS(cond))				\
 	static inline int						\
 	register_trace_##name(void (*probe)(data_proto), void *data)	\
 	{								\
@@ -332,7 +325,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 
 
 #else /* !TRACEPOINTS_ENABLED */
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto)		\
 	static inline void trace_##name(proto)				\
 	{ }								\
 	static inline void trace_##name##_rcuidle(proto)		\
@@ -412,14 +405,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #define DECLARE_TRACE(name, proto, args)				\
 	__DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),		\
 			cpu_online(raw_smp_processor_id()),		\
-			PARAMS(void *__data, proto),			\
-			PARAMS(__data, args))
+			PARAMS(void *__data, proto))
 
 #define DECLARE_TRACE_CONDITION(name, proto, args, cond)		\
 	__DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),		\
 			cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \
-			PARAMS(void *__data, proto),			\
-			PARAMS(__data, args))
+			PARAMS(void *__data, proto))
 
 #define TRACE_EVENT_FLAGS(event, flag)
 
-- 
cgit v1.2.3


From d9a1be1be331fc857d3fe29f86c3a305950b35a9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 8 Feb 2021 15:09:24 -0500
Subject: tracepoints: Do not punish non static call users

With static calls, a tracepoint can call the callback directly if there is
only one callback registered to that tracepoint. When there is more than
one, the static call will call the tracepoint's "iterator" function, which
needs to reload the tracepoint's "funcs" array again, as it could have
changed since the first time it was loaded.

But an arch without static calls is punished by having to load the
tracepoint's "funcs" array twice. Once in the DO_TRACE macro, and once
again in the iterator macro.

For archs without static calls, there's no reason to load the array macro
in the first place, since the iterator function will do it anyway.

Change the __DO_TRACE_CALL() macro to do the load and call of the
tracepoints funcs array only for architectures with static calls, and just
call the iterator function directly for architectures without static calls.

Link: https://lkml.kernel.org/r/20210208201050.909329787@goodmis.org

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 42bb5b753b33..2aad1c10821a 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -152,9 +152,19 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 #ifdef TRACEPOINTS_ENABLED
 
 #ifdef CONFIG_HAVE_STATIC_CALL
-#define __DO_TRACE_CALL(name)	static_call(tp_func_##name)
+#define __DO_TRACE_CALL(name, args)					\
+	do {								\
+		struct tracepoint_func *it_func_ptr;			\
+		void *__data;						\
+		it_func_ptr =						\
+			rcu_dereference_raw((&__tracepoint_##name)->funcs); \
+		if (it_func_ptr) {					\
+			__data = (it_func_ptr)->data;			\
+			static_call(tp_func_##name)(__data, args);	\
+		}							\
+	} while (0)
 #else
-#define __DO_TRACE_CALL(name)	__traceiter_##name
+#define __DO_TRACE_CALL(name, args)	__traceiter_##name(NULL, args)
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
 /*
@@ -163,9 +173,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  */
 #define __DO_TRACE(name, args, cond, rcuidle)				\
 	do {								\
-		struct tracepoint_func *it_func_ptr;			\
 		int __maybe_unused __idx = 0;				\
-		void *__data;						\
 									\
 		if (!(cond))						\
 			return;						\
@@ -185,12 +193,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			rcu_irq_enter_irqson();				\
 		}							\
 									\
-		it_func_ptr =						\
-			rcu_dereference_raw((&__tracepoint_##name)->funcs); \
-		if (it_func_ptr) {					\
-			__data = (it_func_ptr)->data;			\
-			__DO_TRACE_CALL(name)(__data, args);		\
-		}							\
+		__DO_TRACE_CALL(name, TP_ARGS(args));			\
 									\
 		if (rcuidle) {						\
 			rcu_irq_exit_irqson();				\
-- 
cgit v1.2.3


From 7211f0a25781ace5f79b272318b4c60b5dcfd413 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 4 Feb 2021 14:30:04 -0500
Subject: tracepoints: Code clean up

Restructure the code a bit to make it simpler, fix some formatting problems
and add READ_ONCE/WRITE_ONCE to make sure there's no compiler load/store
tearing to the variables that can be accessed across CPUs.

Started with Mathieu Desnoyers's patch:

  Link: https://lore.kernel.org/lkml/20210203175741.20665-1-mathieu.desnoyers@efficios.com/

And will keep his signature, but I will take the responsibility of this
being correct, and keep the authorship.

Link: https://lkml.kernel.org/r/20210204143004.61126582@gandalf.local.home

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h |  2 +-
 kernel/tracepoint.c        | 91 ++++++++++++++++++----------------------------
 2 files changed, 36 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 2aad1c10821a..9cfb099da58f 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -305,7 +305,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			rcu_dereference_raw((&__tracepoint_##_name)->funcs); \
 		if (it_func_ptr) {					\
 			do {						\
-				it_func = (it_func_ptr)->func;		\
+				it_func = READ_ONCE((it_func_ptr)->func); \
 				__data = (it_func_ptr)->data;		\
 				((void(*)(void *, proto))(it_func))(__data, args); \
 			} while ((++it_func_ptr)->func);		\
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index e8f20ae29c18..9f478d29b926 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -136,9 +136,9 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
 	 int prio)
 {
 	struct tracepoint_func *old, *new;
-	int nr_probes = 0;
-	int stub_funcs = 0;
-	int pos = -1;
+	int iter_probes;	/* Iterate over old probe array. */
+	int nr_probes = 0;	/* Counter for probes */
+	int pos = -1;		/* Insertion position into new array */
 
 	if (WARN_ON(!tp_func->func))
 		return ERR_PTR(-EINVAL);
@@ -147,54 +147,38 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
 	old = *funcs;
 	if (old) {
 		/* (N -> N+1), (N != 0, 1) probes */
-		for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
-			/* Insert before probes of lower priority */
-			if (pos < 0 && old[nr_probes].prio < prio)
-				pos = nr_probes;
-			if (old[nr_probes].func == tp_func->func &&
-			    old[nr_probes].data == tp_func->data)
+		for (iter_probes = 0; old[iter_probes].func; iter_probes++) {
+			if (old[iter_probes].func == tp_stub_func)
+				continue;	/* Skip stub functions. */
+			if (old[iter_probes].func == tp_func->func &&
+			    old[iter_probes].data == tp_func->data)
 				return ERR_PTR(-EEXIST);
-			if (old[nr_probes].func == tp_stub_func)
-				stub_funcs++;
+			nr_probes++;
 		}
 	}
-	/* + 2 : one for new probe, one for NULL func - stub functions */
-	new = allocate_probes(nr_probes + 2 - stub_funcs);
+	/* + 2 : one for new probe, one for NULL func */
+	new = allocate_probes(nr_probes + 2);
 	if (new == NULL)
 		return ERR_PTR(-ENOMEM);
 	if (old) {
-		if (stub_funcs) {
-			/* Need to copy one at a time to remove stubs */
-			int probes = 0;
-
-			pos = -1;
-			for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
-				if (old[nr_probes].func == tp_stub_func)
-					continue;
-				if (pos < 0 && old[nr_probes].prio < prio)
-					pos = probes++;
-				new[probes++] = old[nr_probes];
-			}
-			nr_probes = probes;
-			if (pos < 0)
-				pos = probes;
-			else
-				nr_probes--; /* Account for insertion */
-
-		} else if (pos < 0) {
-			pos = nr_probes;
-			memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
-		} else {
-			/* Copy higher priority probes ahead of the new probe */
-			memcpy(new, old, pos * sizeof(struct tracepoint_func));
-			/* Copy the rest after it. */
-			memcpy(new + pos + 1, old + pos,
-			       (nr_probes - pos) * sizeof(struct tracepoint_func));
+		nr_probes = 0;
+		for (iter_probes = 0; old[iter_probes].func; iter_probes++) {
+			if (old[iter_probes].func == tp_stub_func)
+				continue;
+			/* Insert before probes of lower priority */
+			if (pos < 0 && old[iter_probes].prio < prio)
+				pos = nr_probes++;
+			new[nr_probes++] = old[iter_probes];
 		}
-	} else
+		if (pos < 0)
+			pos = nr_probes++;
+		/* nr_probes now points to the end of the new array */
+	} else {
 		pos = 0;
+		nr_probes = 1; /* must point at end of array */
+	}
 	new[pos] = *tp_func;
-	new[nr_probes + 1].func = NULL;
+	new[nr_probes].func = NULL;
 	*funcs = new;
 	debug_print_probes(*funcs);
 	return old;
@@ -237,11 +221,12 @@ static void *func_remove(struct tracepoint_func **funcs,
 		/* + 1 for NULL */
 		new = allocate_probes(nr_probes - nr_del + 1);
 		if (new) {
-			for (i = 0; old[i].func; i++)
-				if ((old[i].func != tp_func->func
-				     || old[i].data != tp_func->data)
-				    && old[i].func != tp_stub_func)
+			for (i = 0; old[i].func; i++) {
+				if ((old[i].func != tp_func->func ||
+				     old[i].data != tp_func->data) &&
+				    old[i].func != tp_stub_func)
 					new[j++] = old[i];
+			}
 			new[nr_probes - nr_del].func = NULL;
 			*funcs = new;
 		} else {
@@ -249,17 +234,11 @@ static void *func_remove(struct tracepoint_func **funcs,
 			 * Failed to allocate, replace the old function
 			 * with calls to tp_stub_func.
 			 */
-			for (i = 0; old[i].func; i++)
+			for (i = 0; old[i].func; i++) {
 				if (old[i].func == tp_func->func &&
-				    old[i].data == tp_func->data) {
-					old[i].func = tp_stub_func;
-					/* Set the prio to the next event. */
-					if (old[i + 1].func)
-						old[i].prio =
-							old[i + 1].prio;
-					else
-						old[i].prio = -1;
-				}
+				    old[i].data == tp_func->data)
+					WRITE_ONCE(old[i].func, tp_stub_func);
+			}
 			*funcs = old;
 		}
 	}
-- 
cgit v1.2.3


From a1320e0c2fc834264fa2125ef5c40670dbb5b736 Mon Sep 17 00:00:00 2001
From: Jinyang He <hejinyang@loongson.cn>
Date: Thu, 4 Feb 2021 11:34:31 +0800
Subject: ftrace: Remove unused ftrace_force_update()

ftrace_force_update() is committed by Commit e1c08bdd9fa7 ("ftrace: force
recording") and removed by Commit cb7be3b2fc2c ("ftrace: remove daemon").
Remove it in header file.

Link: https://lkml.kernel.org/r/1612409671-8249-1-git-send-email-hejinyang@loongson.cn

Signed-off-by: Jinyang He <hejinyang@loongson.cn>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 9a8ce28e4485..86e5028bfa20 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -485,7 +485,6 @@ struct dyn_ftrace {
 	struct dyn_arch_ftrace	arch;
 };
 
-int ftrace_force_update(void);
 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
 			 int remove, int reset);
 int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
@@ -740,7 +739,6 @@ extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
 #else /* CONFIG_DYNAMIC_FTRACE */
 static inline int skip_trace(unsigned long ip) { return 0; }
-static inline int ftrace_force_update(void) { return 0; }
 static inline void ftrace_disable_daemon(void) { }
 static inline void ftrace_enable_daemon(void) { }
 static inline void ftrace_module_init(struct module *mod) { }
-- 
cgit v1.2.3


From 8f1fc1c15329a9d53bde5636e85ca98ece2ec7bd Mon Sep 17 00:00:00 2001
From: Martin Hundebøll <mhu@silicom.dk>
Date: Mon, 8 Feb 2021 16:01:57 +0100
Subject: PCI: Add Silicom Denmark vendor ID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update pci_ids.h with the vendor ID for Silicom Denmark. The define is
going to be referenced in driver(s) for FPGA accelerated smart NICs.

Link: https://lore.kernel.org/r/20210208150158.2877414-1-mhu@silicom.dk
Signed-off-by: Martin Hundebøll <mhu@silicom.dk>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Tom Rix <trix@redhat.com>
Reviewed-by: Krzysztof Wilczyński <kw@linux.com>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d8156a5dbee8..aae07d512ca6 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2588,6 +2588,8 @@
 
 #define PCI_VENDOR_ID_REDHAT		0x1b36
 
+#define PCI_VENDOR_ID_SILICOM_DENMARK	0x1c2c
+
 #define PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS	0x1c36
 
 #define PCI_VENDOR_ID_CIRCUITCO		0x1cc8
-- 
cgit v1.2.3


From 29863d41bb6e1d969c62fdb15b0961806942960e Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Mon, 8 Feb 2021 11:34:09 -0800
Subject: net: implement threaded-able napi poll loop support

This patch allows running each napi poll loop inside its own
kernel thread.
The kthread is created during netif_napi_add() if dev->threaded
is set. And threaded mode is enabled in napi_enable(). We will
provide a way to set dev->threaded and enable threaded mode
without a device up/down in the following patch.

Once that threaded mode is enabled and the kthread is
started, napi_schedule() will wake-up such thread instead
of scheduling the softirq.

The threaded poll loop behaves quite likely the net_rx_action,
but it does not have to manipulate local irqs and uses
an explicit scheduling point based on netdev_budget.

Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  21 +++------
 net/core/dev.c            | 112 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e9e7ada07ea1..99fb4ec9573e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	unsigned int		napi_id;
+	struct task_struct	*thread;
 };
 
 enum {
@@ -358,6 +359,7 @@ enum {
 	NAPI_STATE_NO_BUSY_POLL,	/* Do not add in napi_hash, no busy polling */
 	NAPI_STATE_IN_BUSY_POLL,	/* sk_busy_loop() owns this NAPI */
 	NAPI_STATE_PREFER_BUSY_POLL,	/* prefer busy-polling over softirq processing*/
+	NAPI_STATE_THREADED,		/* The poll is performed inside its own thread*/
 };
 
 enum {
@@ -369,6 +371,7 @@ enum {
 	NAPIF_STATE_NO_BUSY_POLL	= BIT(NAPI_STATE_NO_BUSY_POLL),
 	NAPIF_STATE_IN_BUSY_POLL	= BIT(NAPI_STATE_IN_BUSY_POLL),
 	NAPIF_STATE_PREFER_BUSY_POLL	= BIT(NAPI_STATE_PREFER_BUSY_POLL),
+	NAPIF_STATE_THREADED		= BIT(NAPI_STATE_THREADED),
 };
 
 enum gro_result {
@@ -503,20 +506,7 @@ static inline bool napi_complete(struct napi_struct *n)
  */
 void napi_disable(struct napi_struct *n);
 
-/**
- *	napi_enable - enable NAPI scheduling
- *	@n: NAPI context
- *
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
- */
-static inline void napi_enable(struct napi_struct *n)
-{
-	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-	smp_mb__before_atomic();
-	clear_bit(NAPI_STATE_SCHED, &n->state);
-	clear_bit(NAPI_STATE_NPSVC, &n->state);
-}
+void napi_enable(struct napi_struct *n);
 
 /**
  *	napi_synchronize - wait until NAPI is not running
@@ -1827,6 +1817,8 @@ enum netdev_priv_flags {
  *
  *	@wol_enabled:	Wake-on-LAN is enabled
  *
+ *	@threaded:	napi threaded mode is enabled
+ *
  *	@net_notifier_list:	List of per-net netdev notifier block
  *				that follow this device when it is moved
  *				to another network namespace.
@@ -2145,6 +2137,7 @@ struct net_device {
 	struct lock_class_key	*qdisc_running_key;
 	bool			proto_down;
 	unsigned		wol_enabled:1;
+	unsigned		threaded:1;
 
 	struct list_head	net_notifier_list;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 59751a22d7c3..1e35f4f44f3b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -91,6 +91,7 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/skbuff.h>
+#include <linux/kthread.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
 #include <net/net_namespace.h>
@@ -1494,6 +1495,27 @@ void netdev_notify_peers(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_notify_peers);
 
+static int napi_threaded_poll(void *data);
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+	int err = 0;
+
+	/* Create and wake up the kthread once to put it in
+	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
+	 * warning and work with loadavg.
+	 */
+	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+				n->dev->name, n->napi_id);
+	if (IS_ERR(n->thread)) {
+		err = PTR_ERR(n->thread);
+		pr_err("kthread_run failed with err %d\n", err);
+		n->thread = NULL;
+	}
+
+	return err;
+}
+
 static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
@@ -4265,6 +4287,21 @@ int gro_normal_batch __read_mostly = 8;
 static inline void ____napi_schedule(struct softnet_data *sd,
 				     struct napi_struct *napi)
 {
+	struct task_struct *thread;
+
+	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+		/* Paired with smp_mb__before_atomic() in
+		 * napi_enable(). Use READ_ONCE() to guarantee
+		 * a complete read on napi->thread. Only call
+		 * wake_up_process() when it's not NULL.
+		 */
+		thread = READ_ONCE(napi->thread);
+		if (thread) {
+			wake_up_process(thread);
+			return;
+		}
+	}
+
 	list_add_tail(&napi->poll_list, &sd->poll_list);
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
@@ -6728,6 +6765,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
 	napi_hash_add(napi);
+	/* Create kthread for this napi if dev->threaded is set.
+	 * Clear dev->threaded if kthread creation failed so that
+	 * threaded mode will not be enabled in napi_enable().
+	 */
+	if (dev->threaded && napi_kthread_create(napi))
+		dev->threaded = 0;
 }
 EXPORT_SYMBOL(netif_napi_add);
 
@@ -6745,9 +6788,28 @@ void napi_disable(struct napi_struct *n)
 
 	clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
 	clear_bit(NAPI_STATE_DISABLE, &n->state);
+	clear_bit(NAPI_STATE_THREADED, &n->state);
 }
 EXPORT_SYMBOL(napi_disable);
 
+/**
+ *	napi_enable - enable NAPI scheduling
+ *	@n: NAPI context
+ *
+ * Resume NAPI from being scheduled on this context.
+ * Must be paired with napi_disable.
+ */
+void napi_enable(struct napi_struct *n)
+{
+	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+	smp_mb__before_atomic();
+	clear_bit(NAPI_STATE_SCHED, &n->state);
+	clear_bit(NAPI_STATE_NPSVC, &n->state);
+	if (n->dev->threaded && n->thread)
+		set_bit(NAPI_STATE_THREADED, &n->state);
+}
+EXPORT_SYMBOL(napi_enable);
+
 static void flush_gro_hash(struct napi_struct *napi)
 {
 	int i;
@@ -6773,6 +6835,11 @@ void __netif_napi_del(struct napi_struct *napi)
 
 	flush_gro_hash(napi);
 	napi->gro_bitmask = 0;
+
+	if (napi->thread) {
+		kthread_stop(napi->thread);
+		napi->thread = NULL;
+	}
 }
 EXPORT_SYMBOL(__netif_napi_del);
 
@@ -6867,6 +6934,51 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 	return work;
 }
 
+static int napi_thread_wait(struct napi_struct *napi)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop() && !napi_disable_pending(napi)) {
+		if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+			WARN_ON(!list_empty(&napi->poll_list));
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
+
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return -1;
+}
+
+static int napi_threaded_poll(void *data)
+{
+	struct napi_struct *napi = data;
+	void *have;
+
+	while (!napi_thread_wait(napi)) {
+		for (;;) {
+			bool repoll = false;
+
+			local_bh_disable();
+
+			have = netpoll_poll_lock(napi);
+			__napi_poll(napi, &repoll);
+			netpoll_poll_unlock(have);
+
+			__kfree_skb_flush();
+			local_bh_enable();
+
+			if (!repoll)
+				break;
+
+			cond_resched();
+		}
+	}
+	return 0;
+}
+
 static __latent_entropy void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
-- 
cgit v1.2.3


From 5fdd2f0e5c64846bf3066689b73fc3b8dddd1c74 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Mon, 8 Feb 2021 11:34:10 -0800
Subject: net: add sysfs attribute to control napi threaded mode

This patch adds a new sysfs attribute to the network device class.
Said attribute provides a per-device control to enable/disable the
threaded mode for all the napi instances of the given network device,
without the need for a device up/down.
User sets it to 1 or 0 to enable or disable threaded mode.
Note: when switching between threaded and the current softirq based mode
for a napi instance, it will not immediately take effect if the napi is
currently being polled. The mode switch will happen for the next time
napi_schedule() is called.

Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/ABI/testing/sysfs-class-net | 15 ++++++++++
 include/linux/netdevice.h                 |  2 ++
 net/core/dev.c                            | 48 +++++++++++++++++++++++++++++--
 net/core/net-sysfs.c                      | 40 ++++++++++++++++++++++++++
 4 files changed, 103 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
index 1f2002df5ba2..1419103d11f9 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -337,3 +337,18 @@ Contact:	netdev@vger.kernel.org
 Description:
 		32-bit unsigned integer counting the number of times the link has
 		been down
+
+What:		/sys/class/net/<iface>/threaded
+Date:		Jan 2021
+KernelVersion:	5.12
+Contact:	netdev@vger.kernel.org
+Description:
+		Boolean value to control the threaded mode per device. User could
+		set this value to enable/disable threaded mode for all napi
+		belonging to this device, without the need to do device up/down.
+
+		Possible values:
+		== ==================================
+		0  threaded mode disabled for this dev
+		1  threaded mode enabled for this dev
+		== ==================================
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 99fb4ec9573e..1340327f7abf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)
 	return napi_complete_done(n, 0);
 }
 
+int dev_set_threaded(struct net_device *dev, bool threaded);
+
 /**
  *	napi_disable - prevent NAPI from scheduling
  *	@n: NAPI context
diff --git a/net/core/dev.c b/net/core/dev.c
index 1e35f4f44f3b..7647278e46f0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4291,8 +4291,9 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 
 	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
 		/* Paired with smp_mb__before_atomic() in
-		 * napi_enable(). Use READ_ONCE() to guarantee
-		 * a complete read on napi->thread. Only call
+		 * napi_enable()/dev_set_threaded().
+		 * Use READ_ONCE() to guarantee a complete
+		 * read on napi->thread. Only call
 		 * wake_up_process() when it's not NULL.
 		 */
 		thread = READ_ONCE(napi->thread);
@@ -6738,6 +6739,49 @@ static void init_gro_hash(struct napi_struct *napi)
 	napi->gro_bitmask = 0;
 }
 
+int dev_set_threaded(struct net_device *dev, bool threaded)
+{
+	struct napi_struct *napi;
+	int err = 0;
+
+	if (dev->threaded == threaded)
+		return 0;
+
+	if (threaded) {
+		list_for_each_entry(napi, &dev->napi_list, dev_list) {
+			if (!napi->thread) {
+				err = napi_kthread_create(napi);
+				if (err) {
+					threaded = false;
+					break;
+				}
+			}
+		}
+	}
+
+	dev->threaded = threaded;
+
+	/* Make sure kthread is created before THREADED bit
+	 * is set.
+	 */
+	smp_mb__before_atomic();
+
+	/* Setting/unsetting threaded mode on a napi might not immediately
+	 * take effect, if the current napi instance is actively being
+	 * polled. In this case, the switch between threaded mode and
+	 * softirq mode will happen in the next round of napi_schedule().
+	 * This should not cause hiccups/stalls to the live traffic.
+	 */
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+		if (threaded)
+			set_bit(NAPI_STATE_THREADED, &napi->state);
+		else
+			clear_bit(NAPI_STATE_THREADED, &napi->state);
+	}
+
+	return err;
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 91afb0b6de69..307628fdf380 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -538,6 +538,45 @@ static ssize_t phys_switch_id_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(phys_switch_id);
 
+static ssize_t threaded_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	ssize_t ret = -EINVAL;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (dev_isalive(netdev))
+		ret = sprintf(buf, fmt_dec, netdev->threaded);
+
+	rtnl_unlock();
+	return ret;
+}
+
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
+{
+	int ret;
+
+	if (list_empty(&dev->napi_list))
+		return -EOPNOTSUPP;
+
+	if (val != 0 && val != 1)
+		return -EOPNOTSUPP;
+
+	ret = dev_set_threaded(dev, val);
+
+	return ret;
+}
+
+static ssize_t threaded_store(struct device *dev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+}
+static DEVICE_ATTR_RW(threaded);
+
 static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_netdev_group.attr,
 	&dev_attr_type.attr,
@@ -570,6 +609,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_proto_down.attr,
 	&dev_attr_carrier_up_count.attr,
 	&dev_attr_carrier_down_count.attr,
+	&dev_attr_threaded.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(net_class);
-- 
cgit v1.2.3


From 0f319d49a4167e402b01b2b56639386f0b6846ba Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 10 Feb 2021 09:52:47 +0100
Subject: locking/mutex: Kill mutex_trylock_recursive()

There are not users of mutex_trylock_recursive() in tree as of
v5.11-rc7.

Remove it.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210210085248.219210-2-bigeasy@linutronix.de
---
 include/linux/mutex.h  | 25 -------------------------
 kernel/locking/mutex.c | 10 ----------
 2 files changed, 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index dcd185cbfe79..0cd631a19727 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -199,29 +199,4 @@ extern void mutex_unlock(struct mutex *lock);
 
 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
-/*
- * These values are chosen such that FAIL and SUCCESS match the
- * values of the regular mutex_trylock().
- */
-enum mutex_trylock_recursive_enum {
-	MUTEX_TRYLOCK_FAILED    = 0,
-	MUTEX_TRYLOCK_SUCCESS   = 1,
-	MUTEX_TRYLOCK_RECURSIVE,
-};
-
-/**
- * mutex_trylock_recursive - trylock variant that allows recursive locking
- * @lock: mutex to be locked
- *
- * This function should not be used, _ever_. It is purely for hysterical GEM
- * raisins, and once those are gone this will be removed.
- *
- * Returns:
- *  - MUTEX_TRYLOCK_FAILED    - trylock failed,
- *  - MUTEX_TRYLOCK_SUCCESS   - lock acquired,
- *  - MUTEX_TRYLOCK_RECURSIVE - we already owned the lock.
- */
-extern /* __deprecated */ __must_check enum mutex_trylock_recursive_enum
-mutex_trylock_recursive(struct mutex *lock);
-
 #endif /* __LINUX_MUTEX_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 5352ce50a97e..adb935090768 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -86,16 +86,6 @@ bool mutex_is_locked(struct mutex *lock)
 }
 EXPORT_SYMBOL(mutex_is_locked);
 
-__must_check enum mutex_trylock_recursive_enum
-mutex_trylock_recursive(struct mutex *lock)
-{
-	if (unlikely(__mutex_owner(lock) == current))
-		return MUTEX_TRYLOCK_RECURSIVE;
-
-	return mutex_trylock(lock);
-}
-EXPORT_SYMBOL(mutex_trylock_recursive);
-
 static inline unsigned long __owner_flags(unsigned long owner)
 {
 	return owner & MUTEX_FLAGS;
-- 
cgit v1.2.3


From 7cbf1722d5fc5779946ee8f338e9e38b5de15856 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 10 Feb 2021 00:03:20 +0000
Subject: io_uring: provide FIFO ordering for task_work

task_work is a LIFO list, due to how it's implemented as a lockless
list. For long chains of task_work, this can be problematic as the
first entry added is the last one processed. Similarly, we'd waste
a lot of CPU cycles reversing this list.

Wrap the task_work so we have a single task_work entry per task per
ctx, and use that to run it in the right order.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.h               |   9 -----
 fs/io_uring.c            | 101 +++++++++++++++++++++++++++++++++++++++++++----
 include/linux/io_uring.h |  14 +++++++
 3 files changed, 108 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/io-wq.h b/fs/io-wq.h
index e37a0f217cc8..096f1021018e 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -27,15 +27,6 @@ enum io_wq_cancel {
 	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
 };
 
-struct io_wq_work_node {
-	struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
-	struct io_wq_work_node *first;
-	struct io_wq_work_node *last;
-};
-
 static inline void wq_list_add_after(struct io_wq_work_node *node,
 				     struct io_wq_work_node *pos,
 				     struct io_wq_work_list *list)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 87a4b727fe1c..bfc8fcd93504 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -721,6 +721,11 @@ struct async_poll {
 	struct io_poll_iocb	*double_poll;
 };
 
+struct io_task_work {
+	struct io_wq_work_node	node;
+	task_work_func_t	func;
+};
+
 /*
  * NOTE! Each of the iocb union members has the file pointer
  * as the first entry in their struct definition. So you can
@@ -779,7 +784,10 @@ struct io_kiocb {
 	 * 2. to track reqs with ->files (see io_op_def::file_table)
 	 */
 	struct list_head		inflight_entry;
-	struct callback_head		task_work;
+	union {
+		struct io_task_work	io_task_work;
+		struct callback_head	task_work;
+	};
 	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 	struct hlist_node		hash_node;
 	struct async_poll		*apoll;
@@ -2129,6 +2137,81 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 	return __io_req_find_next(req);
 }
 
+static bool __tctx_task_work(struct io_uring_task *tctx)
+{
+	struct io_wq_work_list list;
+	struct io_wq_work_node *node;
+
+	if (wq_list_empty(&tctx->task_list))
+		return false;
+
+	spin_lock(&tctx->task_lock);
+	list = tctx->task_list;
+	INIT_WQ_LIST(&tctx->task_list);
+	spin_unlock(&tctx->task_lock);
+
+	node = list.first;
+	while (node) {
+		struct io_wq_work_node *next = node->next;
+		struct io_kiocb *req;
+
+		req = container_of(node, struct io_kiocb, io_task_work.node);
+		req->task_work.func(&req->task_work);
+		node = next;
+	}
+
+	return list.first != NULL;
+}
+
+static void tctx_task_work(struct callback_head *cb)
+{
+	struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
+
+	while (__tctx_task_work(tctx))
+		cond_resched();
+
+	clear_bit(0, &tctx->task_state);
+}
+
+static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
+			    enum task_work_notify_mode notify)
+{
+	struct io_uring_task *tctx = tsk->io_uring;
+	struct io_wq_work_node *node, *prev;
+	int ret;
+
+	WARN_ON_ONCE(!tctx);
+
+	spin_lock(&tctx->task_lock);
+	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+	spin_unlock(&tctx->task_lock);
+
+	/* task_work already pending, we're done */
+	if (test_bit(0, &tctx->task_state) ||
+	    test_and_set_bit(0, &tctx->task_state))
+		return 0;
+
+	if (!task_work_add(tsk, &tctx->task_work, notify))
+		return 0;
+
+	/*
+	 * Slow path - we failed, find and delete work. if the work is not
+	 * in the list, it got run and we're fine.
+	 */
+	ret = 0;
+	spin_lock(&tctx->task_lock);
+	wq_list_for_each(node, prev, &tctx->task_list) {
+		if (&req->io_task_work.node == node) {
+			wq_list_del(&tctx->task_list, node, prev);
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&tctx->task_lock);
+	clear_bit(0, &tctx->task_state);
+	return ret;
+}
+
 static int io_req_task_work_add(struct io_kiocb *req)
 {
 	struct task_struct *tsk = req->task;
@@ -2149,7 +2232,7 @@ static int io_req_task_work_add(struct io_kiocb *req)
 	if (!(ctx->flags & IORING_SETUP_SQPOLL))
 		notify = TWA_SIGNAL;
 
-	ret = task_work_add(tsk, &req->task_work, notify);
+	ret = io_task_work_add(tsk, req, notify);
 	if (!ret)
 		wake_up_process(tsk);
 
@@ -2157,7 +2240,7 @@ static int io_req_task_work_add(struct io_kiocb *req)
 }
 
 static void io_req_task_work_add_fallback(struct io_kiocb *req,
-					  void (*cb)(struct callback_head *))
+					  task_work_func_t cb)
 {
 	struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
 
@@ -2216,7 +2299,7 @@ static void io_req_task_queue(struct io_kiocb *req)
 {
 	int ret;
 
-	init_task_work(&req->task_work, io_req_task_submit);
+	req->task_work.func = io_req_task_submit;
 	percpu_ref_get(&req->ctx->refs);
 
 	ret = io_req_task_work_add(req);
@@ -2347,7 +2430,7 @@ static void io_free_req_deferred(struct io_kiocb *req)
 {
 	int ret;
 
-	init_task_work(&req->task_work, io_put_req_deferred_cb);
+	req->task_work.func = io_put_req_deferred_cb;
 	ret = io_req_task_work_add(req);
 	if (unlikely(ret))
 		io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
@@ -3392,7 +3475,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
 	list_del_init(&wait->entry);
 
-	init_task_work(&req->task_work, io_req_task_submit);
+	req->task_work.func = io_req_task_submit;
 	percpu_ref_get(&req->ctx->refs);
 
 	/* submit ref gets dropped, acquire a new one */
@@ -5083,7 +5166,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	list_del_init(&poll->wait.entry);
 
 	req->result = mask;
-	init_task_work(&req->task_work, func);
+	req->task_work.func = func;
 	percpu_ref_get(&req->ctx->refs);
 
 	/*
@@ -8086,6 +8169,10 @@ static int io_uring_alloc_task_context(struct task_struct *task)
 	io_init_identity(&tctx->__identity);
 	tctx->identity = &tctx->__identity;
 	task->io_uring = tctx;
+	spin_lock_init(&tctx->task_lock);
+	INIT_WQ_LIST(&tctx->task_list);
+	tctx->task_state = 0;
+	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
 
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 35b2d845704d..2eb6d19de336 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -22,6 +22,15 @@ struct io_identity {
 	refcount_t			count;
 };
 
+struct io_wq_work_node {
+	struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+	struct io_wq_work_node *first;
+	struct io_wq_work_node *last;
+};
+
 struct io_uring_task {
 	/* submission side */
 	struct xarray		xa;
@@ -32,6 +41,11 @@ struct io_uring_task {
 	struct io_identity	*identity;
 	atomic_t		in_idle;
 	bool			sqpoll;
+
+	spinlock_t		task_lock;
+	struct io_wq_work_list	task_list;
+	unsigned long		task_state;
+	struct callback_head	task_work;
 };
 
 #if defined(CONFIG_IO_URING)
-- 
cgit v1.2.3


From a805a4fa4fa376bbc145762bb8b09caa2fa8af48 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 28 Jan 2021 13:47:30 +0900
Subject: block: introduce zone_write_granularity limit

Per ZBC and ZAC specifications, host-managed SMR hard-disks mandate that
all writes into sequential write required zones be aligned to the device
physical block size. However, NVMe ZNS does not have this constraint and
allows write operations into sequential zones to be aligned to the
device logical block size. This inconsistency does not help with
software portability across device types.

To solve this, introduce the zone_write_granularity queue limit to
indicate the alignment constraint, in bytes, of write operations into
zones of a zoned block device. This new limit is exported as a
read-only sysfs queue attribute and the helper
blk_queue_zone_write_granularity() introduced for drivers to set this
limit.

The function blk_queue_set_zoned() is modified to set this new limit to
the device logical block size by default. NVMe ZNS devices as well as
zoned nullb devices use this default value as is. The scsi disk driver
is modified to execute the blk_queue_zone_write_granularity() helper to
set the zone write granularity of host-managed SMR disks to the disk
physical block size.

The accessor functions queue_zone_write_granularity() and
bdev_zone_write_granularity() are also introduced.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@edc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/block/queue-sysfs.rst |  7 +++++++
 block/blk-settings.c                | 37 ++++++++++++++++++++++++++++++++++++-
 block/blk-sysfs.c                   |  8 ++++++++
 drivers/scsi/sd_zbc.c               |  8 ++++++++
 include/linux/blkdev.h              | 15 +++++++++++++++
 5 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
index edc6e6960b96..4dc7f0d499a8 100644
--- a/Documentation/block/queue-sysfs.rst
+++ b/Documentation/block/queue-sysfs.rst
@@ -279,4 +279,11 @@ devices are described in the ZBC (Zoned Block Commands) and ZAC
 do not support zone commands, they will be treated as regular block devices
 and zoned will report "none".
 
+zone_write_granularity (RO)
+---------------------------
+This indicates the alignment constraint, in bytes, for write operations in
+sequential zones of zoned block devices (devices with a zoned attributed
+that reports "host-managed" or "host-aware"). This value is always 0 for
+regular block devices.
+
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 4c974340f1a9..a1e66165adcf 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -60,6 +60,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->io_opt = 0;
 	lim->misaligned = 0;
 	lim->zoned = BLK_ZONED_NONE;
+	lim->zone_write_granularity = 0;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
@@ -366,6 +367,28 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
 }
 EXPORT_SYMBOL(blk_queue_physical_block_size);
 
+/**
+ * blk_queue_zone_write_granularity - set zone write granularity for the queue
+ * @q:  the request queue for the zoned device
+ * @size:  the zone write granularity size, in bytes
+ *
+ * Description:
+ *   This should be set to the lowest possible size allowing to write in
+ *   sequential zones of a zoned block device.
+ */
+void blk_queue_zone_write_granularity(struct request_queue *q,
+				      unsigned int size)
+{
+	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
+		return;
+
+	q->limits.zone_write_granularity = size;
+
+	if (q->limits.zone_write_granularity < q->limits.logical_block_size)
+		q->limits.zone_write_granularity = q->limits.logical_block_size;
+}
+EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
+
 /**
  * blk_queue_alignment_offset - set physical block alignment offset
  * @q:	the request queue for the device
@@ -631,6 +654,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 			t->discard_granularity;
 	}
 
+	t->zone_write_granularity = max(t->zone_write_granularity,
+					b->zone_write_granularity);
 	t->zoned = max(t->zoned, b->zoned);
 	return ret;
 }
@@ -847,6 +872,8 @@ EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
  */
 void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 {
+	struct request_queue *q = disk->queue;
+
 	switch (model) {
 	case BLK_ZONED_HM:
 		/*
@@ -875,7 +902,15 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 		break;
 	}
 
-	disk->queue->limits.zoned = model;
+	q->limits.zoned = model;
+	if (model != BLK_ZONED_NONE) {
+		/*
+		 * Set the zone write granularity to the device logical block
+		 * size by default. The driver can change this value if needed.
+		 */
+		blk_queue_zone_write_granularity(q,
+						queue_logical_block_size(q));
+	}
 }
 EXPORT_SYMBOL_GPL(blk_queue_set_zoned);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b513f1683af0..ae39c7f3d83d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -219,6 +219,12 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
 		(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
 }
 
+static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
+						 char *page)
+{
+	return queue_var_show(queue_zone_write_granularity(q), page);
+}
+
 static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
 {
 	unsigned long long max_sectors = q->limits.max_zone_append_sectors;
@@ -585,6 +591,7 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
 QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
@@ -639,6 +646,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_write_same_max_entry.attr,
 	&queue_write_zeroes_max_entry.attr,
 	&queue_zone_append_max_entry.attr,
+	&queue_zone_write_granularity_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_zoned_entry.attr,
 	&queue_nr_zones_entry.attr,
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index cf07b7f93579..8293b29584b3 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -789,6 +789,14 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
 	blk_queue_max_active_zones(q, 0);
 	nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
 
+	/*
+	 * Per ZBC and ZAC specifications, writes in sequential write required
+	 * zones of host-managed devices must be aligned to the device physical
+	 * block size.
+	 */
+	if (blk_queue_zoned_model(q) == BLK_ZONED_HM)
+		blk_queue_zone_write_granularity(q, sdkp->physical_block_size);
+
 	/* READ16/WRITE16 is mandatory for ZBC disks */
 	sdkp->device->use_16_for_rw = 1;
 	sdkp->device->use_10_for_rw = 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0dea268bd61b..9149f4a5adb3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -337,6 +337,7 @@ struct queue_limits {
 	unsigned int		max_zone_append_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
+	unsigned int		zone_write_granularity;
 
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
@@ -1160,6 +1161,8 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
 		unsigned int max_zone_append_sectors);
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
+void blk_queue_zone_write_granularity(struct request_queue *q,
+				      unsigned int size);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
 void blk_queue_update_readahead(struct request_queue *q);
@@ -1473,6 +1476,18 @@ static inline int bdev_io_opt(struct block_device *bdev)
 	return queue_io_opt(bdev_get_queue(bdev));
 }
 
+static inline unsigned int
+queue_zone_write_granularity(const struct request_queue *q)
+{
+	return q->limits.zone_write_granularity;
+}
+
+static inline unsigned int
+bdev_zone_write_granularity(struct block_device *bdev)
+{
+	return queue_zone_write_granularity(bdev_get_queue(bdev));
+}
+
 static inline int queue_alignment_offset(const struct request_queue *q)
 {
 	if (q->limits.misaligned)
-- 
cgit v1.2.3


From 83fba8c8114748a18e20391565cfdfdf8466075c Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Mon, 1 Feb 2021 11:49:38 +0800
Subject: blk-mq: introduce blk_mq_set_request_complete

nvme drivers need to set the state of request to MQ_RQ_COMPLETE when
directly complete request in queue_rq.
So add blk_mq_set_request_complete.

Signed-off-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/blk-mq.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index aabbf6830ffc..2c473c9b8990 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -490,6 +490,18 @@ static inline int blk_mq_request_completed(struct request *rq)
 	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
 }
 
+/*
+ * 
+ * Set the state to complete when completing a request from inside ->queue_rq.
+ * This is used by drivers that want to ensure special complete actions that
+ * need access to the request are called on failure, e.g. by nvme for
+ * multipathing.
+ */
+static inline void blk_mq_set_request_complete(struct request *rq)
+{
+	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
+}
+
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
-- 
cgit v1.2.3


From 1852ebd1354201cf4ab9564947ff2a17a918b294 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 10 Feb 2021 21:27:56 +1100
Subject: of: irq: make a stub for of_irq_parse_one()

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/r/20210210214720.02e6a6be@canb.auug.org.au
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/of_irq.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index e8b78139f78c..f898d838d201 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -33,8 +33,6 @@ static inline int of_irq_parse_oldworld(struct device_node *device, int index,
 #endif /* CONFIG_PPC32 && CONFIG_PPC_PMAC */
 
 extern int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq);
-extern int of_irq_parse_one(struct device_node *device, int index,
-			  struct of_phandle_args *out_irq);
 extern unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data);
 extern int of_irq_to_resource(struct device_node *dev, int index,
 			      struct resource *r);
@@ -42,6 +40,8 @@ extern int of_irq_to_resource(struct device_node *dev, int index,
 extern void of_irq_init(const struct of_device_id *matches);
 
 #ifdef CONFIG_OF_IRQ
+extern int of_irq_parse_one(struct device_node *device, int index,
+			  struct of_phandle_args *out_irq);
 extern int of_irq_count(struct device_node *dev);
 extern int of_irq_get(struct device_node *dev, int index);
 extern int of_irq_get_byname(struct device_node *dev, const char *name);
@@ -57,6 +57,11 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev,
 extern void of_msi_configure(struct device *dev, struct device_node *np);
 u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in);
 #else
+static inline int of_irq_parse_one(struct device_node *device, int index,
+				   struct of_phandle_args *out_irq)
+{
+	return 0;
+}
 static inline int of_irq_count(struct device_node *dev)
 {
 	return 0;
-- 
cgit v1.2.3


From 01f810ace9ed37255f27608a0864abebccf0aab3 Mon Sep 17 00:00:00 2001
From: Andrei Matei <andreimatei1@gmail.com>
Date: Sat, 6 Feb 2021 20:10:24 -0500
Subject: bpf: Allow variable-offset stack access

Before this patch, variable offset access to the stack was dissalowed
for regular instructions, but was allowed for "indirect" accesses (i.e.
helpers). This patch removes the restriction, allowing reading and
writing to the stack through stack pointers with variable offsets. This
makes stack-allocated buffers more usable in programs, and brings stack
pointers closer to other types of pointers.

The motivation is being able to use stack-allocated buffers for data
manipulation. When the stack size limit is sufficient, allocating
buffers on the stack is simpler than per-cpu arrays, or other
alternatives.

In unpriviledged programs, variable-offset reads and writes are
disallowed (they were already disallowed for the indirect access case)
because the speculative execution checking code doesn't support them.
Additionally, when writing through a variable-offset stack pointer, if
any pointers are in the accessible range, there's possilibities of later
leaking pointers because the write cannot be tracked precisely.

Writes with variable offset mark the whole range as initialized, even
though we don't know which stack slots are actually written. This is in
order to not reject future reads to these slots. Note that this doesn't
affect writes done through helpers; like before, helpers need the whole
stack range to be initialized to begin with.
All the stack slots are in range are considered scalars after the write;
variable-offset register spills are not tracked.

For reads, all the stack slots in the variable range needs to be
initialized (but see above about what writes do), otherwise the read is
rejected. All register spilled in stack slots that might be read are
marked as having been read, however reads through such pointers don't do
register filling; the target register will always be either a scalar or
a constant zero.

Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210207011027.676572-2-andreimatei1@gmail.com
---
 include/linux/bpf.h          |   5 +
 include/linux/bpf_verifier.h |   3 +-
 kernel/bpf/verifier.c        | 657 +++++++++++++++++++++++++++++++++----------
 3 files changed, 518 insertions(+), 147 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 321966fc35db..079162bbd387 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1290,6 +1290,11 @@ static inline bool bpf_allow_ptr_leaks(void)
 	return perfmon_capable();
 }
 
+static inline bool bpf_allow_uninit_stack(void)
+{
+	return perfmon_capable();
+}
+
 static inline bool bpf_allow_ptr_to_map_access(void)
 {
 	return perfmon_capable();
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index dfe6f85d97dd..532c97836d0d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -195,7 +195,7 @@ struct bpf_func_state {
 	 * 0 = main function, 1 = first callee.
 	 */
 	u32 frameno;
-	/* subprog number == index within subprog_stack_depth
+	/* subprog number == index within subprog_info
 	 * zero == main subprog
 	 */
 	u32 subprogno;
@@ -404,6 +404,7 @@ struct bpf_verifier_env {
 	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
+	bool allow_uninit_stack;
 	bool allow_ptr_to_map_access;
 	bool bpf_capable;
 	bool bypass_spec_v1;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 15694246f854..400d79e99fc8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2275,12 +2275,14 @@ static void save_register_state(struct bpf_func_state *state,
 		state->stack[spi].slot_type[i] = STACK_SPILL;
 }
 
-/* check_stack_read/write functions track spill/fill of registers,
+/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
-static int check_stack_write(struct bpf_verifier_env *env,
-			     struct bpf_func_state *state, /* func where register points to */
-			     int off, int size, int value_regno, int insn_idx)
+static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
+				       /* stack frame we're writing to */
+				       struct bpf_func_state *state,
+				       int off, int size, int value_regno,
+				       int insn_idx)
 {
 	struct bpf_func_state *cur; /* state of the current function */
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
@@ -2406,9 +2408,175 @@ static int check_stack_write(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static int check_stack_read(struct bpf_verifier_env *env,
-			    struct bpf_func_state *reg_state /* func where register points to */,
-			    int off, int size, int value_regno)
+/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
+ * known to contain a variable offset.
+ * This function checks whether the write is permitted and conservatively
+ * tracks the effects of the write, considering that each stack slot in the
+ * dynamic range is potentially written to.
+ *
+ * 'off' includes 'regno->off'.
+ * 'value_regno' can be -1, meaning that an unknown value is being written to
+ * the stack.
+ *
+ * Spilled pointers in range are not marked as written because we don't know
+ * what's going to be actually written. This means that read propagation for
+ * future reads cannot be terminated by this write.
+ *
+ * For privileged programs, uninitialized stack slots are considered
+ * initialized by this write (even though we don't know exactly what offsets
+ * are going to be written to). The idea is that we don't want the verifier to
+ * reject future reads that access slots written to through variable offsets.
+ */
+static int check_stack_write_var_off(struct bpf_verifier_env *env,
+				     /* func where register points to */
+				     struct bpf_func_state *state,
+				     int ptr_regno, int off, int size,
+				     int value_regno, int insn_idx)
+{
+	struct bpf_func_state *cur; /* state of the current function */
+	int min_off, max_off;
+	int i, err;
+	struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+	bool writing_zero = false;
+	/* set if the fact that we're writing a zero is used to let any
+	 * stack slots remain STACK_ZERO
+	 */
+	bool zero_used = false;
+
+	cur = env->cur_state->frame[env->cur_state->curframe];
+	ptr_reg = &cur->regs[ptr_regno];
+	min_off = ptr_reg->smin_value + off;
+	max_off = ptr_reg->smax_value + off + size;
+	if (value_regno >= 0)
+		value_reg = &cur->regs[value_regno];
+	if (value_reg && register_is_null(value_reg))
+		writing_zero = true;
+
+	err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE),
+				 state->acquired_refs, true);
+	if (err)
+		return err;
+
+
+	/* Variable offset writes destroy any spilled pointers in range. */
+	for (i = min_off; i < max_off; i++) {
+		u8 new_type, *stype;
+		int slot, spi;
+
+		slot = -i - 1;
+		spi = slot / BPF_REG_SIZE;
+		stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+
+		if (!env->allow_ptr_leaks
+				&& *stype != NOT_INIT
+				&& *stype != SCALAR_VALUE) {
+			/* Reject the write if there's are spilled pointers in
+			 * range. If we didn't reject here, the ptr status
+			 * would be erased below (even though not all slots are
+			 * actually overwritten), possibly opening the door to
+			 * leaks.
+			 */
+			verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
+				insn_idx, i);
+			return -EINVAL;
+		}
+
+		/* Erase all spilled pointers. */
+		state->stack[spi].spilled_ptr.type = NOT_INIT;
+
+		/* Update the slot type. */
+		new_type = STACK_MISC;
+		if (writing_zero && *stype == STACK_ZERO) {
+			new_type = STACK_ZERO;
+			zero_used = true;
+		}
+		/* If the slot is STACK_INVALID, we check whether it's OK to
+		 * pretend that it will be initialized by this write. The slot
+		 * might not actually be written to, and so if we mark it as
+		 * initialized future reads might leak uninitialized memory.
+		 * For privileged programs, we will accept such reads to slots
+		 * that may or may not be written because, if we're reject
+		 * them, the error would be too confusing.
+		 */
+		if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
+			verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
+					insn_idx, i);
+			return -EINVAL;
+		}
+		*stype = new_type;
+	}
+	if (zero_used) {
+		/* backtracking doesn't work for STACK_ZERO yet. */
+		err = mark_chain_precision(env, value_regno);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/* When register 'dst_regno' is assigned some values from stack[min_off,
+ * max_off), we set the register's type according to the types of the
+ * respective stack slots. If all the stack values are known to be zeros, then
+ * so is the destination reg. Otherwise, the register is considered to be
+ * SCALAR. This function does not deal with register filling; the caller must
+ * ensure that all spilled registers in the stack range have been marked as
+ * read.
+ */
+static void mark_reg_stack_read(struct bpf_verifier_env *env,
+				/* func where src register points to */
+				struct bpf_func_state *ptr_state,
+				int min_off, int max_off, int dst_regno)
+{
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
+	int i, slot, spi;
+	u8 *stype;
+	int zeros = 0;
+
+	for (i = min_off; i < max_off; i++) {
+		slot = -i - 1;
+		spi = slot / BPF_REG_SIZE;
+		stype = ptr_state->stack[spi].slot_type;
+		if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
+			break;
+		zeros++;
+	}
+	if (zeros == max_off - min_off) {
+		/* any access_size read into register is zero extended,
+		 * so the whole register == const_zero
+		 */
+		__mark_reg_const_zero(&state->regs[dst_regno]);
+		/* backtracking doesn't support STACK_ZERO yet,
+		 * so mark it precise here, so that later
+		 * backtracking can stop here.
+		 * Backtracking may not need this if this register
+		 * doesn't participate in pointer adjustment.
+		 * Forward propagation of precise flag is not
+		 * necessary either. This mark is only to stop
+		 * backtracking. Any register that contributed
+		 * to const 0 was marked precise before spill.
+		 */
+		state->regs[dst_regno].precise = true;
+	} else {
+		/* have read misc data from the stack */
+		mark_reg_unknown(env, state->regs, dst_regno);
+	}
+	state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
+}
+
+/* Read the stack at 'off' and put the results into the register indicated by
+ * 'dst_regno'. It handles reg filling if the addressed stack slot is a
+ * spilled reg.
+ *
+ * 'dst_regno' can be -1, meaning that the read value is not going to a
+ * register.
+ *
+ * The access is assumed to be within the current stack bounds.
+ */
+static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
+				      /* func where src register points to */
+				      struct bpf_func_state *reg_state,
+				      int off, int size, int dst_regno)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
@@ -2416,11 +2584,6 @@ static int check_stack_read(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg;
 	u8 *stype;
 
-	if (reg_state->allocated_stack <= slot) {
-		verbose(env, "invalid read from stack off %d+0 size %d\n",
-			off, size);
-		return -EACCES;
-	}
 	stype = reg_state->stack[spi].slot_type;
 	reg = &reg_state->stack[spi].spilled_ptr;
 
@@ -2431,9 +2594,9 @@ static int check_stack_read(struct bpf_verifier_env *env,
 				verbose(env, "invalid size of register fill\n");
 				return -EACCES;
 			}
-			if (value_regno >= 0) {
-				mark_reg_unknown(env, state->regs, value_regno);
-				state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+			if (dst_regno >= 0) {
+				mark_reg_unknown(env, state->regs, dst_regno);
+				state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
 			}
 			mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
 			return 0;
@@ -2445,16 +2608,16 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			}
 		}
 
-		if (value_regno >= 0) {
+		if (dst_regno >= 0) {
 			/* restore register state from stack */
-			state->regs[value_regno] = *reg;
+			state->regs[dst_regno] = *reg;
 			/* mark reg as written since spilled pointer state likely
 			 * has its liveness marks cleared by is_state_visited()
 			 * which resets stack/reg liveness for state transitions
 			 */
-			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+			state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
 		} else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
-			/* If value_regno==-1, the caller is asking us whether
+			/* If dst_regno==-1, the caller is asking us whether
 			 * it is acceptable to use this value as a SCALAR_VALUE
 			 * (e.g. for XADD).
 			 * We must not allow unprivileged callers to do that
@@ -2466,70 +2629,167 @@ static int check_stack_read(struct bpf_verifier_env *env,
 		}
 		mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
 	} else {
-		int zeros = 0;
+		u8 type;
 
 		for (i = 0; i < size; i++) {
-			if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
+			type = stype[(slot - i) % BPF_REG_SIZE];
+			if (type == STACK_MISC)
 				continue;
-			if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
-				zeros++;
+			if (type == STACK_ZERO)
 				continue;
-			}
 			verbose(env, "invalid read from stack off %d+%d size %d\n",
 				off, i, size);
 			return -EACCES;
 		}
 		mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
-		if (value_regno >= 0) {
-			if (zeros == size) {
-				/* any size read into register is zero extended,
-				 * so the whole register == const_zero
-				 */
-				__mark_reg_const_zero(&state->regs[value_regno]);
-				/* backtracking doesn't support STACK_ZERO yet,
-				 * so mark it precise here, so that later
-				 * backtracking can stop here.
-				 * Backtracking may not need this if this register
-				 * doesn't participate in pointer adjustment.
-				 * Forward propagation of precise flag is not
-				 * necessary either. This mark is only to stop
-				 * backtracking. Any register that contributed
-				 * to const 0 was marked precise before spill.
-				 */
-				state->regs[value_regno].precise = true;
-			} else {
-				/* have read misc data from the stack */
-				mark_reg_unknown(env, state->regs, value_regno);
-			}
-			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
-		}
+		if (dst_regno >= 0)
+			mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
 	}
 	return 0;
 }
 
-static int check_stack_access(struct bpf_verifier_env *env,
-			      const struct bpf_reg_state *reg,
-			      int off, int size)
+enum stack_access_src {
+	ACCESS_DIRECT = 1,  /* the access is performed by an instruction */
+	ACCESS_HELPER = 2,  /* the access is performed by a helper */
+};
+
+static int check_stack_range_initialized(struct bpf_verifier_env *env,
+					 int regno, int off, int access_size,
+					 bool zero_size_allowed,
+					 enum stack_access_src type,
+					 struct bpf_call_arg_meta *meta);
+
+static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
+{
+	return cur_regs(env) + regno;
+}
+
+/* Read the stack at 'ptr_regno + off' and put the result into the register
+ * 'dst_regno'.
+ * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'),
+ * but not its variable offset.
+ * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
+ *
+ * As opposed to check_stack_read_fixed_off, this function doesn't deal with
+ * filling registers (i.e. reads of spilled register cannot be detected when
+ * the offset is not fixed). We conservatively mark 'dst_regno' as containing
+ * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
+ * offset; for a fixed offset check_stack_read_fixed_off should be used
+ * instead.
+ */
+static int check_stack_read_var_off(struct bpf_verifier_env *env,
+				    int ptr_regno, int off, int size, int dst_regno)
 {
-	/* Stack accesses must be at a fixed offset, so that we
-	 * can determine what type of data were returned. See
-	 * check_stack_read().
+	/* The state of the source register. */
+	struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+	struct bpf_func_state *ptr_state = func(env, reg);
+	int err;
+	int min_off, max_off;
+
+	/* Note that we pass a NULL meta, so raw access will not be permitted.
 	 */
-	if (!tnum_is_const(reg->var_off)) {
+	err = check_stack_range_initialized(env, ptr_regno, off, size,
+					    false, ACCESS_DIRECT, NULL);
+	if (err)
+		return err;
+
+	min_off = reg->smin_value + off;
+	max_off = reg->smax_value + off;
+	mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
+	return 0;
+}
+
+/* check_stack_read dispatches to check_stack_read_fixed_off or
+ * check_stack_read_var_off.
+ *
+ * The caller must ensure that the offset falls within the allocated stack
+ * bounds.
+ *
+ * 'dst_regno' is a register which will receive the value from the stack. It
+ * can be -1, meaning that the read value is not going to a register.
+ */
+static int check_stack_read(struct bpf_verifier_env *env,
+			    int ptr_regno, int off, int size,
+			    int dst_regno)
+{
+	struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+	struct bpf_func_state *state = func(env, reg);
+	int err;
+	/* Some accesses are only permitted with a static offset. */
+	bool var_off = !tnum_is_const(reg->var_off);
+
+	/* The offset is required to be static when reads don't go to a
+	 * register, in order to not leak pointers (see
+	 * check_stack_read_fixed_off).
+	 */
+	if (dst_regno < 0 && var_off) {
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
+		verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
 			tn_buf, off, size);
 		return -EACCES;
 	}
+	/* Variable offset is prohibited for unprivileged mode for simplicity
+	 * since it requires corresponding support in Spectre masking for stack
+	 * ALU. See also retrieve_ptr_limit().
+	 */
+	if (!env->bypass_spec_v1 && var_off) {
+		char tn_buf[48];
 
-	if (off >= 0 || off < -MAX_BPF_STACK) {
-		verbose(env, "invalid stack off=%d size=%d\n", off, size);
+		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+		verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
+				ptr_regno, tn_buf);
 		return -EACCES;
 	}
 
-	return 0;
+	if (!var_off) {
+		off += reg->var_off.value;
+		err = check_stack_read_fixed_off(env, state, off, size,
+						 dst_regno);
+	} else {
+		/* Variable offset stack reads need more conservative handling
+		 * than fixed offset ones. Note that dst_regno >= 0 on this
+		 * branch.
+		 */
+		err = check_stack_read_var_off(env, ptr_regno, off, size,
+					       dst_regno);
+	}
+	return err;
+}
+
+
+/* check_stack_write dispatches to check_stack_write_fixed_off or
+ * check_stack_write_var_off.
+ *
+ * 'ptr_regno' is the register used as a pointer into the stack.
+ * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
+ * 'value_regno' is the register whose value we're writing to the stack. It can
+ * be -1, meaning that we're not writing from a register.
+ *
+ * The caller must ensure that the offset falls within the maximum stack size.
+ */
+static int check_stack_write(struct bpf_verifier_env *env,
+			     int ptr_regno, int off, int size,
+			     int value_regno, int insn_idx)
+{
+	struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+	struct bpf_func_state *state = func(env, reg);
+	int err;
+
+	if (tnum_is_const(reg->var_off)) {
+		off += reg->var_off.value;
+		err = check_stack_write_fixed_off(env, state, off, size,
+						  value_regno, insn_idx);
+	} else {
+		/* Variable offset stack reads need more conservative handling
+		 * than fixed offset ones.
+		 */
+		err = check_stack_write_var_off(env, state,
+						ptr_regno, off, size,
+						value_regno, insn_idx);
+	}
+	return err;
 }
 
 static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
@@ -2862,11 +3122,6 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 	return -EACCES;
 }
 
-static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
-{
-	return cur_regs(env) + regno;
-}
-
 static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 {
 	return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
@@ -2985,8 +3240,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 		break;
 	case PTR_TO_STACK:
 		pointer_desc = "stack ";
-		/* The stack spill tracking logic in check_stack_write()
-		 * and check_stack_read() relies on stack accesses being
+		/* The stack spill tracking logic in check_stack_write_fixed_off()
+		 * and check_stack_read_fixed_off() relies on stack accesses being
 		 * aligned.
 		 */
 		strict = true;
@@ -3402,6 +3657,91 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
 	return 0;
 }
 
+/* Check that the stack access at the given offset is within bounds. The
+ * maximum valid offset is -1.
+ *
+ * The minimum valid offset is -MAX_BPF_STACK for writes, and
+ * -state->allocated_stack for reads.
+ */
+static int check_stack_slot_within_bounds(int off,
+					  struct bpf_func_state *state,
+					  enum bpf_access_type t)
+{
+	int min_valid_off;
+
+	if (t == BPF_WRITE)
+		min_valid_off = -MAX_BPF_STACK;
+	else
+		min_valid_off = -state->allocated_stack;
+
+	if (off < min_valid_off || off > -1)
+		return -EACCES;
+	return 0;
+}
+
+/* Check that the stack access at 'regno + off' falls within the maximum stack
+ * bounds.
+ *
+ * 'off' includes `regno->offset`, but not its dynamic part (if any).
+ */
+static int check_stack_access_within_bounds(
+		struct bpf_verifier_env *env,
+		int regno, int off, int access_size,
+		enum stack_access_src src, enum bpf_access_type type)
+{
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = regs + regno;
+	struct bpf_func_state *state = func(env, reg);
+	int min_off, max_off;
+	int err;
+	char *err_extra;
+
+	if (src == ACCESS_HELPER)
+		/* We don't know if helpers are reading or writing (or both). */
+		err_extra = " indirect access to";
+	else if (type == BPF_READ)
+		err_extra = " read from";
+	else
+		err_extra = " write to";
+
+	if (tnum_is_const(reg->var_off)) {
+		min_off = reg->var_off.value + off;
+		if (access_size > 0)
+			max_off = min_off + access_size - 1;
+		else
+			max_off = min_off;
+	} else {
+		if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+		    reg->smin_value <= -BPF_MAX_VAR_OFF) {
+			verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
+				err_extra, regno);
+			return -EACCES;
+		}
+		min_off = reg->smin_value + off;
+		if (access_size > 0)
+			max_off = reg->smax_value + off + access_size - 1;
+		else
+			max_off = min_off;
+	}
+
+	err = check_stack_slot_within_bounds(min_off, state, type);
+	if (!err)
+		err = check_stack_slot_within_bounds(max_off, state, type);
+
+	if (err) {
+		if (tnum_is_const(reg->var_off)) {
+			verbose(env, "invalid%s stack R%d off=%d size=%d\n",
+				err_extra, regno, off, access_size);
+		} else {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
+				err_extra, regno, tn_buf, access_size);
+		}
+	}
+	return err;
+}
 
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
@@ -3517,8 +3857,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		}
 
 	} else if (reg->type == PTR_TO_STACK) {
-		off += reg->var_off.value;
-		err = check_stack_access(env, reg, off, size);
+		/* Basic bounds checks. */
+		err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
 		if (err)
 			return err;
 
@@ -3527,12 +3867,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (err)
 			return err;
 
-		if (t == BPF_WRITE)
-			err = check_stack_write(env, state, off, size,
-						value_regno, insn_idx);
-		else
-			err = check_stack_read(env, state, off, size,
+		if (t == BPF_READ)
+			err = check_stack_read(env, regno, off, size,
 					       value_regno);
+		else
+			err = check_stack_write(env, regno, off, size,
+						value_regno, insn_idx);
 	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose(env, "cannot write into packet\n");
@@ -3699,49 +4039,53 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 	return 0;
 }
 
-static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
-				  int off, int access_size,
-				  bool zero_size_allowed)
+/* When register 'regno' is used to read the stack (either directly or through
+ * a helper function) make sure that it's within stack boundary and, depending
+ * on the access type, that all elements of the stack are initialized.
+ *
+ * 'off' includes 'regno->off', but not its dynamic part (if any).
+ *
+ * All registers that have been spilled on the stack in the slots within the
+ * read offsets are marked as read.
+ */
+static int check_stack_range_initialized(
+		struct bpf_verifier_env *env, int regno, int off,
+		int access_size, bool zero_size_allowed,
+		enum stack_access_src type, struct bpf_call_arg_meta *meta)
 {
 	struct bpf_reg_state *reg = reg_state(env, regno);
+	struct bpf_func_state *state = func(env, reg);
+	int err, min_off, max_off, i, j, slot, spi;
+	char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
+	enum bpf_access_type bounds_check_type;
+	/* Some accesses can write anything into the stack, others are
+	 * read-only.
+	 */
+	bool clobber = false;
 
-	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
-	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
-		if (tnum_is_const(reg->var_off)) {
-			verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
-				regno, off, access_size);
-		} else {
-			char tn_buf[48];
-
-			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
-				regno, tn_buf, access_size);
-		}
+	if (access_size == 0 && !zero_size_allowed) {
+		verbose(env, "invalid zero-sized read\n");
 		return -EACCES;
 	}
-	return 0;
-}
 
-/* when register 'regno' is passed into function that will read 'access_size'
- * bytes from that pointer, make sure that it's within stack boundary
- * and all elements of stack are initialized.
- * Unlike most pointer bounds-checking functions, this one doesn't take an
- * 'off' argument, so it has to add in reg->off itself.
- */
-static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
-				int access_size, bool zero_size_allowed,
-				struct bpf_call_arg_meta *meta)
-{
-	struct bpf_reg_state *reg = reg_state(env, regno);
-	struct bpf_func_state *state = func(env, reg);
-	int err, min_off, max_off, i, j, slot, spi;
+	if (type == ACCESS_HELPER) {
+		/* The bounds checks for writes are more permissive than for
+		 * reads. However, if raw_mode is not set, we'll do extra
+		 * checks below.
+		 */
+		bounds_check_type = BPF_WRITE;
+		clobber = true;
+	} else {
+		bounds_check_type = BPF_READ;
+	}
+	err = check_stack_access_within_bounds(env, regno, off, access_size,
+					       type, bounds_check_type);
+	if (err)
+		return err;
+
 
 	if (tnum_is_const(reg->var_off)) {
-		min_off = max_off = reg->var_off.value + reg->off;
-		err = __check_stack_boundary(env, regno, min_off, access_size,
-					     zero_size_allowed);
-		if (err)
-			return err;
+		min_off = max_off = reg->var_off.value + off;
 	} else {
 		/* Variable offset is prohibited for unprivileged mode for
 		 * simplicity since it requires corresponding support in
@@ -3752,8 +4096,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
-				regno, tn_buf);
+			verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
+				regno, err_extra, tn_buf);
 			return -EACCES;
 		}
 		/* Only initialized buffer on stack is allowed to be accessed
@@ -3765,28 +4109,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		if (meta && meta->raw_mode)
 			meta = NULL;
 
-		if (reg->smax_value >= BPF_MAX_VAR_OFF ||
-		    reg->smax_value <= -BPF_MAX_VAR_OFF) {
-			verbose(env, "R%d unbounded indirect variable offset stack access\n",
-				regno);
-			return -EACCES;
-		}
-		min_off = reg->smin_value + reg->off;
-		max_off = reg->smax_value + reg->off;
-		err = __check_stack_boundary(env, regno, min_off, access_size,
-					     zero_size_allowed);
-		if (err) {
-			verbose(env, "R%d min value is outside of stack bound\n",
-				regno);
-			return err;
-		}
-		err = __check_stack_boundary(env, regno, max_off, access_size,
-					     zero_size_allowed);
-		if (err) {
-			verbose(env, "R%d max value is outside of stack bound\n",
-				regno);
-			return err;
-		}
+		min_off = reg->smin_value + off;
+		max_off = reg->smax_value + off;
 	}
 
 	if (meta && meta->raw_mode) {
@@ -3806,8 +4130,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		if (*stype == STACK_MISC)
 			goto mark;
 		if (*stype == STACK_ZERO) {
-			/* helper can write anything into the stack */
-			*stype = STACK_MISC;
+			if (clobber) {
+				/* helper can write anything into the stack */
+				*stype = STACK_MISC;
+			}
 			goto mark;
 		}
 
@@ -3818,22 +4144,24 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		if (state->stack[spi].slot_type[0] == STACK_SPILL &&
 		    (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
 		     env->allow_ptr_leaks)) {
-			__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
-			for (j = 0; j < BPF_REG_SIZE; j++)
-				state->stack[spi].slot_type[j] = STACK_MISC;
+			if (clobber) {
+				__mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
+				for (j = 0; j < BPF_REG_SIZE; j++)
+					state->stack[spi].slot_type[j] = STACK_MISC;
+			}
 			goto mark;
 		}
 
 err:
 		if (tnum_is_const(reg->var_off)) {
-			verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
-				min_off, i - min_off, access_size);
+			verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
+				err_extra, regno, min_off, i - min_off, access_size);
 		} else {
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
-				tn_buf, i - min_off, access_size);
+			verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
+				err_extra, regno, tn_buf, i - min_off, access_size);
 		}
 		return -EACCES;
 mark:
@@ -3882,8 +4210,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 					   "rdwr",
 					   &env->prog->aux->max_rdwr_access);
 	case PTR_TO_STACK:
-		return check_stack_boundary(env, regno, access_size,
-					    zero_size_allowed, meta);
+		return check_stack_range_initialized(
+				env,
+				regno, reg->off, access_size,
+				zero_size_allowed, ACCESS_HELPER, meta);
 	default: /* scalar_value or invalid ptr */
 		/* Allow zero-byte read from NULL, regardless of pointer type */
 		if (zero_size_allowed && access_size == 0 &&
@@ -5547,6 +5877,41 @@ do_sim:
 	return !ret ? -EFAULT : 0;
 }
 
+/* check that stack access falls within stack limits and that 'reg' doesn't
+ * have a variable offset.
+ *
+ * Variable offset is prohibited for unprivileged mode for simplicity since it
+ * requires corresponding support in Spectre masking for stack ALU.  See also
+ * retrieve_ptr_limit().
+ *
+ *
+ * 'off' includes 'reg->off'.
+ */
+static int check_stack_access_for_ptr_arithmetic(
+				struct bpf_verifier_env *env,
+				int regno,
+				const struct bpf_reg_state *reg,
+				int off)
+{
+	if (!tnum_is_const(reg->var_off)) {
+		char tn_buf[48];
+
+		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+		verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
+			regno, tn_buf, off);
+		return -EACCES;
+	}
+
+	if (off >= 0 || off < -MAX_BPF_STACK) {
+		verbose(env, "R%d stack pointer arithmetic goes out of range, "
+			"prohibited for !root; off=%d\n", regno, off);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
+
 /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
  * Caller should also handle BPF_MOV case separately.
  * If we return -EACCES, caller may want to try again treating pointer as a
@@ -5790,10 +6155,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 				"prohibited for !root\n", dst);
 			return -EACCES;
 		} else if (dst_reg->type == PTR_TO_STACK &&
-			   check_stack_access(env, dst_reg, dst_reg->off +
-					      dst_reg->var_off.value, 1)) {
-			verbose(env, "R%d stack pointer arithmetic goes out of range, "
-				"prohibited for !root\n", dst);
+			   check_stack_access_for_ptr_arithmetic(
+				   env, dst, dst_reg, dst_reg->off +
+				   dst_reg->var_off.value)) {
 			return -EACCES;
 		}
 	}
@@ -12129,6 +12493,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 		env->strict_alignment = false;
 
 	env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+	env->allow_uninit_stack = bpf_allow_uninit_stack();
 	env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
 	env->bypass_spec_v1 = bpf_bypass_spec_v1();
 	env->bypass_spec_v4 = bpf_bypass_spec_v4();
-- 
cgit v1.2.3


From f31e3386a4e92ba6eda7328cb508462956c94c64 Mon Sep 17 00:00:00 2001
From: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Date: Thu, 4 Feb 2021 09:49:51 -0800
Subject: ima: Free IMA measurement buffer after kexec syscall

IMA allocates kernel virtual memory to carry forward the measurement
list, from the current kernel to the next kernel on kexec system call,
in ima_add_kexec_buffer() function.  This buffer is not freed before
completing the kexec system call resulting in memory leak.

Add ima_buffer field in "struct kimage" to store the virtual address
of the buffer allocated for the IMA measurement list.
Free the memory allocated for the IMA measurement list in
kimage_file_post_load_cleanup() function.

Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Suggested-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Reviewed-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/kexec.h              | 5 +++++
 kernel/kexec_file.c                | 5 +++++
 security/integrity/ima/ima_kexec.c | 2 ++
 3 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 9e93bef52968..5f61389f5f36 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -300,6 +300,11 @@ struct kimage {
 	/* Information for loading purgatory */
 	struct purgatory_info purgatory_info;
 #endif
+
+#ifdef CONFIG_IMA_KEXEC
+	/* Virtual address of IMA measurement buffer for kexec syscall */
+	void *ima_buffer;
+#endif
 };
 
 /* kexec interface functions */
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b02086d70492..5c3447cf7ad5 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -166,6 +166,11 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 	vfree(pi->sechdrs);
 	pi->sechdrs = NULL;
 
+#ifdef CONFIG_IMA_KEXEC
+	vfree(image->ima_buffer);
+	image->ima_buffer = NULL;
+#endif /* CONFIG_IMA_KEXEC */
+
 	/* See if architecture has anything to cleanup post load */
 	arch_kimage_file_post_load_cleanup(image);
 
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
index 206ddcaa5c67..e29bea3dd4cc 100644
--- a/security/integrity/ima/ima_kexec.c
+++ b/security/integrity/ima/ima_kexec.c
@@ -129,6 +129,8 @@ void ima_add_kexec_buffer(struct kimage *image)
 		return;
 	}
 
+	image->ima_buffer = kexec_buffer;
+
 	pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n",
 		 kbuf.mem);
 }
-- 
cgit v1.2.3


From cd1a41ceba8a4caef4d18a3a14d6d0f8c656efe4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 10 Feb 2021 00:40:52 +0100
Subject: softirq: Move __ARCH_HAS_DO_SOFTIRQ to Kconfig

To prepare for inlining do_softirq_own_stack() replace
__ARCH_HAS_DO_SOFTIRQ with a Kconfig switch and select it in the affected
architectures.

This allows in the next step to move the function prototype and the inline
stub into a seperate asm-generic header file which is required to avoid
include recursion.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210210002513.181713427@linutronix.de
---
 arch/Kconfig                      | 6 ++++++
 arch/parisc/Kconfig               | 1 +
 arch/parisc/include/asm/hardirq.h | 4 ----
 arch/powerpc/Kconfig              | 1 +
 arch/powerpc/include/asm/irq.h    | 2 --
 arch/s390/Kconfig                 | 1 +
 arch/s390/include/asm/hardirq.h   | 1 -
 arch/sh/Kconfig                   | 1 +
 arch/sh/include/asm/irq.h         | 1 -
 arch/sparc/Kconfig                | 1 +
 arch/sparc/include/asm/irq_64.h   | 1 -
 arch/x86/Kconfig                  | 1 +
 arch/x86/include/asm/irq.h        | 2 --
 include/linux/interrupt.h         | 2 +-
 14 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 24862d15f3a3..7eab9d0a4b43 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -759,6 +759,12 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
 	  This spares a stack switch and improves cache usage on softirq
 	  processing.
 
+config HAVE_SOFTIRQ_ON_OWN_STACK
+	bool
+	help
+	  Architecture provides a function to run __do_softirq() on a
+	  seperate stack.
+
 config PGTABLE_LEVELS
 	int
 	default 2
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 78b17621ee4a..7cd88c0a9a32 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -63,6 +63,7 @@ config PARISC
 	select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE
 	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
+	select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS
 	select SET_FS
 
 	help
diff --git a/arch/parisc/include/asm/hardirq.h b/arch/parisc/include/asm/hardirq.h
index fad29aa6f45f..1e4fbd0fd944 100644
--- a/arch/parisc/include/asm/hardirq.h
+++ b/arch/parisc/include/asm/hardirq.h
@@ -12,10 +12,6 @@
 #include <linux/threads.h>
 #include <linux/irq.h>
 
-#ifdef CONFIG_IRQSTACKS
-#define __ARCH_HAS_DO_SOFTIRQ
-#endif
-
 typedef struct {
 	unsigned int __softirq_pending;
 	unsigned int kernel_stack_usage;
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 107bb4319e0e..888d1ad436ce 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -237,6 +237,7 @@ config PPC
 	select MMU_GATHER_PAGE_SIZE
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_VIRT_CPU_ACCOUNTING
 	select HAVE_IRQ_TIME_ACCOUNTING
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index 4f983ca4030a..f3f264e441a7 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -37,8 +37,6 @@ extern int distribute_irqs;
 
 struct pt_regs;
 
-#define __ARCH_HAS_DO_SOFTIRQ
-
 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
 /*
  * Per-cpu stacks for handling critical, debug and machine check
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c72874f09741..e00c02ab5706 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -182,6 +182,7 @@ config S390
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE
 	select HAVE_RSEQ
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_VIRT_CPU_ACCOUNTING
 	select HAVE_VIRT_CPU_ACCOUNTING_IDLE
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h
index dfbc3c6c0674..58668ffb5488 100644
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -18,7 +18,6 @@
 #define or_softirq_pending(x)  (S390_lowcore.softirq_pending |= (x))
 
 #define __ARCH_IRQ_STAT
-#define __ARCH_HAS_DO_SOFTIRQ
 #define __ARCH_IRQ_EXIT_IRQS_DISABLED
 
 static inline void ack_bad_irq(unsigned int irq)
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 5fa580219a86..b49d5e033e29 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -56,6 +56,7 @@ config SUPERH
 	select HAVE_PERF_EVENTS
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_UID16
+	select HAVE_SOFTIRQ_ON_OWN_STACK if IRQSTACKS
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
 	select IRQ_FORCED_THREADING
diff --git a/arch/sh/include/asm/irq.h b/arch/sh/include/asm/irq.h
index 6d44c32ef047..839551ce398c 100644
--- a/arch/sh/include/asm/irq.h
+++ b/arch/sh/include/asm/irq.h
@@ -51,7 +51,6 @@ asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs);
 #ifdef CONFIG_IRQSTACKS
 extern void irq_ctx_init(int cpu);
 extern void irq_ctx_exit(int cpu);
-# define __ARCH_HAS_DO_SOFTIRQ
 #else
 # define irq_ctx_init(cpu) do { } while (0)
 # define irq_ctx_exit(cpu) do { } while (0)
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index c9c34dc52b7d..f0d22d530645 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -97,6 +97,7 @@ config SPARC64
 	select ARCH_HAS_PTE_SPECIAL
 	select PCI_DOMAINS if PCI
 	select ARCH_HAS_GIGANTIC_PAGE
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 
 config ARCH_PROC_KCORE_TEXT
 	def_bool y
diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h
index 4d748e93b974..154df2cf19f4 100644
--- a/arch/sparc/include/asm/irq_64.h
+++ b/arch/sparc/include/asm/irq_64.h
@@ -93,7 +93,6 @@ void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
 
 extern void *hardirq_stack[NR_CPUS];
 extern void *softirq_stack[NR_CPUS];
-#define __ARCH_HAS_DO_SOFTIRQ
 
 #define NO_IRQ		0xffffffff
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e17ce871bb19..019e0e1a5ee0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -221,6 +221,7 @@ config X86
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
 	select HAVE_FUNCTION_ARG_ACCESS_API
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 	select HAVE_STACKPROTECTOR		if CC_HAS_SANE_STACKPROTECTOR
 	select HAVE_STACK_VALIDATION		if X86_64
 	select HAVE_STATIC_CALL
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 528c8a71fe7f..f70d2ca3887e 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -25,8 +25,6 @@ static inline int irq_canonicalize(int irq)
 
 extern int irq_init_percpu_irqstack(unsigned int cpu);
 
-#define __ARCH_HAS_DO_SOFTIRQ
-
 struct irq_desc;
 
 extern void fixup_irqs(void);
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index bb8ff9083e7d..f0b918f393a2 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -569,7 +569,7 @@ struct softirq_action
 asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
 
-#ifdef __ARCH_HAS_DO_SOFTIRQ
+#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK
 void do_softirq_own_stack(void);
 #else
 static inline void do_softirq_own_stack(void)
-- 
cgit v1.2.3


From db1cc7aede37eb9235759131ddfefd9c0ea5136f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 10 Feb 2021 00:40:53 +0100
Subject: softirq: Move do_softirq_own_stack() to generic asm header

To avoid include recursion hell move the do_softirq_own_stack() related
content into a generic asm header and include it from all places in arch/
which need the prototype.

This allows architectures to provide an inline implementation of
do_softirq_own_stack() without introducing a lot of #ifdeffery all over the
place.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210210002513.289960691@linutronix.de
---
 arch/parisc/kernel/irq.c            |  1 +
 arch/powerpc/kernel/irq.c           |  1 +
 arch/s390/kernel/irq.c              |  1 +
 arch/sh/kernel/irq.c                |  1 +
 arch/sparc/kernel/irq_64.c          |  1 +
 arch/x86/kernel/irq_32.c            |  1 +
 arch/x86/kernel/irq_64.c            |  1 +
 include/asm-generic/Kbuild          |  1 +
 include/asm-generic/softirq_stack.h | 14 ++++++++++++++
 include/linux/interrupt.h           |  9 ---------
 kernel/softirq.c                    |  2 ++
 11 files changed, 24 insertions(+), 9 deletions(-)
 create mode 100644 include/asm-generic/softirq_stack.h

(limited to 'include/linux')

diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 49cd6d2caefb..1632d525ad64 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -17,6 +17,7 @@
 #include <linux/types.h>
 #include <asm/io.h>
 
+#include <asm/softirq_stack.h>
 #include <asm/smp.h>
 #include <asm/ldcw.h>
 
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 6b1eca53e36c..96296d3383a7 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -65,6 +65,7 @@
 #include <asm/livepatch.h>
 #include <asm/asm-prototypes.h>
 #include <asm/hw_irq.h>
+#include <asm/softirq_stack.h>
 
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index f8a8b9428ae2..a1a2f75f3c09 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -27,6 +27,7 @@
 #include <asm/irq.h>
 #include <asm/hw_irq.h>
 #include <asm/stacktrace.h>
+#include <asm/softirq_stack.h>
 #include "entry.h"
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index ab5f790b0cd2..ef0f0827cf57 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -20,6 +20,7 @@
 #include <linux/uaccess.h>
 #include <asm/thread_info.h>
 #include <cpu/mmu_context.h>
+#include <asm/softirq_stack.h>
 
 atomic_t irq_err_count;
 
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 3ec9f1402aad..c8848bb681a1 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -42,6 +42,7 @@
 #include <asm/head.h>
 #include <asm/hypervisor.h>
 #include <asm/cacheflush.h>
+#include <asm/softirq_stack.h>
 
 #include "entry.h"
 #include "cpumap.h"
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 0b79efc87be5..044902d5a3c4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -22,6 +22,7 @@
 
 #include <asm/apic.h>
 #include <asm/nospec-branch.h>
+#include <asm/softirq_stack.h>
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index b88fdb9686e6..f335c3910834 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,6 +20,7 @@
 #include <linux/sched/task_stack.h>
 
 #include <asm/cpu_entry_area.h>
+#include <asm/softirq_stack.h>
 #include <asm/irq_stack.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 267f6dfb8960..bd684188e2dc 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -51,6 +51,7 @@ mandatory-y += sections.h
 mandatory-y += serial.h
 mandatory-y += shmparam.h
 mandatory-y += simd.h
+mandatory-y += softirq_stack.h
 mandatory-y += switch_to.h
 mandatory-y += timex.h
 mandatory-y += tlbflush.h
diff --git a/include/asm-generic/softirq_stack.h b/include/asm-generic/softirq_stack.h
new file mode 100644
index 000000000000..eceeecf6a5bd
--- /dev/null
+++ b/include/asm-generic/softirq_stack.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_GENERIC_SOFTIRQ_STACK_H
+#define __ASM_GENERIC_SOFTIRQ_STACK_H
+
+#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK
+void do_softirq_own_stack(void);
+#else
+static inline void do_softirq_own_stack(void)
+{
+	__do_softirq();
+}
+#endif
+
+#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index f0b918f393a2..967e25767153 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -569,15 +569,6 @@ struct softirq_action
 asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
 
-#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK
-void do_softirq_own_stack(void);
-#else
-static inline void do_softirq_own_stack(void)
-{
-	__do_softirq();
-}
-#endif
-
 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
 extern void __raise_softirq_irqoff(unsigned int nr);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9d71046ea247..9908ec4a9bfe 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -26,6 +26,8 @@
 #include <linux/tick.h>
 #include <linux/irq.h>
 
+#include <asm/softirq_stack.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
 
-- 
cgit v1.2.3


From 5b74df80f301e872143fa716f3f4361b2e293e19 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Mon, 4 Jan 2021 09:38:57 +0200
Subject: net/mlx5: Delete device list leftover

Device list is not stored in mlx5_priv anymore, so delete it as it's not
used.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 88197b87bd81..936302b2c141 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -588,7 +588,6 @@ struct mlx5_priv {
 	/* end: alloc staff */
 	struct dentry	       *dbg_root;
 
-	struct list_head        dev_list;
 	struct list_head        ctx_list;
 	spinlock_t              ctx_lock;
 	struct mlx5_adev       **adev;
-- 
cgit v1.2.3


From 8c0381f55bbf70a3b8ab24e4f5ac62125c44c804 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Wed, 10 Feb 2021 12:00:49 -0800
Subject: of: irq: Fix the return value for of_irq_parse_one() stub

When commit 1852ebd13542 ("of: irq: make a stub for of_irq_parse_one()")
added a stub for of_irq_parse_one() it set the return value to 0. Return
value of 0 in this instance means the call succeeded and the out_irq
pointer was filled with valid data. So, fix it to return an error value.

Fixes: 1852ebd13542 ("of: irq: make a stub for of_irq_parse_one()")
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210210200050.4106032-1-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/of_irq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index f898d838d201..aaf219bd0354 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -60,7 +60,7 @@ u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in);
 static inline int of_irq_parse_one(struct device_node *device, int index,
 				   struct of_phandle_args *out_irq)
 {
-	return 0;
+	return -EINVAL;
 }
 static inline int of_irq_count(struct device_node *dev)
 {
-- 
cgit v1.2.3


From 21a4e356d3588806307555c149b80cec3dedb180 Mon Sep 17 00:00:00 2001
From: "Andrea Parri (Microsoft)" <parri.andrea@gmail.com>
Date: Mon, 1 Feb 2021 15:48:12 +0100
Subject: Drivers: hv: vmbus: Restrict vmbus_devices on isolated guests

Only the VSCs or ICs that have been hardened and that are critical for
the successful adoption of Confidential VMs should be allowed if the
guest is running isolated.  This change reduces the footprint of the
code that will be exercised by Confidential VMs and hence the exposure
to bugs and vulnerabilities.

Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20210201144814.2701-3-parri.andrea@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/channel_mgmt.c | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/hyperv.h    |  1 +
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 68950a1e4b63..f0ed730e2e4e 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -31,101 +31,118 @@ const struct vmbus_device vmbus_devs[] = {
 	{ .dev_type = HV_IDE,
 	  HV_IDE_GUID,
 	  .perf_device = true,
+	  .allowed_in_isolated = false,
 	},
 
 	/* SCSI */
 	{ .dev_type = HV_SCSI,
 	  HV_SCSI_GUID,
 	  .perf_device = true,
+	  .allowed_in_isolated = true,
 	},
 
 	/* Fibre Channel */
 	{ .dev_type = HV_FC,
 	  HV_SYNTHFC_GUID,
 	  .perf_device = true,
+	  .allowed_in_isolated = false,
 	},
 
 	/* Synthetic NIC */
 	{ .dev_type = HV_NIC,
 	  HV_NIC_GUID,
 	  .perf_device = true,
+	  .allowed_in_isolated = true,
 	},
 
 	/* Network Direct */
 	{ .dev_type = HV_ND,
 	  HV_ND_GUID,
 	  .perf_device = true,
+	  .allowed_in_isolated = false,
 	},
 
 	/* PCIE */
 	{ .dev_type = HV_PCIE,
 	  HV_PCIE_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = false,
 	},
 
 	/* Synthetic Frame Buffer */
 	{ .dev_type = HV_FB,
 	  HV_SYNTHVID_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = false,
 	},
 
 	/* Synthetic Keyboard */
 	{ .dev_type = HV_KBD,
 	  HV_KBD_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = false,
 	},
 
 	/* Synthetic MOUSE */
 	{ .dev_type = HV_MOUSE,
 	  HV_MOUSE_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = false,
 	},
 
 	/* KVP */
 	{ .dev_type = HV_KVP,
 	  HV_KVP_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = false,
 	},
 
 	/* Time Synch */
 	{ .dev_type = HV_TS,
 	  HV_TS_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = true,
 	},
 
 	/* Heartbeat */
 	{ .dev_type = HV_HB,
 	  HV_HEART_BEAT_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = true,
 	},
 
 	/* Shutdown */
 	{ .dev_type = HV_SHUTDOWN,
 	  HV_SHUTDOWN_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = true,
 	},
 
 	/* File copy */
 	{ .dev_type = HV_FCOPY,
 	  HV_FCOPY_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = false,
 	},
 
 	/* Backup */
 	{ .dev_type = HV_BACKUP,
 	  HV_VSS_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = false,
 	},
 
 	/* Dynamic Memory */
 	{ .dev_type = HV_DM,
 	  HV_DM_GUID,
 	  .perf_device = false,
+	  .allowed_in_isolated = false,
 	},
 
 	/* Unknown GUID */
 	{ .dev_type = HV_UNKNOWN,
 	  .perf_device = false,
+	  .allowed_in_isolated = false,
 	},
 };
 
@@ -903,6 +920,20 @@ find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer)
 	return channel;
 }
 
+static bool vmbus_is_valid_device(const guid_t *guid)
+{
+	u16 i;
+
+	if (!hv_is_isolation_supported())
+		return true;
+
+	for (i = 0; i < ARRAY_SIZE(vmbus_devs); i++) {
+		if (guid_equal(guid, &vmbus_devs[i].guid))
+			return vmbus_devs[i].allowed_in_isolated;
+	}
+	return false;
+}
+
 /*
  * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
  *
@@ -917,6 +948,13 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
 
 	trace_vmbus_onoffer(offer);
 
+	if (!vmbus_is_valid_device(&offer->offer.if_type)) {
+		pr_err_ratelimited("Invalid offer %d from the host supporting isolation\n",
+				   offer->child_relid);
+		atomic_dec(&vmbus_connection.offer_in_progress);
+		return;
+	}
+
 	oldchannel = find_primary_channel_by_offer(offer);
 
 	if (oldchannel != NULL) {
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index f0d48a368f13..e3426f8c12db 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -789,6 +789,7 @@ struct vmbus_device {
 	u16  dev_type;
 	guid_t guid;
 	bool perf_device;
+	bool allowed_in_isolated;
 };
 
 #define VMBUS_DEFAULT_MAX_PKT_SIZE 4096
-- 
cgit v1.2.3


From 78785010d428f7755bf51d1c08cb2566a73dc7f5 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Mon, 1 Feb 2021 11:43:34 -0600
Subject: hv: hyperv.h: Replace one-element array with flexible-array in struct
 icmsg_negotiate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare having
a dynamically sized set of trailing elements in a structure. Kernel code
should always use “flexible array members”[1] for these cases. The older
style of one-element or zero-length arrays should no longer be used[2].

Refactor the code according to the use of a flexible-array member in
struct icmsg_negotiate, instead of a one-element array.

Also, this helps the ongoing efforts to enable -Warray-bounds and fix the
following warnings:

drivers/hv/channel_mgmt.c:315:23: warning: array subscript 1 is above array bounds of ‘struct ic_version[1]’ [-Warray-bounds]
drivers/hv/channel_mgmt.c:316:23: warning: array subscript 1 is above array bounds of ‘struct ic_version[1]’ [-Warray-bounds]

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/79
Link: https://github.com/KSPP/linux/issues/109
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20210201174334.GA171933@embeddedor
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/linux/hyperv.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index e3426f8c12db..9dd22af1b7f6 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1529,14 +1529,14 @@ struct icmsg_hdr {
 #define IC_VERSION_NEGOTIATION_MAX_VER_COUNT 100
 #define ICMSG_HDR (sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr))
 #define ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt) \
-	(ICMSG_HDR + offsetof(struct icmsg_negotiate, icversion_data) + \
+	(ICMSG_HDR + sizeof(struct icmsg_negotiate) + \
 	 (((icframe_vercnt) + (icmsg_vercnt)) * sizeof(struct ic_version)))
 
 struct icmsg_negotiate {
 	u16 icframe_vercnt;
 	u16 icmsg_vercnt;
 	u32 reserved;
-	struct ic_version icversion_data[1]; /* any size array */
+	struct ic_version icversion_data[]; /* any size array */
 } __packed;
 
 struct shutdown_msg_data {
-- 
cgit v1.2.3


From f11e2bc682cc197e33bfd118178cadb61326dc0e Mon Sep 17 00:00:00 2001
From: Daniel Thompson <daniel.thompson@linaro.org>
Date: Wed, 10 Feb 2021 14:25:25 +0000
Subject: kgdb: Remove kgdb_schedule_breakpoint()

To the very best of my knowledge there has never been any in-tree
code that calls this function. It exists largely to support an
out-of-tree driver that provides kgdb-over-ethernet using the
netpoll API.

kgdboe has been out-of-tree for more than 10 years and I don't
recall any serious attempt to upstream it at any point in the last
five. At this stage it looks better to stop carrying this code in
the kernel and integrate the code into the out-of-tree driver
instead.

The long term trajectory for the kernel looks likely to include
effort to remove or reduce the use of tasklets (something that has
also been true for the last 10 years). Thus the main real reason
for this patch is to make explicit that the in-tree kgdb features
do not require tasklets.

Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
Link: https://lore.kernel.org/r/20210210142525.2876648-1-daniel.thompson@linaro.org
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Jason Wessel <jason.wessel@windriver.com>
---
 include/linux/kgdb.h      |  1 -
 kernel/debug/debug_core.c | 26 --------------------------
 2 files changed, 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 0d6cf64c8bb1..0444b44bd156 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -325,7 +325,6 @@ extern char *kgdb_mem2hex(char *mem, char *buf, int count);
 extern int kgdb_hex2mem(char *buf, char *mem, int count);
 
 extern int kgdb_isremovedbreak(unsigned long addr);
-extern void kgdb_schedule_breakpoint(void);
 extern int kgdb_has_hit_break(unsigned long addr);
 
 extern int
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 7f22c1c0ffe8..b636d517c02c 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -119,7 +119,6 @@ static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
  */
 static atomic_t			masters_in_kgdb;
 static atomic_t			slaves_in_kgdb;
-static atomic_t			kgdb_break_tasklet_var;
 atomic_t			kgdb_setting_breakpoint;
 
 struct task_struct		*kgdb_usethread;
@@ -1084,31 +1083,6 @@ static void kgdb_unregister_callbacks(void)
 	}
 }
 
-/*
- * There are times a tasklet needs to be used vs a compiled in
- * break point so as to cause an exception outside a kgdb I/O module,
- * such as is the case with kgdboe, where calling a breakpoint in the
- * I/O driver itself would be fatal.
- */
-static void kgdb_tasklet_bpt(unsigned long ing)
-{
-	kgdb_breakpoint();
-	atomic_set(&kgdb_break_tasklet_var, 0);
-}
-
-static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt);
-
-void kgdb_schedule_breakpoint(void)
-{
-	if (atomic_read(&kgdb_break_tasklet_var) ||
-		atomic_read(&kgdb_active) != -1 ||
-		atomic_read(&kgdb_setting_breakpoint))
-		return;
-	atomic_inc(&kgdb_break_tasklet_var);
-	tasklet_schedule(&kgdb_tasklet_breakpoint);
-}
-EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
-
 /**
  *	kgdb_register_io_module - register KGDB IO module
  *	@new_dbg_io_ops: the io ops vector
-- 
cgit v1.2.3


From f7684f5a048febd2a7bc98ee81d6dce52f7268b8 Mon Sep 17 00:00:00 2001
From: Frieder Schrempf <frieder.schrempf@kontron.de>
Date: Thu, 11 Feb 2021 11:55:30 +0100
Subject: regulator: pca9450: Enable system reset on WDOG_B assertion

By default the PCA9450 doesn't handle the assertion of the WDOG_B
signal, but this is required to guarantee that things like software
resets triggered by the watchdog work reliably.

As we don't want to rely on the bootloader to enable this, we tell
the PMIC to issue a cold reset in case the WDOG_B signal is
asserted (WDOG_B_CFG = 10), just as the NXP U-Boot code does.

Signed-off-by: Frieder Schrempf <frieder.schrempf@kontron.de>
Link: https://lore.kernel.org/r/20210211105534.38972-3-frieder.schrempf@kontron.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/pca9450-regulator.c | 8 ++++++++
 include/linux/regulator/pca9450.h     | 7 +++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/pca9450-regulator.c b/drivers/regulator/pca9450-regulator.c
index 1bba8fdcb7b7..833d398c6aa2 100644
--- a/drivers/regulator/pca9450-regulator.c
+++ b/drivers/regulator/pca9450-regulator.c
@@ -797,6 +797,14 @@ static int pca9450_i2c_probe(struct i2c_client *i2c,
 		return ret;
 	}
 
+	/* Set reset behavior on assertion of WDOG_B signal */
+	ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_RESET_CTRL,
+				WDOG_B_CFG_MASK, WDOG_B_CFG_COLD_LDO12);
+	if (ret) {
+		dev_err(&i2c->dev, "Failed to set WDOG_B reset behavior\n");
+		return ret;
+	}
+
 	/*
 	 * The driver uses the LDO5CTRL_H register to control the LDO5 regulator.
 	 * This is only valid if the SD_VSEL input of the PMIC is high. Let's
diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h
index 1bbd3014f906..ccdb5320a240 100644
--- a/include/linux/regulator/pca9450.h
+++ b/include/linux/regulator/pca9450.h
@@ -216,4 +216,11 @@ enum {
 #define IRQ_THERM_105			0x02
 #define IRQ_THERM_125			0x01
 
+/* PCA9450_REG_RESET_CTRL bits */
+#define WDOG_B_CFG_MASK			0xC0
+#define WDOG_B_CFG_NONE			0x00
+#define WDOG_B_CFG_WARM			0x40
+#define WDOG_B_CFG_COLD_LDO12		0x80
+#define WDOG_B_CFG_COLD			0xC0
+
 #endif /* __LINUX_REG_PCA9450_H__ */
-- 
cgit v1.2.3


From 7bdcc48f4e80b01fd6057dfd382236a5b8123b61 Mon Sep 17 00:00:00 2001
From: Satya Tangirala <satyat@google.com>
Date: Mon, 1 Feb 2021 05:10:15 +0000
Subject: block/keyslot-manager: Introduce passthrough keyslot manager

The device mapper may map over devices that have inline encryption
capabilities, and to make use of those capabilities, the DM device must
itself advertise those inline encryption capabilities. One way to do this
would be to have the DM device set up a keyslot manager with a
"sufficiently large" number of keyslots, but that would use a lot of
memory. Also, the DM device itself has no "keyslots", and it doesn't make
much sense to talk about "programming a key into a DM device's keyslot
manager", so all that extra memory used to represent those keyslots is just
wasted. All a DM device really needs to be able to do is advertise the
crypto capabilities of the underlying devices in a coherent manner and
expose a way to evict keys from the underlying devices.

There are also devices with inline encryption hardware that do not
have a limited number of keyslots. One can send a raw encryption key along
with a bio to these devices (as opposed to typical inline encryption
hardware that require users to first program a raw encryption key into a
keyslot, and send the index of that keyslot along with the bio). These
devices also only need the same things from the keyslot manager that DM
devices need - a way to advertise crypto capabilities and potentially a way
to expose a function to evict keys from hardware.

So we introduce a "passthrough" keyslot manager that provides a way to
represent a keyslot manager that doesn't have just a limited number of
keyslots, and for which do not require keys to be programmed into keyslots.
DM devices can set up a passthrough keyslot manager in their request
queues, and advertise appropriate crypto capabilities based on those of the
underlying devices. Blk-crypto does not attempt to program keys into any
keyslots in the passthrough keyslot manager. Instead, if/when the bio is
resubmitted to the underlying device, blk-crypto will try to program the
key into the underlying device's keyslot manager.

Signed-off-by: Satya Tangirala <satyat@google.com>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 block/keyslot-manager.c         | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/keyslot-manager.h |  2 ++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c
index 86f8195d8039..ac7ce83a76e8 100644
--- a/block/keyslot-manager.c
+++ b/block/keyslot-manager.c
@@ -62,6 +62,11 @@ static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm)
 		pm_runtime_put_sync(ksm->dev);
 }
 
+static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm)
+{
+	return ksm->num_slots == 0;
+}
+
 /**
  * blk_ksm_init() - Initialize a keyslot manager
  * @ksm: The keyslot_manager to initialize.
@@ -205,6 +210,10 @@ blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
 	int err;
 
 	*slot_ptr = NULL;
+
+	if (blk_ksm_is_passthrough(ksm))
+		return BLK_STS_OK;
+
 	down_read(&ksm->lock);
 	slot = blk_ksm_find_and_grab_keyslot(ksm, key);
 	up_read(&ksm->lock);
@@ -325,6 +334,16 @@ int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
 	struct blk_ksm_keyslot *slot;
 	int err = 0;
 
+	if (blk_ksm_is_passthrough(ksm)) {
+		if (ksm->ksm_ll_ops.keyslot_evict) {
+			blk_ksm_hw_enter(ksm);
+			err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1);
+			blk_ksm_hw_exit(ksm);
+			return err;
+		}
+		return 0;
+	}
+
 	blk_ksm_hw_enter(ksm);
 	slot = blk_ksm_find_keyslot(ksm, key);
 	if (!slot)
@@ -360,6 +379,9 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm)
 {
 	unsigned int slot;
 
+	if (blk_ksm_is_passthrough(ksm))
+		return;
+
 	/* This is for device initialization, so don't resume the device */
 	down_write(&ksm->lock);
 	for (slot = 0; slot < ksm->num_slots; slot++) {
@@ -401,3 +423,20 @@ void blk_ksm_unregister(struct request_queue *q)
 {
 	q->ksm = NULL;
 }
+
+/**
+ * blk_ksm_init_passthrough() - Init a passthrough keyslot manager
+ * @ksm: The keyslot manager to init
+ *
+ * Initialize a passthrough keyslot manager.
+ * Called by e.g. storage drivers to set up a keyslot manager in their
+ * request_queue, when the storage driver wants to manage its keys by itself.
+ * This is useful for inline encryption hardware that doesn't have the concept
+ * of keyslots, and for layered devices.
+ */
+void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm)
+{
+	memset(ksm, 0, sizeof(*ksm));
+	init_rwsem(&ksm->lock);
+}
+EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough);
diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h
index 18f3f5346843..323e15dd6fa7 100644
--- a/include/linux/keyslot-manager.h
+++ b/include/linux/keyslot-manager.h
@@ -103,4 +103,6 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm);
 
 void blk_ksm_destroy(struct blk_keyslot_manager *ksm);
 
+void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm);
+
 #endif /* __LINUX_KEYSLOT_MANAGER_H */
-- 
cgit v1.2.3


From d3b17a243790a34bd63fcef3fde63e29e2744938 Mon Sep 17 00:00:00 2001
From: Satya Tangirala <satyat@google.com>
Date: Mon, 1 Feb 2021 05:10:16 +0000
Subject: block/keyslot-manager: Introduce functions for device mapper support

Introduce blk_ksm_update_capabilities() to update the capabilities of
a keyslot manager (ksm) in-place. The pointer to a ksm in a device's
request queue may not be easily replaced, because upper layers like
the filesystem might access it (e.g. for programming keys/checking
capabilities) at the same time the device wants to replace that
request queue's ksm (and free the old ksm's memory). This function
allows the device to update the capabilities of the ksm in its request
queue directly. Devices can safely update the ksm this way without any
synchronization with upper layers *only* if the updated (new) ksm
continues to support all the crypto capabilities that the old ksm did
(see description below for blk_ksm_is_superset() for why this is so).

Also introduce blk_ksm_is_superset() which checks whether one ksm's
capabilities are a (not necessarily strict) superset of another ksm's.
The blk-crypto framework requires that crypto capabilities that were
advertised when a bio was created continue to be supported by the
device until that bio is ended - in practice this probably means that
a device's advertised crypto capabilities can *never* "shrink" (since
there's no synchronization between bio creation and when a device may
want to change its advertised capabilities) - so a previously
advertised crypto capability must always continue to be supported.
This function can be used to check that a new ksm is a valid
replacement for an old ksm.

Signed-off-by: Satya Tangirala <satyat@google.com>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 block/keyslot-manager.c         | 107 ++++++++++++++++++++++++++++++++++++++++
 include/linux/keyslot-manager.h |   9 ++++
 2 files changed, 116 insertions(+)

(limited to 'include/linux')

diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c
index ac7ce83a76e8..9f9494b80148 100644
--- a/block/keyslot-manager.c
+++ b/block/keyslot-manager.c
@@ -424,6 +424,113 @@ void blk_ksm_unregister(struct request_queue *q)
 	q->ksm = NULL;
 }
 
+/**
+ * blk_ksm_intersect_modes() - restrict supported modes by child device
+ * @parent: The keyslot manager for parent device
+ * @child: The keyslot manager for child device, or NULL
+ *
+ * Clear any crypto mode support bits in @parent that aren't set in @child.
+ * If @child is NULL, then all parent bits are cleared.
+ *
+ * Only use this when setting up the keyslot manager for a layered device,
+ * before it's been exposed yet.
+ */
+void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
+			     const struct blk_keyslot_manager *child)
+{
+	if (child) {
+		unsigned int i;
+
+		parent->max_dun_bytes_supported =
+			min(parent->max_dun_bytes_supported,
+			    child->max_dun_bytes_supported);
+		for (i = 0; i < ARRAY_SIZE(child->crypto_modes_supported);
+		     i++) {
+			parent->crypto_modes_supported[i] &=
+				child->crypto_modes_supported[i];
+		}
+	} else {
+		parent->max_dun_bytes_supported = 0;
+		memset(parent->crypto_modes_supported, 0,
+		       sizeof(parent->crypto_modes_supported));
+	}
+}
+EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes);
+
+/**
+ * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes
+ *			   and DUN bytes that another KSM supports. Here,
+ *			   "superset" refers to the mathematical meaning of the
+ *			   word - i.e. if two KSMs have the *same* capabilities,
+ *			   they *are* considered supersets of each other.
+ * @ksm_superset: The KSM that we want to verify is a superset
+ * @ksm_subset: The KSM that we want to verify is a subset
+ *
+ * Return: True if @ksm_superset supports a superset of the crypto modes and DUN
+ *	   bytes that @ksm_subset supports.
+ */
+bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
+			 struct blk_keyslot_manager *ksm_subset)
+{
+	int i;
+
+	if (!ksm_subset)
+		return true;
+
+	if (!ksm_superset)
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(ksm_superset->crypto_modes_supported); i++) {
+		if (ksm_subset->crypto_modes_supported[i] &
+		    (~ksm_superset->crypto_modes_supported[i])) {
+			return false;
+		}
+	}
+
+	if (ksm_subset->max_dun_bytes_supported >
+	    ksm_superset->max_dun_bytes_supported) {
+		return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(blk_ksm_is_superset);
+
+/**
+ * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of
+ *				   another KSM
+ * @target_ksm: The KSM whose restrictions to update.
+ * @reference_ksm: The KSM to whose restrictions this function will update
+ *		   @target_ksm's restrictions to.
+ *
+ * Blk-crypto requires that crypto capabilities that were
+ * advertised when a bio was created continue to be supported by the
+ * device until that bio is ended. This is turn means that a device cannot
+ * shrink its advertised crypto capabilities without any explicit
+ * synchronization with upper layers. So if there's no such explicit
+ * synchronization, @reference_ksm must support all the crypto capabilities that
+ * @target_ksm does
+ * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true).
+ *
+ * Note also that as long as the crypto capabilities are being expanded, the
+ * order of updates becoming visible is not important because it's alright
+ * for blk-crypto to see stale values - they only cause blk-crypto to
+ * believe that a crypto capability isn't supported when it actually is (which
+ * might result in blk-crypto-fallback being used if available, or the bio being
+ * failed).
+ */
+void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
+				 struct blk_keyslot_manager *reference_ksm)
+{
+	memcpy(target_ksm->crypto_modes_supported,
+	       reference_ksm->crypto_modes_supported,
+	       sizeof(target_ksm->crypto_modes_supported));
+
+	target_ksm->max_dun_bytes_supported =
+				reference_ksm->max_dun_bytes_supported;
+}
+EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities);
+
 /**
  * blk_ksm_init_passthrough() - Init a passthrough keyslot manager
  * @ksm: The keyslot manager to init
diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h
index 323e15dd6fa7..164568f52be7 100644
--- a/include/linux/keyslot-manager.h
+++ b/include/linux/keyslot-manager.h
@@ -103,6 +103,15 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm);
 
 void blk_ksm_destroy(struct blk_keyslot_manager *ksm);
 
+void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
+			     const struct blk_keyslot_manager *child);
+
 void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm);
 
+bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
+			 struct blk_keyslot_manager *ksm_subset);
+
+void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
+				 struct blk_keyslot_manager *reference_ksm);
+
 #endif /* __LINUX_KEYSLOT_MANAGER_H */
-- 
cgit v1.2.3


From aa6ce87a768226802f9a231b3909fe81c503852c Mon Sep 17 00:00:00 2001
From: Satya Tangirala <satyat@google.com>
Date: Mon, 1 Feb 2021 05:10:17 +0000
Subject: dm: add support for passing through inline crypto support

Update the device-mapper core to support exposing the inline crypto
support of the underlying device(s) through the device-mapper device.

This works by creating a "passthrough keyslot manager" for the dm
device, which declares support for encryption settings which all
underlying devices support.  When a supported setting is used, the bio
cloning code handles cloning the crypto context to the bios for all the
underlying devices.  When an unsupported setting is used, the blk-crypto
fallback is used as usual.

Crypto support on each underlying device is ignored unless the
corresponding dm target opts into exposing it.  This is needed because
for inline crypto to semantically operate on the original bio, the data
must not be transformed by the dm target.  Thus, targets like dm-linear
can expose crypto support of the underlying device, but targets like
dm-crypt can't.  (dm-crypt could use inline crypto itself, though.)

A DM device's table can only be changed if the "new" inline encryption
capabilities are a (*not* necessarily strict) superset of the "old" inline
encryption capabilities.  Attempts to make changes to the table that result
in some inline encryption capability becoming no longer supported will be
rejected.

For the sake of clarity, key eviction from underlying devices will be
handled in a future patch.

Co-developed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Satya Tangirala <satyat@google.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-core.h          |   5 ++
 drivers/md/dm-table.c         | 162 ++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm.c               |  18 ++++-
 include/linux/device-mapper.h |  11 +++
 include/uapi/linux/dm-ioctl.h |   4 +-
 5 files changed, 197 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 086d293c2b03..bf3e66f39a4a 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -13,6 +13,7 @@
 #include <linux/ktime.h>
 #include <linux/genhd.h>
 #include <linux/blk-mq.h>
+#include <linux/keyslot-manager.h>
 
 #include <trace/events/block.h>
 
@@ -162,6 +163,10 @@ struct dm_table {
 	void *event_context;
 
 	struct dm_md_mempools *mempools;
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+	struct blk_keyslot_manager *ksm;
+#endif
 };
 
 static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2bc256d61550..8c4ed3ec9409 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -187,6 +187,8 @@ static void free_devices(struct list_head *devices, struct mapped_device *md)
 	}
 }
 
+static void dm_table_destroy_keyslot_manager(struct dm_table *t);
+
 void dm_table_destroy(struct dm_table *t)
 {
 	unsigned int i;
@@ -215,6 +217,8 @@ void dm_table_destroy(struct dm_table *t)
 
 	dm_free_md_mempools(t->mempools);
 
+	dm_table_destroy_keyslot_manager(t);
+
 	kfree(t);
 }
 
@@ -1203,6 +1207,157 @@ static int dm_table_register_integrity(struct dm_table *t)
 	return 0;
 }
 
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+
+struct dm_keyslot_manager {
+	struct blk_keyslot_manager ksm;
+	struct mapped_device *md;
+};
+
+static int device_intersect_crypto_modes(struct dm_target *ti,
+					 struct dm_dev *dev, sector_t start,
+					 sector_t len, void *data)
+{
+	struct blk_keyslot_manager *parent = data;
+	struct blk_keyslot_manager *child = bdev_get_queue(dev->bdev)->ksm;
+
+	blk_ksm_intersect_modes(parent, child);
+	return 0;
+}
+
+void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm)
+{
+	struct dm_keyslot_manager *dksm = container_of(ksm,
+						       struct dm_keyslot_manager,
+						       ksm);
+
+	if (!ksm)
+		return;
+
+	blk_ksm_destroy(ksm);
+	kfree(dksm);
+}
+
+static void dm_table_destroy_keyslot_manager(struct dm_table *t)
+{
+	dm_destroy_keyslot_manager(t->ksm);
+	t->ksm = NULL;
+}
+
+/*
+ * Constructs and initializes t->ksm with a keyslot manager that
+ * represents the common set of crypto capabilities of the devices
+ * described by the dm_table. However, if the constructed keyslot
+ * manager does not support a superset of the crypto capabilities
+ * supported by the current keyslot manager of the mapped_device,
+ * it returns an error instead, since we don't support restricting
+ * crypto capabilities on table changes. Finally, if the constructed
+ * keyslot manager doesn't actually support any crypto modes at all,
+ * it just returns NULL.
+ */
+static int dm_table_construct_keyslot_manager(struct dm_table *t)
+{
+	struct dm_keyslot_manager *dksm;
+	struct blk_keyslot_manager *ksm;
+	struct dm_target *ti;
+	unsigned int i;
+	bool ksm_is_empty = true;
+
+	dksm = kmalloc(sizeof(*dksm), GFP_KERNEL);
+	if (!dksm)
+		return -ENOMEM;
+	dksm->md = t->md;
+
+	ksm = &dksm->ksm;
+	blk_ksm_init_passthrough(ksm);
+	ksm->max_dun_bytes_supported = UINT_MAX;
+	memset(ksm->crypto_modes_supported, 0xFF,
+	       sizeof(ksm->crypto_modes_supported));
+
+	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+		ti = dm_table_get_target(t, i);
+
+		if (!dm_target_passes_crypto(ti->type)) {
+			blk_ksm_intersect_modes(ksm, NULL);
+			break;
+		}
+		if (!ti->type->iterate_devices)
+			continue;
+		ti->type->iterate_devices(ti, device_intersect_crypto_modes,
+					  ksm);
+	}
+
+	if (t->md->queue && !blk_ksm_is_superset(ksm, t->md->queue->ksm)) {
+		DMWARN("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!");
+		dm_destroy_keyslot_manager(ksm);
+		return -EINVAL;
+	}
+
+	/*
+	 * If the new KSM doesn't actually support any crypto modes, we may as
+	 * well represent it with a NULL ksm.
+	 */
+	ksm_is_empty = true;
+	for (i = 0; i < ARRAY_SIZE(ksm->crypto_modes_supported); i++) {
+		if (ksm->crypto_modes_supported[i]) {
+			ksm_is_empty = false;
+			break;
+		}
+	}
+
+	if (ksm_is_empty) {
+		dm_destroy_keyslot_manager(ksm);
+		ksm = NULL;
+	}
+
+	/*
+	 * t->ksm is only set temporarily while the table is being set
+	 * up, and it gets set to NULL after the capabilities have
+	 * been transferred to the request_queue.
+	 */
+	t->ksm = ksm;
+
+	return 0;
+}
+
+static void dm_update_keyslot_manager(struct request_queue *q,
+				      struct dm_table *t)
+{
+	if (!t->ksm)
+		return;
+
+	/* Make the ksm less restrictive */
+	if (!q->ksm) {
+		blk_ksm_register(t->ksm, q);
+	} else {
+		blk_ksm_update_capabilities(q->ksm, t->ksm);
+		dm_destroy_keyslot_manager(t->ksm);
+	}
+	t->ksm = NULL;
+}
+
+#else /* CONFIG_BLK_INLINE_ENCRYPTION */
+
+static int dm_table_construct_keyslot_manager(struct dm_table *t)
+{
+	return 0;
+}
+
+void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm)
+{
+}
+
+static void dm_table_destroy_keyslot_manager(struct dm_table *t)
+{
+}
+
+static void dm_update_keyslot_manager(struct request_queue *q,
+				      struct dm_table *t)
+{
+}
+
+#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
+
 /*
  * Prepares the table for use by building the indices,
  * setting the type, and allocating mempools.
@@ -1229,6 +1384,12 @@ int dm_table_complete(struct dm_table *t)
 		return r;
 	}
 
+	r = dm_table_construct_keyslot_manager(t);
+	if (r) {
+		DMERR("could not construct keyslot manager.");
+		return r;
+	}
+
 	r = dm_table_alloc_md_mempools(t, t->md);
 	if (r)
 		DMERR("unable to allocate mempools");
@@ -1863,6 +2024,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	}
 #endif
 
+	dm_update_keyslot_manager(q, t);
 	blk_queue_update_readahead(q);
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 64fda1429e39..7021aea82aa4 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -28,6 +28,7 @@
 #include <linux/refcount.h>
 #include <linux/part_stat.h>
 #include <linux/blk-crypto.h>
+#include <linux/keyslot-manager.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -1722,6 +1723,19 @@ static const struct dax_operations dm_dax_ops;
 
 static void dm_wq_work(struct work_struct *work);
 
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+static void dm_queue_destroy_keyslot_manager(struct request_queue *q)
+{
+	dm_destroy_keyslot_manager(q->ksm);
+}
+
+#else /* CONFIG_BLK_INLINE_ENCRYPTION */
+
+static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q)
+{
+}
+#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
+
 static void cleanup_mapped_device(struct mapped_device *md)
 {
 	if (md->wq)
@@ -1743,8 +1757,10 @@ static void cleanup_mapped_device(struct mapped_device *md)
 		put_disk(md->disk);
 	}
 
-	if (md->queue)
+	if (md->queue) {
+		dm_queue_destroy_keyslot_manager(md->queue);
 		blk_cleanup_queue(md->queue);
+	}
 
 	cleanup_srcu_struct(&md->io_barrier);
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 61a66fb8ebb3..47588130ef5e 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -257,6 +257,12 @@ struct target_type {
 #define DM_TARGET_NOWAIT		0x00000080
 #define dm_target_supports_nowait(type) ((type)->features & DM_TARGET_NOWAIT)
 
+/*
+ * A target supports passing through inline crypto support.
+ */
+#define DM_TARGET_PASSES_CRYPTO		0x00000100
+#define dm_target_passes_crypto(type) ((type)->features & DM_TARGET_PASSES_CRYPTO)
+
 struct dm_target {
 	struct dm_table *table;
 	struct target_type *type;
@@ -533,6 +539,11 @@ void dm_table_run_md_queue_async(struct dm_table *t);
 struct dm_table *dm_swap_table(struct mapped_device *md,
 			       struct dm_table *t);
 
+/*
+ * Table keyslot manager functions
+ */
+void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm);
+
 /*
  * A wrapper around vmalloc.
  */
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 4933b6b67b85..fcff6669137b 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -272,9 +272,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	43
+#define DM_VERSION_MINOR	44
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2020-10-01)"
+#define DM_VERSION_EXTRA	"-ioctl (2021-02-01)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3


From e3290b9491ff5b7ee40f9e0a4c06821988a2a2bf Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 10 Feb 2021 17:38:30 -0500
Subject: dm: simplify target code conditional on CONFIG_BLK_DEV_ZONED

Allow removal of CONFIG_BLK_DEV_ZONED conditionals in target_type
definition of various targets.

Suggested-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c         |  6 ++----
 drivers/md/dm-flakey.c        |  6 ++----
 drivers/md/dm-linear.c        |  7 ++-----
 include/linux/device-mapper.h | 16 ++++++++++++++--
 4 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ee20b586f526..ae0f0a4e3689 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3134,7 +3134,6 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
-
 static int crypt_report_zones(struct dm_target *ti,
 		struct dm_report_zones_args *args, unsigned int nr_zones)
 {
@@ -3145,7 +3144,8 @@ static int crypt_report_zones(struct dm_target *ti,
 	return blkdev_report_zones(cc->dev->bdev, sector, nr_zones,
 				   dm_report_zones_cb, args);
 }
-
+#else
+#define crypt_report_zones NULL
 #endif
 
 /*
@@ -3580,10 +3580,8 @@ static struct target_type crypt_target = {
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
-#ifdef CONFIG_BLK_DEV_ZONED
 	.features = DM_TARGET_ZONED_HM,
 	.report_zones = crypt_report_zones,
-#endif
 	.map    = crypt_map,
 	.status = crypt_status,
 	.postsuspend = crypt_postsuspend,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 30c6bc151213..b7fee9936f05 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -469,6 +469,8 @@ static int flakey_report_zones(struct dm_target *ti,
 	return blkdev_report_zones(fc->dev->bdev, sector, nr_zones,
 				   dm_report_zones_cb, args);
 }
+#else
+#define flakey_report_zones NULL
 #endif
 
 static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
@@ -481,12 +483,8 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
 static struct target_type flakey_target = {
 	.name   = "flakey",
 	.version = {1, 5, 0},
-#ifdef CONFIG_BLK_DEV_ZONED
 	.features = DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO,
 	.report_zones = flakey_report_zones,
-#else
-	.features = DM_TARGET_PASSES_CRYPTO,
-#endif
 	.module = THIS_MODULE,
 	.ctr    = flakey_ctr,
 	.dtr    = flakey_dtr,
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index fc9c4272c10d..92db0f5e7f28 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -146,6 +146,8 @@ static int linear_report_zones(struct dm_target *ti,
 	return blkdev_report_zones(lc->dev->bdev, sector, nr_zones,
 				   dm_report_zones_cb, args);
 }
+#else
+#define linear_report_zones NULL
 #endif
 
 static int linear_iterate_devices(struct dm_target *ti,
@@ -227,14 +229,9 @@ static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
 static struct target_type linear_target = {
 	.name   = "linear",
 	.version = {1, 4, 0},
-#ifdef CONFIG_BLK_DEV_ZONED
 	.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
 		    DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO,
 	.report_zones = linear_report_zones,
-#else
-	.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
-		    DM_TARGET_PASSES_CRYPTO,
-#endif
 	.module = THIS_MODULE,
 	.ctr    = linear_ctr,
 	.dtr    = linear_dtr,
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 47588130ef5e..c98d847b0f0b 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -93,9 +93,18 @@ typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv,
 
 typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev);
 
+#ifdef CONFIG_BLK_DEV_ZONED
 typedef int (*dm_report_zones_fn) (struct dm_target *ti,
 				   struct dm_report_zones_args *args,
 				   unsigned int nr_zones);
+#else
+/*
+ * Define dm_report_zones_fn so that targets can assign to NULL if
+ * CONFIG_BLK_DEV_ZONED disabled. Otherwise each target needs to do
+ * awkward #ifdefs in their target_type, etc.
+ */
+typedef int (*dm_report_zones_fn) (struct dm_target *dummy);
+#endif
 
 /*
  * These iteration functions are typically used to check (and combine)
@@ -187,9 +196,7 @@ struct target_type {
 	dm_status_fn status;
 	dm_message_fn message;
 	dm_prepare_ioctl_fn prepare_ioctl;
-#ifdef CONFIG_BLK_DEV_ZONED
 	dm_report_zones_fn report_zones;
-#endif
 	dm_busy_fn busy;
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
@@ -248,8 +255,13 @@ struct target_type {
 /*
  * Indicates that a target supports host-managed zoned block devices.
  */
+#ifdef CONFIG_BLK_DEV_ZONED
 #define DM_TARGET_ZONED_HM		0x00000040
 #define dm_target_supports_zoned_hm(type) ((type)->features & DM_TARGET_ZONED_HM)
+#else
+#define DM_TARGET_ZONED_HM		0x00000000
+#define dm_target_supports_zoned_hm(type) (false)
+#endif
 
 /*
  * A target handles REQ_NOWAIT
-- 
cgit v1.2.3


From a666e5c05e7c4aaabb2c5d58117b0946803d03d2 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 10 Feb 2021 15:26:23 -0500
Subject: dm: fix deadlock when swapping to encrypted device

The system would deadlock when swapping to a dm-crypt device. The reason
is that for each incoming write bio, dm-crypt allocates memory that holds
encrypted data. These excessive allocations exhaust all the memory and the
result is either deadlock or OOM trigger.

This patch limits the number of in-flight swap bios, so that the memory
consumed by dm-crypt is limited. The limit is enforced if the target set
the "limit_swap_bios" variable and if the bio has REQ_SWAP set.

Non-swap bios are not affected becuase taking the semaphore would cause
performance degradation.

This is similar to request-based drivers - they will also block when the
number of requests is over the limit.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-core.h          |  4 +++
 drivers/md/dm-crypt.c         |  1 +
 drivers/md/dm.c               | 60 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/device-mapper.h |  5 ++++
 4 files changed, 70 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index bf3e66f39a4a..5953ff2bd260 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -103,6 +103,10 @@ struct mapped_device {
 	/* kobject and completion */
 	struct dm_kobject_holder kobj_holder;
 
+	int swap_bios;
+	struct semaphore swap_bios_semaphore;
+	struct mutex swap_bios_lock;
+
 	struct dm_stats stats;
 
 	/* for blk-mq request-based DM support */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ae0f0a4e3689..11c105ecd165 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3342,6 +3342,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	wake_up_process(cc->write_thread);
 
 	ti->num_flush_bios = 1;
+	ti->limit_swap_bios = true;
 
 	return 0;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7021aea82aa4..50b693d776d6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -153,6 +153,16 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
 #define DM_NUMA_NODE NUMA_NO_NODE
 static int dm_numa_node = DM_NUMA_NODE;
 
+#define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
+static int swap_bios = DEFAULT_SWAP_BIOS;
+static int get_swap_bios(void)
+{
+	int latch = READ_ONCE(swap_bios);
+	if (unlikely(latch <= 0))
+		latch = DEFAULT_SWAP_BIOS;
+	return latch;
+}
+
 /*
  * For mempools pre-allocation at the table loading time.
  */
@@ -974,6 +984,11 @@ void disable_write_zeroes(struct mapped_device *md)
 	limits->max_write_zeroes_sectors = 0;
 }
 
+static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
+{
+	return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
+}
+
 static void clone_endio(struct bio *bio)
 {
 	blk_status_t error = bio->bi_status;
@@ -1025,6 +1040,11 @@ static void clone_endio(struct bio *bio)
 		}
 	}
 
+	if (unlikely(swap_bios_limit(tio->ti, bio))) {
+		struct mapped_device *md = io->md;
+		up(&md->swap_bios_semaphore);
+	}
+
 	free_tio(tio);
 	dec_pending(io, error);
 }
@@ -1258,6 +1278,22 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 }
 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
 
+static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
+{
+	mutex_lock(&md->swap_bios_lock);
+	while (latch < md->swap_bios) {
+		cond_resched();
+		down(&md->swap_bios_semaphore);
+		md->swap_bios--;
+	}
+	while (latch > md->swap_bios) {
+		cond_resched();
+		up(&md->swap_bios_semaphore);
+		md->swap_bios++;
+	}
+	mutex_unlock(&md->swap_bios_lock);
+}
+
 static blk_qc_t __map_bio(struct dm_target_io *tio)
 {
 	int r;
@@ -1277,6 +1313,14 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
 	atomic_inc(&io->io_count);
 	sector = clone->bi_iter.bi_sector;
 
+	if (unlikely(swap_bios_limit(ti, clone))) {
+		struct mapped_device *md = io->md;
+		int latch = get_swap_bios();
+		if (unlikely(latch != md->swap_bios))
+			__set_swap_bios_limit(md, latch);
+		down(&md->swap_bios_semaphore);
+	}
+
 	r = ti->type->map(ti, clone);
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
@@ -1287,10 +1331,18 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
 		ret = submit_bio_noacct(clone);
 		break;
 	case DM_MAPIO_KILL:
+		if (unlikely(swap_bios_limit(ti, clone))) {
+			struct mapped_device *md = io->md;
+			up(&md->swap_bios_semaphore);
+		}
 		free_tio(tio);
 		dec_pending(io, BLK_STS_IOERR);
 		break;
 	case DM_MAPIO_REQUEUE:
+		if (unlikely(swap_bios_limit(ti, clone))) {
+			struct mapped_device *md = io->md;
+			up(&md->swap_bios_semaphore);
+		}
 		free_tio(tio);
 		dec_pending(io, BLK_STS_DM_REQUEUE);
 		break;
@@ -1767,6 +1819,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
 	mutex_destroy(&md->suspend_lock);
 	mutex_destroy(&md->type_lock);
 	mutex_destroy(&md->table_devices_lock);
+	mutex_destroy(&md->swap_bios_lock);
 
 	dm_mq_cleanup_mapped_device(md);
 }
@@ -1834,6 +1887,10 @@ static struct mapped_device *alloc_dev(int minor)
 	init_waitqueue_head(&md->eventq);
 	init_completion(&md->kobj_holder.completion);
 
+	md->swap_bios = get_swap_bios();
+	sema_init(&md->swap_bios_semaphore, md->swap_bios);
+	mutex_init(&md->swap_bios_lock);
+
 	md->disk->major = _major;
 	md->disk->first_minor = minor;
 	md->disk->fops = &dm_blk_dops;
@@ -3117,6 +3174,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
 
+module_param(swap_bios, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index c98d847b0f0b..7f4ac87c0b32 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -343,6 +343,11 @@ struct dm_target {
 	 * whether or not its underlying devices have support.
 	 */
 	bool discards_supported:1;
+
+	/*
+	 * Set if we need to limit the number of in-flight bios when swapping.
+	 */
+	bool limit_swap_bios:1;
 };
 
 void *dm_per_bio_data(struct bio *bio, size_t data_size);
-- 
cgit v1.2.3


From 700d4796ef59f5faf240d307839bd419e2b6bdff Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 9 Feb 2021 19:36:26 -0800
Subject: bpf: Optimize program stats

Move bpf_prog_stats from prog->aux into prog to avoid one extra load
in critical path of program execution.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210210033634.62081-2-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h     |  8 --------
 include/linux/filter.h  | 14 +++++++++++---
 kernel/bpf/core.c       |  8 ++++----
 kernel/bpf/syscall.c    |  2 +-
 kernel/bpf/trampoline.c |  2 +-
 kernel/bpf/verifier.c   |  2 +-
 6 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 079162bbd387..5a388955e6b6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -14,7 +14,6 @@
 #include <linux/numa.h>
 #include <linux/mm_types.h>
 #include <linux/wait.h>
-#include <linux/u64_stats_sync.h>
 #include <linux/refcount.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
@@ -507,12 +506,6 @@ enum bpf_cgroup_storage_type {
  */
 #define MAX_BPF_FUNC_ARGS 12
 
-struct bpf_prog_stats {
-	u64 cnt;
-	u64 nsecs;
-	struct u64_stats_sync syncp;
-} __aligned(2 * sizeof(u64));
-
 struct btf_func_model {
 	u8 ret_size;
 	u8 nr_args;
@@ -845,7 +838,6 @@ struct bpf_prog_aux {
 	u32 linfo_idx;
 	u32 num_exentries;
 	struct exception_table_entry *extable;
-	struct bpf_prog_stats __percpu *stats;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5b3137d7b690..cecb03c9d251 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/sockptr.h>
 #include <crypto/sha1.h>
+#include <linux/u64_stats_sync.h>
 
 #include <net/sch_generic.h>
 
@@ -539,6 +540,12 @@ struct bpf_binary_header {
 	u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
 };
 
+struct bpf_prog_stats {
+	u64 cnt;
+	u64 nsecs;
+	struct u64_stats_sync syncp;
+} __aligned(2 * sizeof(u64));
+
 struct bpf_prog {
 	u16			pages;		/* Number of allocated pages */
 	u16			jited:1,	/* Is our filter JIT'ed? */
@@ -557,10 +564,11 @@ struct bpf_prog {
 	u32			len;		/* Number of filter blocks */
 	u32			jited_len;	/* Size of jited insns in bytes */
 	u8			tag[BPF_TAG_SIZE];
-	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
-	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
+	struct bpf_prog_stats __percpu *stats;
 	unsigned int		(*bpf_func)(const void *ctx,
 					    const struct bpf_insn *insn);
+	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
+	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
 	/* Instructions for interpreter */
 	struct sock_filter	insns[0];
 	struct bpf_insn		insnsi[];
@@ -581,7 +589,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 		struct bpf_prog_stats *__stats;				\
 		u64 __start = sched_clock();				\
 		__ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);	\
-		__stats = this_cpu_ptr(prog->aux->stats);		\
+		__stats = this_cpu_ptr(prog->stats);			\
 		u64_stats_update_begin(&__stats->syncp);		\
 		__stats->cnt++;						\
 		__stats->nsecs += sched_clock() - __start;		\
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5bbd4884ff7a..2cf71fd39c22 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -114,8 +114,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	if (!prog)
 		return NULL;
 
-	prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
-	if (!prog->aux->stats) {
+	prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+	if (!prog->stats) {
 		kfree(prog->aux);
 		vfree(prog);
 		return NULL;
@@ -124,7 +124,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	for_each_possible_cpu(cpu) {
 		struct bpf_prog_stats *pstats;
 
-		pstats = per_cpu_ptr(prog->aux->stats, cpu);
+		pstats = per_cpu_ptr(prog->stats, cpu);
 		u64_stats_init(&pstats->syncp);
 	}
 	return prog;
@@ -249,10 +249,10 @@ void __bpf_prog_free(struct bpf_prog *fp)
 	if (fp->aux) {
 		mutex_destroy(&fp->aux->used_maps_mutex);
 		mutex_destroy(&fp->aux->dst_mutex);
-		free_percpu(fp->aux->stats);
 		kfree(fp->aux->poke_tab);
 		kfree(fp->aux);
 	}
+	free_percpu(fp->stats);
 	vfree(fp);
 }
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e5999d86c76e..f7df56a704de 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1739,7 +1739,7 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog,
 		unsigned int start;
 		u64 tnsecs, tcnt;
 
-		st = per_cpu_ptr(prog->aux->stats, cpu);
+		st = per_cpu_ptr(prog->stats, cpu);
 		do {
 			start = u64_stats_fetch_begin_irq(&st->syncp);
 			tnsecs = st->nsecs;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 35c5887d82ff..5be3beeedd74 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -412,7 +412,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
 	     * Hence check that 'start' is not zero.
 	     */
 	    start) {
-		stats = this_cpu_ptr(prog->aux->stats);
+		stats = this_cpu_ptr(prog->stats);
 		u64_stats_update_begin(&stats->syncp);
 		stats->cnt++;
 		stats->nsecs += sched_clock() - start;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 400d79e99fc8..424c1ba0f52f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11253,7 +11253,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		/* BPF_PROG_RUN doesn't call subprogs directly,
 		 * hence main prog stats include the runtime of subprogs.
 		 * subprogs don't have IDs and not reachable via prog_get_next_id
-		 * func[i]->aux->stats will never be accessed and stays NULL
+		 * func[i]->stats will never be accessed and stays NULL
 		 */
 		func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
 		if (!func[i])
-- 
cgit v1.2.3


From f2dd3b39467411c53703125a111f45b3672c1771 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 9 Feb 2021 19:36:28 -0800
Subject: bpf: Compute program stats for sleepable programs

Since sleepable programs don't migrate from the cpu the excution stats can be
computed for them as well. Reuse the same infrastructure for both sleepable and
non-sleepable programs.

run_cnt     -> the number of times the program was executed.
run_time_ns -> the program execution time in nanoseconds including the
               off-cpu time when the program was sleeping.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/bpf/20210210033634.62081-4-alexei.starovoitov@gmail.com
---
 arch/x86/net/bpf_jit_comp.c | 31 ++++++++++++-------------------
 include/linux/bpf.h         |  4 ++--
 kernel/bpf/trampoline.c     | 42 ++++++++++++++++++++++++++++--------------
 3 files changed, 42 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index a3dc3bd154ac..d11b9bcebbea 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1742,15 +1742,12 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	u8 *prog = *pprog;
 	int cnt = 0;
 
-	if (p->aux->sleepable) {
-		if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
+	if (emit_call(&prog,
+		      p->aux->sleepable ? __bpf_prog_enter_sleepable :
+		      __bpf_prog_enter, prog))
 			return -EINVAL;
-	} else {
-		if (emit_call(&prog, __bpf_prog_enter, prog))
-			return -EINVAL;
-		/* remember prog start time returned by __bpf_prog_enter */
-		emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
-	}
+	/* remember prog start time returned by __bpf_prog_enter */
+	emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
 
 	/* arg1: lea rdi, [rbp - stack_size] */
 	EMIT4(0x48, 0x8D, 0x7D, -stack_size);
@@ -1770,18 +1767,14 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	if (mod_ret)
 		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
 
-	if (p->aux->sleepable) {
-		if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
+	/* arg1: mov rdi, progs[i] */
+	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
+	/* arg2: mov rsi, rbx <- start time in nsec */
+	emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+	if (emit_call(&prog,
+		      p->aux->sleepable ? __bpf_prog_exit_sleepable :
+		      __bpf_prog_exit, prog))
 			return -EINVAL;
-	} else {
-		/* arg1: mov rdi, progs[i] */
-		emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
-			       (u32) (long) p);
-		/* arg2: mov rsi, rbx <- start time in nsec */
-		emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
-		if (emit_call(&prog, __bpf_prog_exit, prog))
-			return -EINVAL;
-	}
 
 	*pprog = prog;
 	return 0;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5a388955e6b6..e1840ceaf55f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -563,8 +563,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
 /* these two functions are called from generated trampoline */
 u64 notrace __bpf_prog_enter(void);
 void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
-void notrace __bpf_prog_enter_sleepable(void);
-void notrace __bpf_prog_exit_sleepable(void);
+u64 notrace __bpf_prog_enter_sleepable(void);
+void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start);
 
 struct bpf_ksym {
 	unsigned long		 start;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 89fc849ba271..48eb021e1421 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -381,56 +381,70 @@ out:
 	mutex_unlock(&trampoline_mutex);
 }
 
+#define NO_START_TIME 0
+static u64 notrace bpf_prog_start_time(void)
+{
+	u64 start = NO_START_TIME;
+
+	if (static_branch_unlikely(&bpf_stats_enabled_key))
+		start = sched_clock();
+	return start;
+}
+
 /* The logic is similar to BPF_PROG_RUN, but with an explicit
  * rcu_read_lock() and migrate_disable() which are required
  * for the trampoline. The macro is split into
- * call _bpf_prog_enter
+ * call __bpf_prog_enter
  * call prog->bpf_func
  * call __bpf_prog_exit
  */
 u64 notrace __bpf_prog_enter(void)
 	__acquires(RCU)
 {
-	u64 start = 0;
-
 	rcu_read_lock();
 	migrate_disable();
-	if (static_branch_unlikely(&bpf_stats_enabled_key))
-		start = sched_clock();
-	return start;
+	return bpf_prog_start_time();
 }
 
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
-	__releases(RCU)
+static void notrace update_prog_stats(struct bpf_prog *prog,
+				      u64 start)
 {
 	struct bpf_prog_stats *stats;
 
 	if (static_branch_unlikely(&bpf_stats_enabled_key) &&
-	    /* static_key could be enabled in __bpf_prog_enter
-	     * and disabled in __bpf_prog_exit.
+	    /* static_key could be enabled in __bpf_prog_enter*
+	     * and disabled in __bpf_prog_exit*.
 	     * And vice versa.
-	     * Hence check that 'start' is not zero.
+	     * Hence check that 'start' is valid.
 	     */
-	    start) {
+	    start > NO_START_TIME) {
 		stats = this_cpu_ptr(prog->stats);
 		u64_stats_update_begin(&stats->syncp);
 		stats->cnt++;
 		stats->nsecs += sched_clock() - start;
 		u64_stats_update_end(&stats->syncp);
 	}
+}
+
+void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
+	__releases(RCU)
+{
+	update_prog_stats(prog, start);
 	migrate_enable();
 	rcu_read_unlock();
 }
 
-void notrace __bpf_prog_enter_sleepable(void)
+u64 notrace __bpf_prog_enter_sleepable(void)
 {
 	rcu_read_lock_trace();
 	migrate_disable();
 	might_fault();
+	return bpf_prog_start_time();
 }
 
-void notrace __bpf_prog_exit_sleepable(void)
+void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
 {
+	update_prog_stats(prog, start);
 	migrate_enable();
 	rcu_read_unlock_trace();
 }
-- 
cgit v1.2.3


From ca06f55b90020cd97f4cc6d52db95436162e7dcf Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 9 Feb 2021 19:36:29 -0800
Subject: bpf: Add per-program recursion prevention mechanism

Since both sleepable and non-sleepable programs execute under migrate_disable
add recursion prevention mechanism to both types of programs when they're
executed via bpf trampoline.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210210033634.62081-5-alexei.starovoitov@gmail.com
---
 arch/x86/net/bpf_jit_comp.c                        | 15 ++++++++++++++
 include/linux/bpf.h                                |  6 +++---
 include/linux/filter.h                             |  1 +
 kernel/bpf/core.c                                  |  8 ++++++++
 kernel/bpf/trampoline.c                            | 23 ++++++++++++++++++----
 .../selftests/bpf/prog_tests/fexit_stress.c        |  4 ++--
 .../selftests/bpf/prog_tests/trampoline_count.c    |  4 ++--
 7 files changed, 50 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d11b9bcebbea..79e7a0ec1da5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1740,8 +1740,11 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 			   struct bpf_prog *p, int stack_size, bool mod_ret)
 {
 	u8 *prog = *pprog;
+	u8 *jmp_insn;
 	int cnt = 0;
 
+	/* arg1: mov rdi, progs[i] */
+	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
 	if (emit_call(&prog,
 		      p->aux->sleepable ? __bpf_prog_enter_sleepable :
 		      __bpf_prog_enter, prog))
@@ -1749,6 +1752,14 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	/* remember prog start time returned by __bpf_prog_enter */
 	emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
 
+	/* if (__bpf_prog_enter*(prog) == 0)
+	 *	goto skip_exec_of_prog;
+	 */
+	EMIT3(0x48, 0x85, 0xC0);  /* test rax,rax */
+	/* emit 2 nops that will be replaced with JE insn */
+	jmp_insn = prog;
+	emit_nops(&prog, 2);
+
 	/* arg1: lea rdi, [rbp - stack_size] */
 	EMIT4(0x48, 0x8D, 0x7D, -stack_size);
 	/* arg2: progs[i]->insnsi for interpreter */
@@ -1767,6 +1778,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	if (mod_ret)
 		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
 
+	/* replace 2 nops with JE insn, since jmp target is known */
+	jmp_insn[0] = X86_JE;
+	jmp_insn[1] = prog - jmp_insn - 2;
+
 	/* arg1: mov rdi, progs[i] */
 	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
 	/* arg2: mov rsi, rbx <- start time in nsec */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e1840ceaf55f..1c8ea682c007 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -529,7 +529,7 @@ struct btf_func_model {
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.  Pick a number to fit into BPF_IMAGE_SIZE / 2
  */
-#define BPF_MAX_TRAMP_PROGS 40
+#define BPF_MAX_TRAMP_PROGS 38
 
 struct bpf_tramp_progs {
 	struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS];
@@ -561,9 +561,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
 				struct bpf_tramp_progs *tprogs,
 				void *orig_call);
 /* these two functions are called from generated trampoline */
-u64 notrace __bpf_prog_enter(void);
+u64 notrace __bpf_prog_enter(struct bpf_prog *prog);
 void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
-u64 notrace __bpf_prog_enter_sleepable(void);
+u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog);
 void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start);
 
 struct bpf_ksym {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index cecb03c9d251..6a06f3c69f4e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -565,6 +565,7 @@ struct bpf_prog {
 	u32			jited_len;	/* Size of jited insns in bytes */
 	u8			tag[BPF_TAG_SIZE];
 	struct bpf_prog_stats __percpu *stats;
+	int __percpu		*active;
 	unsigned int		(*bpf_func)(const void *ctx,
 					    const struct bpf_insn *insn);
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2cf71fd39c22..334070c4b8a1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -91,6 +91,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 		vfree(fp);
 		return NULL;
 	}
+	fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+	if (!fp->active) {
+		vfree(fp);
+		kfree(aux);
+		return NULL;
+	}
 
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
@@ -116,6 +122,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 
 	prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
 	if (!prog->stats) {
+		free_percpu(prog->active);
 		kfree(prog->aux);
 		vfree(prog);
 		return NULL;
@@ -253,6 +260,7 @@ void __bpf_prog_free(struct bpf_prog *fp)
 		kfree(fp->aux);
 	}
 	free_percpu(fp->stats);
+	free_percpu(fp->active);
 	vfree(fp);
 }
 
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 48eb021e1421..89ef6320d19b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -381,13 +381,16 @@ out:
 	mutex_unlock(&trampoline_mutex);
 }
 
-#define NO_START_TIME 0
+#define NO_START_TIME 1
 static u64 notrace bpf_prog_start_time(void)
 {
 	u64 start = NO_START_TIME;
 
-	if (static_branch_unlikely(&bpf_stats_enabled_key))
+	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
 		start = sched_clock();
+		if (unlikely(!start))
+			start = NO_START_TIME;
+	}
 	return start;
 }
 
@@ -397,12 +400,20 @@ static u64 notrace bpf_prog_start_time(void)
  * call __bpf_prog_enter
  * call prog->bpf_func
  * call __bpf_prog_exit
+ *
+ * __bpf_prog_enter returns:
+ * 0 - skip execution of the bpf prog
+ * 1 - execute bpf prog
+ * [2..MAX_U64] - excute bpf prog and record execution time.
+ *     This is start time.
  */
-u64 notrace __bpf_prog_enter(void)
+u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
 	__acquires(RCU)
 {
 	rcu_read_lock();
 	migrate_disable();
+	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1))
+		return 0;
 	return bpf_prog_start_time();
 }
 
@@ -430,21 +441,25 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
 	__releases(RCU)
 {
 	update_prog_stats(prog, start);
+	__this_cpu_dec(*(prog->active));
 	migrate_enable();
 	rcu_read_unlock();
 }
 
-u64 notrace __bpf_prog_enter_sleepable(void)
+u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
 {
 	rcu_read_lock_trace();
 	migrate_disable();
 	might_fault();
+	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1))
+		return 0;
 	return bpf_prog_start_time();
 }
 
 void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
 {
 	update_prog_stats(prog, start);
+	__this_cpu_dec(*(prog->active));
 	migrate_enable();
 	rcu_read_unlock_trace();
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
index 3b9dbf7433f0..7c9b62e971f1 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
@@ -2,8 +2,8 @@
 /* Copyright (c) 2019 Facebook */
 #include <test_progs.h>
 
-/* x86-64 fits 55 JITed and 43 interpreted progs into half page */
-#define CNT 40
+/* that's kernel internal BPF_MAX_TRAMP_PROGS define */
+#define CNT 38
 
 void test_fexit_stress(void)
 {
diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
index 781c8d11604b..f3022d934e2d 100644
--- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
+++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
@@ -4,7 +4,7 @@
 #include <sys/prctl.h>
 #include <test_progs.h>
 
-#define MAX_TRAMP_PROGS 40
+#define MAX_TRAMP_PROGS 38
 
 struct inst {
 	struct bpf_object *obj;
@@ -52,7 +52,7 @@ void test_trampoline_count(void)
 	struct bpf_link *link;
 	char comm[16] = {};
 
-	/* attach 'allowed' 40 trampoline programs */
+	/* attach 'allowed' trampoline programs */
 	for (i = 0; i < MAX_TRAMP_PROGS; i++) {
 		obj = bpf_object__open_file(object, NULL);
 		if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) {
-- 
cgit v1.2.3


From 9ed9e9ba2337205311398a312796c213737bac35 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 9 Feb 2021 19:36:31 -0800
Subject: bpf: Count the number of times recursion was prevented

Add per-program counter for number of times recursion prevention mechanism
was triggered and expose it via show_fdinfo and bpf_prog_info.
Teach bpftool to print it.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210210033634.62081-7-alexei.starovoitov@gmail.com
---
 include/linux/filter.h         |  1 +
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           | 14 ++++++++++----
 kernel/bpf/trampoline.c        | 18 ++++++++++++++++--
 tools/bpf/bpftool/prog.c       |  4 ++++
 tools/include/uapi/linux/bpf.h |  1 +
 6 files changed, 33 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6a06f3c69f4e..3b00fc906ccd 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -543,6 +543,7 @@ struct bpf_binary_header {
 struct bpf_prog_stats {
 	u64 cnt;
 	u64 nsecs;
+	u64 misses;
 	struct u64_stats_sync syncp;
 } __aligned(2 * sizeof(u64));
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c001766adcbc..c547ad1ffe43 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4501,6 +4501,7 @@ struct bpf_prog_info {
 	__aligned_u64 prog_tags;
 	__u64 run_time_ns;
 	__u64 run_cnt;
+	__u64 recursion_misses;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f7df56a704de..c859bc46d06c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1731,25 +1731,28 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
 static void bpf_prog_get_stats(const struct bpf_prog *prog,
 			       struct bpf_prog_stats *stats)
 {
-	u64 nsecs = 0, cnt = 0;
+	u64 nsecs = 0, cnt = 0, misses = 0;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
 		const struct bpf_prog_stats *st;
 		unsigned int start;
-		u64 tnsecs, tcnt;
+		u64 tnsecs, tcnt, tmisses;
 
 		st = per_cpu_ptr(prog->stats, cpu);
 		do {
 			start = u64_stats_fetch_begin_irq(&st->syncp);
 			tnsecs = st->nsecs;
 			tcnt = st->cnt;
+			tmisses = st->misses;
 		} while (u64_stats_fetch_retry_irq(&st->syncp, start));
 		nsecs += tnsecs;
 		cnt += tcnt;
+		misses += tmisses;
 	}
 	stats->nsecs = nsecs;
 	stats->cnt = cnt;
+	stats->misses = misses;
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1768,14 +1771,16 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "memlock:\t%llu\n"
 		   "prog_id:\t%u\n"
 		   "run_time_ns:\t%llu\n"
-		   "run_cnt:\t%llu\n",
+		   "run_cnt:\t%llu\n"
+		   "recursion_misses:\t%llu\n",
 		   prog->type,
 		   prog->jited,
 		   prog_tag,
 		   prog->pages * 1ULL << PAGE_SHIFT,
 		   prog->aux->id,
 		   stats.nsecs,
-		   stats.cnt);
+		   stats.cnt,
+		   stats.misses);
 }
 #endif
 
@@ -3438,6 +3443,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
 	bpf_prog_get_stats(prog, &stats);
 	info.run_time_ns = stats.nsecs;
 	info.run_cnt = stats.cnt;
+	info.recursion_misses = stats.misses;
 
 	if (!bpf_capable()) {
 		info.jited_prog_len = 0;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 89ef6320d19b..7bc3b3209224 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -394,6 +394,16 @@ static u64 notrace bpf_prog_start_time(void)
 	return start;
 }
 
+static void notrace inc_misses_counter(struct bpf_prog *prog)
+{
+	struct bpf_prog_stats *stats;
+
+	stats = this_cpu_ptr(prog->stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->misses++;
+	u64_stats_update_end(&stats->syncp);
+}
+
 /* The logic is similar to BPF_PROG_RUN, but with an explicit
  * rcu_read_lock() and migrate_disable() which are required
  * for the trampoline. The macro is split into
@@ -412,8 +422,10 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
 {
 	rcu_read_lock();
 	migrate_disable();
-	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1))
+	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
+		inc_misses_counter(prog);
 		return 0;
+	}
 	return bpf_prog_start_time();
 }
 
@@ -451,8 +463,10 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
 	rcu_read_lock_trace();
 	migrate_disable();
 	might_fault();
-	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1))
+	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
+		inc_misses_counter(prog);
 		return 0;
+	}
 	return bpf_prog_start_time();
 }
 
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 1fe3ba255bad..f2b915b20546 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -368,6 +368,8 @@ static void print_prog_header_json(struct bpf_prog_info *info)
 		jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns);
 		jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt);
 	}
+	if (info->recursion_misses)
+		jsonw_uint_field(json_wtr, "recursion_misses", info->recursion_misses);
 }
 
 static void print_prog_json(struct bpf_prog_info *info, int fd)
@@ -446,6 +448,8 @@ static void print_prog_header_plain(struct bpf_prog_info *info)
 	if (info->run_time_ns)
 		printf(" run_time_ns %lld run_cnt %lld",
 		       info->run_time_ns, info->run_cnt);
+	if (info->recursion_misses)
+		printf(" recursion_misses %lld", info->recursion_misses);
 	printf("\n");
 }
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c001766adcbc..c547ad1ffe43 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4501,6 +4501,7 @@ struct bpf_prog_info {
 	__aligned_u64 prog_tags;
 	__u64 run_time_ns;
 	__u64 run_cnt;
+	__u64 recursion_misses;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
cgit v1.2.3


From 539cf68cd51bfcd2987ce1c44e628e9da69de7c8 Mon Sep 17 00:00:00 2001
From: Pratyush Yadav <p.yadav@ti.com>
Date: Thu, 4 Feb 2021 19:42:17 +0530
Subject: spi: spi-mem: add spi_mem_dtr_supports_op()

spi_mem_default_supports_op() rejects DTR ops by default to ensure that
the controller drivers that haven't been updated with DTR support
continue to reject them. It also makes sure that controllers that don't
support DTR mode at all (which is most of them at the moment) also
reject them.

This means that controller drivers that want to support DTR mode can't
use spi_mem_default_supports_op(). Driver authors have to roll their own
supports_op() function and mimic the buswidth checks. See
spi-cadence-quadspi.c for example. Or even worse, driver authors might
skip it completely or get it wrong.

Add spi_mem_dtr_supports_op(). It provides a basic sanity check for DTR
ops and performs the buswidth requirement check. Move the logic for
checking buswidth in spi_mem_default_supports_op() to a separate
function so the logic is not repeated twice.

Signed-off-by: Pratyush Yadav <p.yadav@ti.com>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20210204141218.32229-1-p.yadav@ti.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-mem.c       | 22 +++++++++++++++++++---
 include/linux/spi/spi-mem.h |  9 +++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
index c64371ce6c38..dc713b0c3c4d 100644
--- a/drivers/spi/spi-mem.c
+++ b/drivers/spi/spi-mem.c
@@ -137,8 +137,8 @@ static int spi_check_buswidth_req(struct spi_mem *mem, u8 buswidth, bool tx)
 	return -ENOTSUPP;
 }
 
-bool spi_mem_default_supports_op(struct spi_mem *mem,
-				 const struct spi_mem_op *op)
+static bool spi_mem_check_buswidth(struct spi_mem *mem,
+				   const struct spi_mem_op *op)
 {
 	if (spi_check_buswidth_req(mem, op->cmd.buswidth, true))
 		return false;
@@ -156,13 +156,29 @@ bool spi_mem_default_supports_op(struct spi_mem *mem,
 				   op->data.dir == SPI_MEM_DATA_OUT))
 		return false;
 
+	return true;
+}
+
+bool spi_mem_dtr_supports_op(struct spi_mem *mem,
+			     const struct spi_mem_op *op)
+{
+	if (op->cmd.nbytes != 2)
+		return false;
+
+	return spi_mem_check_buswidth(mem, op);
+}
+EXPORT_SYMBOL_GPL(spi_mem_dtr_supports_op);
+
+bool spi_mem_default_supports_op(struct spi_mem *mem,
+				 const struct spi_mem_op *op)
+{
 	if (op->cmd.dtr || op->addr.dtr || op->dummy.dtr || op->data.dtr)
 		return false;
 
 	if (op->cmd.nbytes != 1)
 		return false;
 
-	return true;
+	return spi_mem_check_buswidth(mem, op);
 }
 EXPORT_SYMBOL_GPL(spi_mem_default_supports_op);
 
diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index 159463cc659c..2b65c9edc34e 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -311,6 +311,9 @@ void spi_controller_dma_unmap_mem_op_data(struct spi_controller *ctlr,
 bool spi_mem_default_supports_op(struct spi_mem *mem,
 				 const struct spi_mem_op *op);
 
+bool spi_mem_dtr_supports_op(struct spi_mem *mem,
+			     const struct spi_mem_op *op);
+
 #else
 static inline int
 spi_controller_dma_map_mem_op_data(struct spi_controller *ctlr,
@@ -334,6 +337,12 @@ bool spi_mem_default_supports_op(struct spi_mem *mem,
 	return false;
 }
 
+static inline
+bool spi_mem_dtr_supports_op(struct spi_mem *mem,
+			     const struct spi_mem_op *op)
+{
+	return false;
+}
 #endif /* CONFIG_SPI_MEM */
 
 int spi_mem_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op);
-- 
cgit v1.2.3


From 53abf3fe831756261f399dad03ccc07235296acf Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Thu, 11 Feb 2021 10:20:36 -0700
Subject: coresight: etm-perf: Clarify comment on perf options

In theory, the options should be arbitrary values and are neutral for
any ETM version; so far perf tool uses ETMv3.5/PTM ETMCR config bits
except for register's bit definitions, also uses as options.

This can introduce confusion, especially if we want to add a new option
but the new option is not supported by ETMv3.5/PTM ETMCR.  But on the
other hand, we cannot change options since these options are generic
CoreSight PMU ABI.

For easier maintenance and avoid confusion, this patch refines the
comment to clarify perf options, and gives out the background info for
these bits are coming from ETMv3.5/PTM.  Afterwards, we should take
these options as general knobs, and if there have any confliction with
ETMv3.5/PTM, should consider to define saperate macros for ETMv3.5/PTM
ETMCR config bits.

Suggested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210206150833.42120-2-leo.yan@linaro.org
Link: https://lore.kernel.org/r/20210211172038.2483517-2-mathieu.poirier@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-etm-perf.c |  5 ++++-
 include/linux/coresight-pmu.h                    | 17 ++++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index bdc34ca449f7..465ef1aa8c82 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -27,7 +27,10 @@ static bool etm_perf_up;
 static DEFINE_PER_CPU(struct perf_output_handle, ctx_handle);
 static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
 
-/* ETMv3.5/PTM's ETMCR is 'config' */
+/*
+ * The PMU formats were orignally for ETMv3.5/PTM's ETMCR 'config';
+ * now take them as general formats and apply on all ETMs.
+ */
 PMU_FORMAT_ATTR(cycacc,		"config:" __stringify(ETM_OPT_CYCACC));
 PMU_FORMAT_ATTR(contextid,	"config:" __stringify(ETM_OPT_CTXTID));
 PMU_FORMAT_ATTR(timestamp,	"config:" __stringify(ETM_OPT_TS));
diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
index b0e35eec6499..5dc47cfdcf07 100644
--- a/include/linux/coresight-pmu.h
+++ b/include/linux/coresight-pmu.h
@@ -10,11 +10,18 @@
 #define CORESIGHT_ETM_PMU_NAME "cs_etm"
 #define CORESIGHT_ETM_PMU_SEED  0x10
 
-/* ETMv3.5/PTM's ETMCR config bit */
-#define ETM_OPT_CYCACC  12
-#define ETM_OPT_CTXTID	14
-#define ETM_OPT_TS      28
-#define ETM_OPT_RETSTK	29
+/*
+ * Below are the definition of bit offsets for perf option, and works as
+ * arbitrary values for all ETM versions.
+ *
+ * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore,
+ * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and
+ * directly use below macros as config bits.
+ */
+#define ETM_OPT_CYCACC		12
+#define ETM_OPT_CTXTID		14
+#define ETM_OPT_TS		28
+#define ETM_OPT_RETSTK		29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC	4
-- 
cgit v1.2.3


From 88f11864cf1d1324f620059ec747d74b72d9d736 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Thu, 11 Feb 2021 10:20:37 -0700
Subject: coresight: etm-perf: Support PID tracing for kernel at EL2

When the kernel is running at EL2, the PID is stored in CONTEXTIDR_EL2.
So, tracing CONTEXTIDR_EL1 doesn't give us the pid of the process.
Thus we should trace the VMID with VMIDOPT set to trace CONTEXTIDR_EL2
instead of CONTEXTIDR_EL1.  Given that we have an existing config
option "contextid" and this will be useful for tracing virtual machines
(when we get to support virtualization).

So instead, this patch extends option CTXTID with an extra bit
ETM_OPT_CTXTID2 (bit 15), thus on an EL2 kernel, we will have another
bit available for the perf tool: ETM_OPT_CTXTID is for kernel running in
EL1, ETM_OPT_CTXTID2 is used when kernel runs in EL2 with VHE enabled.

The tool must be backward compatible for users, i.e, "contextid" today
traces PID and that should remain the same; for this purpose, the perf
tool is updated to automatically set corresponding bit for the
"contextid" config, therefore, the user doesn't have to bother which EL
the kernel is running.

  i.e, perf record -e cs_etm/contextid/u --

will always do the "pid" tracing, independent of the kernel EL.

The driver parses the format "contextid", which traces CONTEXTIDR_EL1
for ETM_OPT_CTXTID (on EL1 kernel) and traces CONTEXTIDR_EL2 for
ETM_OPT_CTXTID2 (on EL2 kernel).

Besides the enhancement for format "contexid", extra two formats are
introduced: "contextid1" and "contextid2".  This considers to support
tracing both CONTEXTIDR_EL1 and CONTEXTIDR_EL2 when the kernel is
running at EL2.  Finally, the PMU formats are defined as follow:

  "contextid1": Available on both EL1 kernel and EL2 kernel.  When the
                kernel is running at EL1, "contextid1" enables the PID
		tracing; when the kernel is running at EL2, this enables
		tracing the PID of guest applications.

  "contextid2": Only usable when the kernel is running at EL2.  When
                selected, enables PID tracing on EL2 kernel.

  "contextid":  Will be an alias for the option that enables PID
                tracing.  I.e,
                contextid == contextid1, on EL1 kernel.
                contextid == contextid2, on EL2 kernel.

Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Al Grant <al.grant@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Leo Yan <leo.yan@linaro.org>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
[ Added two config formats: contextid1, contextid2 ]
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Link: https://lore.kernel.org/r/20210206150833.42120-4-leo.yan@linaro.org
Link: https://lore.kernel.org/r/20210211172038.2483517-3-mathieu.poirier@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/coresight/coresight-etm-perf.c   | 27 +++++++++++++++++++++-
 drivers/hwtracing/coresight/coresight-etm4x-core.c | 13 +++++++++++
 include/linux/coresight-pmu.h                      |  3 +++
 3 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 465ef1aa8c82..0f603b4094f2 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -32,15 +32,40 @@ static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
  * now take them as general formats and apply on all ETMs.
  */
 PMU_FORMAT_ATTR(cycacc,		"config:" __stringify(ETM_OPT_CYCACC));
-PMU_FORMAT_ATTR(contextid,	"config:" __stringify(ETM_OPT_CTXTID));
+/* contextid1 enables tracing CONTEXTIDR_EL1 for ETMv4 */
+PMU_FORMAT_ATTR(contextid1,	"config:" __stringify(ETM_OPT_CTXTID));
+/* contextid2 enables tracing CONTEXTIDR_EL2 for ETMv4 */
+PMU_FORMAT_ATTR(contextid2,	"config:" __stringify(ETM_OPT_CTXTID2));
 PMU_FORMAT_ATTR(timestamp,	"config:" __stringify(ETM_OPT_TS));
 PMU_FORMAT_ATTR(retstack,	"config:" __stringify(ETM_OPT_RETSTK));
 /* Sink ID - same for all ETMs */
 PMU_FORMAT_ATTR(sinkid,		"config2:0-31");
 
+/*
+ * contextid always traces the "PID".  The PID is in CONTEXTIDR_EL1
+ * when the kernel is running at EL1; when the kernel is at EL2,
+ * the PID is in CONTEXTIDR_EL2.
+ */
+static ssize_t format_attr_contextid_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *page)
+{
+	int pid_fmt = ETM_OPT_CTXTID;
+
+#if defined(CONFIG_CORESIGHT_SOURCE_ETM4X)
+	pid_fmt = is_kernel_in_hyp_mode() ? ETM_OPT_CTXTID2 : ETM_OPT_CTXTID;
+#endif
+	return sprintf(page, "config:%d\n", pid_fmt);
+}
+
+struct device_attribute format_attr_contextid =
+	__ATTR(contextid, 0444, format_attr_contextid_show, NULL);
+
 static struct attribute *etm_config_formats_attr[] = {
 	&format_attr_cycacc.attr,
 	&format_attr_contextid.attr,
+	&format_attr_contextid1.attr,
+	&format_attr_contextid2.attr,
 	&format_attr_timestamp.attr,
 	&format_attr_retstack.attr,
 	&format_attr_sinkid.attr,
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c
index c8ecd91e289e..15016f757828 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x-core.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c
@@ -550,6 +550,19 @@ static int etm4_parse_event_config(struct etmv4_drvdata *drvdata,
 		/* bit[6], Context ID tracing bit */
 		config->cfg |= BIT(ETM4_CFG_BIT_CTXTID);
 
+	/*
+	 * If set bit ETM_OPT_CTXTID2 in perf config, this asks to trace VMID
+	 * for recording CONTEXTIDR_EL2.  Do not enable VMID tracing if the
+	 * kernel is not running in EL2.
+	 */
+	if (attr->config & BIT(ETM_OPT_CTXTID2)) {
+		if (!is_kernel_in_hyp_mode()) {
+			ret = -EINVAL;
+			goto out;
+		}
+		config->cfg |= BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT);
+	}
+
 	/* return stack - enable if selected and supported */
 	if ((attr->config & BIT(ETM_OPT_RETSTK)) && drvdata->retstack)
 		/* bit[12], Return stack enable bit */
diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
index 5dc47cfdcf07..4ac5c081af93 100644
--- a/include/linux/coresight-pmu.h
+++ b/include/linux/coresight-pmu.h
@@ -20,14 +20,17 @@
  */
 #define ETM_OPT_CYCACC		12
 #define ETM_OPT_CTXTID		14
+#define ETM_OPT_CTXTID2		15
 #define ETM_OPT_TS		28
 #define ETM_OPT_RETSTK		29
 
 /* ETMv4 CONFIGR programming bits for the ETM OPTs */
 #define ETM4_CFG_BIT_CYCACC	4
 #define ETM4_CFG_BIT_CTXTID	6
+#define ETM4_CFG_BIT_VMID	7
 #define ETM4_CFG_BIT_TS		11
 #define ETM4_CFG_BIT_RETSTK	12
+#define ETM4_CFG_BIT_VMID_OPT	15
 
 static inline int coresight_get_trace_id(int cpu)
 {
-- 
cgit v1.2.3


From bb90d4bc7b6a536b2e4db45f4763e467c2008251 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 9 Feb 2021 22:22:14 -0800
Subject: mm/highmem: Lift memcpy_[to|from]_page to core

Working through a conversion to a call kmap_local_page() instead of
kmap() revealed many places where the pattern kmap/memcpy/kunmap
occurred.

Eric Biggers, Matthew Wilcox, Christoph Hellwig, Dan Williams, and Al
Viro all suggested putting this code into helper functions.  Al Viro
further pointed out that these functions already existed in the iov_iter
code.[1]

Various locations for the lifted functions were considered.

Headers like mm.h or string.h seem ok but don't really portray the
functionality well.  pagemap.h made some sense but is for page cache
functionality.[2]

Another alternative would be to create a new header for the promoted
memcpy functions, but it masks the fact that these are designed to copy
to/from pages using the kernel direct mappings and complicates matters
with a new header.

Placing these functions in 'highmem.h' is suboptimal especially with the
changes being proposed in the functionality of kmap.  From a caller
perspective including/using 'highmem.h' implies that the functions
defined in that header are only required when highmem is in use which is
increasingly not the case with modern processors.  However, highmem.h is
where all the current functions like this reside (zero_user(),
clear_highpage(), clear_user_highpage(), copy_user_highpage(), and
copy_highpage()).  So it makes the most sense even though it is
distasteful for some.[3]

Lift memcpy_to_page() and memcpy_from_page() to pagemap.h.

[1] https://lore.kernel.org/lkml/20201013200149.GI3576660@ZenIV.linux.org.uk/
    https://lore.kernel.org/lkml/20201013112544.GA5249@infradead.org/

[2] https://lore.kernel.org/lkml/20201208122316.GH7338@casper.infradead.org/

[3] https://lore.kernel.org/lkml/20201013200149.GI3576660@ZenIV.linux.org.uk/#t
    https://lore.kernel.org/lkml/20201208163814.GN1563847@iweiny-DESK2.sc.intel.com/

Cc: Boris Pismenny <borisp@mellanox.com>
Cc: Or Gerlitz <gerlitz.or@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Suggested-by: Eric Biggers <ebiggers@kernel.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/highmem.h | 18 ++++++++++++++++++
 lib/iov_iter.c          | 14 --------------
 2 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index d2c70d3772a3..736b6a9f144d 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -276,4 +276,22 @@ static inline void copy_highpage(struct page *to, struct page *from)
 
 #endif
 
+static inline void memcpy_from_page(char *to, struct page *page,
+				    size_t offset, size_t len)
+{
+	char *from = kmap_atomic(page);
+
+	memcpy(to, from + offset, len);
+	kunmap_atomic(from);
+}
+
+static inline void memcpy_to_page(struct page *page, size_t offset,
+				  const char *from, size_t len)
+{
+	char *to = kmap_atomic(page);
+
+	memcpy(to + offset, from, len);
+	kunmap_atomic(to);
+}
+
 #endif /* _LINUX_HIGHMEM_H */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index a21e6a5792c5..9889e9903cdf 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -466,20 +466,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
-{
-	char *from = kmap_atomic(page);
-	memcpy(to, from + offset, len);
-	kunmap_atomic(from);
-}
-
-static void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len)
-{
-	char *to = kmap_atomic(page);
-	memcpy(to + offset, from, len);
-	kunmap_atomic(to);
-}
-
 static void memzero_page(struct page *page, size_t offset, size_t len)
 {
 	char *addr = kmap_atomic(page);
-- 
cgit v1.2.3


From 61b205f579911a11f0b576f73275eca2aed0d108 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 9 Feb 2021 22:22:15 -0800
Subject: mm/highmem: Convert memcpy_[to|from]_page() to kmap_local_page()

kmap_local_page() is more efficient and is well suited for these calls.
Convert the kmap() to kmap_local_page()

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/highmem.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 736b6a9f144d..c17a175fe5fe 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -279,19 +279,19 @@ static inline void copy_highpage(struct page *to, struct page *from)
 static inline void memcpy_from_page(char *to, struct page *page,
 				    size_t offset, size_t len)
 {
-	char *from = kmap_atomic(page);
+	char *from = kmap_local_page(page);
 
 	memcpy(to, from + offset, len);
-	kunmap_atomic(from);
+	kunmap_local(from);
 }
 
 static inline void memcpy_to_page(struct page *page, size_t offset,
 				  const char *from, size_t len)
 {
-	char *to = kmap_atomic(page);
+	char *to = kmap_local_page(page);
 
 	memcpy(to + offset, from, len);
-	kunmap_atomic(to);
+	kunmap_local(to);
 }
 
 #endif /* _LINUX_HIGHMEM_H */
-- 
cgit v1.2.3


From 6a0996db6879cf09f989c5f44f9edd38240cb346 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 9 Feb 2021 22:22:16 -0800
Subject: mm/highmem: Introduce memcpy_page(), memmove_page(), and
 memset_page()

3 more common kmap patterns are kmap/memcpy/kunmap, kmap/memmove/kunmap.
and kmap/memset/kunmap.

Add helper functions for those patterns which use kmap_local_page().

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/highmem.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index c17a175fe5fe..0b5d89621cb9 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -276,6 +276,39 @@ static inline void copy_highpage(struct page *to, struct page *from)
 
 #endif
 
+static inline void memcpy_page(struct page *dst_page, size_t dst_off,
+			       struct page *src_page, size_t src_off,
+			       size_t len)
+{
+	char *dst = kmap_local_page(dst_page);
+	char *src = kmap_local_page(src_page);
+
+	memcpy(dst + dst_off, src + src_off, len);
+	kunmap_local(src);
+	kunmap_local(dst);
+}
+
+static inline void memmove_page(struct page *dst_page, size_t dst_off,
+			       struct page *src_page, size_t src_off,
+			       size_t len)
+{
+	char *dst = kmap_local_page(dst_page);
+	char *src = kmap_local_page(src_page);
+
+	memmove(dst + dst_off, src + src_off, len);
+	kunmap_local(src);
+	kunmap_local(dst);
+}
+
+static inline void memset_page(struct page *page, size_t offset, int val,
+			       size_t len)
+{
+	char *addr = kmap_local_page(page);
+
+	memset(addr + offset, val, len);
+	kunmap_local(addr);
+}
+
 static inline void memcpy_from_page(char *to, struct page *page,
 				    size_t offset, size_t len)
 {
-- 
cgit v1.2.3


From ca18f6ea012bf30236b76c3480ac2c97131b6f8f Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 10 Feb 2021 09:49:28 -0800
Subject: mm/highmem: Add VM_BUG_ON() to mem*_page() calls

Add VM_BUG_ON bounds checks to ensure the newly lifted and created page
memory operations do not result in corrupted data in neighbor pages.[1][2]

[1] https://lore.kernel.org/lkml/20201210053502.GS1563847@iweiny-DESK2.sc.intel.com/
[2] https://lore.kernel.org/lkml/20210209110931.00f00e47d9a0529fcee2ff01@linux-foundation.org/

Suggested-by: Matthew Wilcox <willy@infradead.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/linux/highmem.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 0b5d89621cb9..44170f312ae7 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -283,6 +283,7 @@ static inline void memcpy_page(struct page *dst_page, size_t dst_off,
 	char *dst = kmap_local_page(dst_page);
 	char *src = kmap_local_page(src_page);
 
+	VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
 	memcpy(dst + dst_off, src + src_off, len);
 	kunmap_local(src);
 	kunmap_local(dst);
@@ -295,6 +296,7 @@ static inline void memmove_page(struct page *dst_page, size_t dst_off,
 	char *dst = kmap_local_page(dst_page);
 	char *src = kmap_local_page(src_page);
 
+	VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
 	memmove(dst + dst_off, src + src_off, len);
 	kunmap_local(src);
 	kunmap_local(dst);
@@ -305,6 +307,7 @@ static inline void memset_page(struct page *page, size_t offset, int val,
 {
 	char *addr = kmap_local_page(page);
 
+	VM_BUG_ON(offset + len > PAGE_SIZE);
 	memset(addr + offset, val, len);
 	kunmap_local(addr);
 }
@@ -314,6 +317,7 @@ static inline void memcpy_from_page(char *to, struct page *page,
 {
 	char *from = kmap_local_page(page);
 
+	VM_BUG_ON(offset + len > PAGE_SIZE);
 	memcpy(to, from + offset, len);
 	kunmap_local(from);
 }
@@ -323,6 +327,7 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
 {
 	char *to = kmap_local_page(page);
 
+	VM_BUG_ON(offset + len > PAGE_SIZE);
 	memcpy(to + offset, from, len);
 	kunmap_local(to);
 }
-- 
cgit v1.2.3


From f2ad937b62d984fd1ede3994798fe39f5fabc8d7 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Tue, 26 Jan 2021 12:45:38 +0000
Subject: clk: spear: Move prototype to accessible header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes the following W=1 kernel build warning(s):

 drivers/clk/spear/spear1310_clock.c:385:13: warning: no previous prototype for ‘spear1310_clk_init’ [-Wmissing-prototypes]
 drivers/clk/spear/spear1340_clock.c:442:13: warning: no previous prototype for ‘spear1340_clk_init’ [-Wmissing-prototypes]

Cc: Viresh Kumar <vireshk@kernel.org>
Cc: Shiraz Hashim <shiraz.linux.kernel@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Rajeev Kumar <rajeev-dlh.kumar@st.com>
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20210126124540.3320214-20-lee.jones@linaro.org
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 arch/arm/mach-spear/generic.h       | 12 ------------
 arch/arm/mach-spear/spear13xx.c     |  1 +
 drivers/clk/spear/spear1310_clock.c |  1 +
 drivers/clk/spear/spear1340_clock.c |  1 +
 include/linux/clk/spear.h           | 23 +++++++++++++++++++++++
 5 files changed, 26 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/clk/spear.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-spear/generic.h b/arch/arm/mach-spear/generic.h
index 25b4c5e66e39..8ec2b92dca19 100644
--- a/arch/arm/mach-spear/generic.h
+++ b/arch/arm/mach-spear/generic.h
@@ -43,16 +43,4 @@ void spear13xx_cpu_die(unsigned int cpu);
 
 extern const struct smp_operations spear13xx_smp_ops;
 
-#ifdef CONFIG_MACH_SPEAR1310
-void __init spear1310_clk_init(void __iomem *misc_base, void __iomem *ras_base);
-#else
-static inline void spear1310_clk_init(void __iomem *misc_base, void __iomem *ras_base) {}
-#endif
-
-#ifdef CONFIG_MACH_SPEAR1340
-void __init spear1340_clk_init(void __iomem *misc_base);
-#else
-static inline void spear1340_clk_init(void __iomem *misc_base) {}
-#endif
-
 #endif /* __MACH_GENERIC_H */
diff --git a/arch/arm/mach-spear/spear13xx.c b/arch/arm/mach-spear/spear13xx.c
index 31c43cabf362..74d1ca2a529a 100644
--- a/arch/arm/mach-spear/spear13xx.c
+++ b/arch/arm/mach-spear/spear13xx.c
@@ -15,6 +15,7 @@
 
 #include <linux/amba/pl022.h>
 #include <linux/clk.h>
+#include <linux/clk/spear.h>
 #include <linux/clocksource.h>
 #include <linux/err.h>
 #include <linux/of.h>
diff --git a/drivers/clk/spear/spear1310_clock.c b/drivers/clk/spear/spear1310_clock.c
index 591248c9a88e..8c8974866789 100644
--- a/drivers/clk/spear/spear1310_clock.c
+++ b/drivers/clk/spear/spear1310_clock.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/clkdev.h>
+#include <linux/clk/spear.h>
 #include <linux/err.h>
 #include <linux/io.h>
 #include <linux/of_platform.h>
diff --git a/drivers/clk/spear/spear1340_clock.c b/drivers/clk/spear/spear1340_clock.c
index 9163bbb46411..c0dc94355c87 100644
--- a/drivers/clk/spear/spear1340_clock.c
+++ b/drivers/clk/spear/spear1340_clock.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/clkdev.h>
+#include <linux/clk/spear.h>
 #include <linux/err.h>
 #include <linux/io.h>
 #include <linux/of_platform.h>
diff --git a/include/linux/clk/spear.h b/include/linux/clk/spear.h
new file mode 100644
index 000000000000..a64d034ceddd
--- /dev/null
+++ b/include/linux/clk/spear.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 STMicroelectronics - All Rights Reserved
+ *
+ * Author: Lee Jones <lee.jones@linaro.org>
+ */
+
+#ifndef __LINUX_CLK_SPEAR_H
+#define __LINUX_CLK_SPEAR_H
+
+#ifdef CONFIG_MACH_SPEAR1310
+void __init spear1310_clk_init(void __iomem *misc_base, void __iomem *ras_base);
+#else
+static inline void spear1310_clk_init(void __iomem *misc_base, void __iomem *ras_base) {}
+#endif
+
+#ifdef CONFIG_MACH_SPEAR1340
+void __init spear1340_clk_init(void __iomem *misc_base);
+#else
+static inline void spear1340_clk_init(void __iomem *misc_base) {}
+#endif
+
+#endif
-- 
cgit v1.2.3


From 4217a64e18a1647a0dbc68cb3169a5a06f054ec8 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 9 Feb 2021 17:38:52 +0100
Subject: net: phy: introduce phydev->port

At the moment, PORT_MII is reported in the ethtool ops. This is odd
because it is an interface between the MAC and the PHY and no external
port. Some network card drivers will overwrite the port to twisted pair
or fiber, though. Even worse, the MDI/MDIX setting is only used by
ethtool if the port is twisted pair.

Set the port to PORT_TP by default because most PHY drivers are copper
ones. If there is fibre support and it is enabled, the PHY driver will
set it to PORT_FIBRE.

This will change reporting PORT_MII to either PORT_TP or PORT_FIBRE;
except for the genphy fallback driver.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Michael Walle <michael@walle.cc>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c   |  2 ++
 drivers/net/phy/dp83822.c    |  3 +++
 drivers/net/phy/dp83869.c    |  4 ++++
 drivers/net/phy/lxt.c        |  1 +
 drivers/net/phy/marvell.c    |  1 +
 drivers/net/phy/marvell10g.c |  2 ++
 drivers/net/phy/micrel.c     | 14 +++++++++++---
 drivers/net/phy/phy.c        |  2 +-
 drivers/net/phy/phy_device.c |  9 +++++++++
 include/linux/phy.h          |  2 ++
 10 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 3142ba768313..0472b3470c59 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -410,6 +410,8 @@ static int bcm54616s_probe(struct phy_device *phydev)
 		 */
 		if (!(val & BCM54616S_100FX_MODE))
 			phydev->dev_flags |= PHY_BCM_FLAGS_MODE_1000BX;
+
+		phydev->port = PORT_FIBRE;
 	}
 
 	return 0;
diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c
index fff371ca1086..be1224b4447b 100644
--- a/drivers/net/phy/dp83822.c
+++ b/drivers/net/phy/dp83822.c
@@ -554,6 +554,9 @@ static int dp83822_probe(struct phy_device *phydev)
 
 	dp83822_of_init(phydev);
 
+	if (dp83822->fx_enabled)
+		phydev->port = PORT_FIBRE;
+
 	return 0;
 }
 
diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c
index b30bc142d82e..755220c6451f 100644
--- a/drivers/net/phy/dp83869.c
+++ b/drivers/net/phy/dp83869.c
@@ -855,6 +855,10 @@ static int dp83869_probe(struct phy_device *phydev)
 	if (ret)
 		return ret;
 
+	if (dp83869->mode == DP83869_RGMII_100_BASE ||
+	    dp83869->mode == DP83869_RGMII_1000_BASE)
+		phydev->port = PORT_FIBRE;
+
 	return dp83869_config_init(phydev);
 }
 
diff --git a/drivers/net/phy/lxt.c b/drivers/net/phy/lxt.c
index 0ee23d29c0d4..bde3356a2f86 100644
--- a/drivers/net/phy/lxt.c
+++ b/drivers/net/phy/lxt.c
@@ -292,6 +292,7 @@ static int lxt973_probe(struct phy_device *phydev)
 		phy_write(phydev, MII_BMCR, val);
 		/* Remember that the port is in fiber mode. */
 		phydev->priv = lxt973_probe;
+		phydev->port = PORT_FIBRE;
 	} else {
 		phydev->priv = NULL;
 	}
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index b523aa37ebf0..3238d0fbf437 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -1552,6 +1552,7 @@ static int marvell_read_status_page(struct phy_device *phydev, int page)
 	phydev->asym_pause = 0;
 	phydev->speed = SPEED_UNKNOWN;
 	phydev->duplex = DUPLEX_UNKNOWN;
+	phydev->port = fiber ? PORT_FIBRE : PORT_TP;
 
 	if (phydev->autoneg == AUTONEG_ENABLE)
 		err = marvell_read_status_page_an(phydev, fiber, status);
diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 1901ba277413..b1bb9b8e1e4e 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -631,6 +631,7 @@ static int mv3310_read_status_10gbaser(struct phy_device *phydev)
 	phydev->link = 1;
 	phydev->speed = SPEED_10000;
 	phydev->duplex = DUPLEX_FULL;
+	phydev->port = PORT_FIBRE;
 
 	return 0;
 }
@@ -690,6 +691,7 @@ static int mv3310_read_status_copper(struct phy_device *phydev)
 
 	phydev->duplex = cssr1 & MV_PCS_CSSR1_DUPLEX_FULL ?
 			 DUPLEX_FULL : DUPLEX_HALF;
+	phydev->port = PORT_TP;
 	phydev->mdix = cssr1 & MV_PCS_CSSR1_MDIX ?
 		       ETH_TP_MDI_X : ETH_TP_MDI;
 
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 494abf608b8f..7ec6f70d6a82 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -341,14 +341,19 @@ static int kszphy_config_init(struct phy_device *phydev)
 	return kszphy_config_reset(phydev);
 }
 
+static int ksz8041_fiber_mode(struct phy_device *phydev)
+{
+	struct device_node *of_node = phydev->mdio.dev.of_node;
+
+	return of_property_read_bool(of_node, "micrel,fiber-mode");
+}
+
 static int ksz8041_config_init(struct phy_device *phydev)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
 
-	struct device_node *of_node = phydev->mdio.dev.of_node;
-
 	/* Limit supported and advertised modes in fiber mode */
-	if (of_property_read_bool(of_node, "micrel,fiber-mode")) {
+	if (ksz8041_fiber_mode(phydev)) {
 		phydev->dev_flags |= MICREL_PHY_FXEN;
 		linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, mask);
 		linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, mask);
@@ -1176,6 +1181,9 @@ static int kszphy_probe(struct phy_device *phydev)
 		}
 	}
 
+	if (ksz8041_fiber_mode(phydev))
+		phydev->port = PORT_FIBRE;
+
 	/* Support legacy board-file configuration */
 	if (phydev->dev_flags & MICREL_PHY_50MHZ_CLK) {
 		priv->rmii_ref_clk_sel = true;
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 9cb7e4dbf8f4..fdb914b5b857 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -308,7 +308,7 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev,
 	if (phydev->interface == PHY_INTERFACE_MODE_MOCA)
 		cmd->base.port = PORT_BNC;
 	else
-		cmd->base.port = PORT_MII;
+		cmd->base.port = phydev->port;
 	cmd->base.transceiver = phy_is_internal(phydev) ?
 				XCVR_INTERNAL : XCVR_EXTERNAL;
 	cmd->base.phy_address = phydev->mdio.addr;
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 8447e56ba572..30a20a29ae05 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -606,6 +606,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
 	dev->pause = 0;
 	dev->asym_pause = 0;
 	dev->link = 0;
+	dev->port = PORT_TP;
 	dev->interface = PHY_INTERFACE_MODE_GMII;
 
 	dev->autoneg = AUTONEG_ENABLE;
@@ -1403,6 +1404,14 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 
 	phydev->state = PHY_READY;
 
+	/* Port is set to PORT_TP by default and the actual PHY driver will set
+	 * it to different value depending on the PHY configuration. If we have
+	 * the generic PHY driver we can't figure it out, thus set the old
+	 * legacy PORT_MII value.
+	 */
+	if (using_genphy)
+		phydev->port = PORT_MII;
+
 	/* Initial carrier state is off as the phy is about to be
 	 * (re)initialized.
 	 */
diff --git a/include/linux/phy.h b/include/linux/phy.h
index bc323fbdd21e..c130788306c8 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -503,6 +503,7 @@ struct macsec_ops;
  *
  * @speed: Current link speed
  * @duplex: Current duplex
+ * @port: Current port
  * @pause: Current pause
  * @asym_pause: Current asymmetric pause
  * @supported: Combined MAC/PHY supported linkmodes
@@ -581,6 +582,7 @@ struct phy_device {
 	 */
 	int speed;
 	int duplex;
+	int port;
 	int pause;
 	int asym_pause;
 	u8 master_slave_get;
-- 
cgit v1.2.3


From dcf0cd1cc58b8e88793ad6531db9b3a47324ca09 Mon Sep 17 00:00:00 2001
From: George McCollister <george.mccollister@gmail.com>
Date: Tue, 9 Feb 2021 19:02:11 -0600
Subject: net: hsr: add offloading support

Add support for offloading of HSR/PRP (IEC 62439-3) tag insertion
tag removal, duplicate generation and forwarding.

For HSR, insertion involves the switch adding a 6 byte HSR header after
the 14 byte Ethernet header. For PRP it adds a 6 byte trailer.

Tag removal involves automatically stripping the HSR/PRP header/trailer
in the switch. This is possible when the switch also performs auto
deduplication using the HSR/PRP header/trailer (making it no longer
required).

Forwarding involves automatically forwarding between redundant ports in
an HSR. This is crucial because delay is accumulated as a frame passes
through each node in the ring.

Duplication involves the switch automatically sending a single frame
from the CPU port to both redundant ports. This is required because the
inserted HSR/PRP header/trailer must contain the same sequence number
on the frames sent out both redundant ports.

Export is_hsr_master so DSA can tell them apart from other devices in
dsa_slave_changeupper.

Signed-off-by: George McCollister <george.mccollister@gmail.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdev-features.rst | 21 +++++++++++++++++++++
 include/linux/if_hsr.h                       | 27 +++++++++++++++++++++++++++
 include/linux/netdev_features.h              |  9 +++++++++
 net/ethtool/common.c                         |  4 ++++
 net/hsr/hsr_device.c                         | 14 +++-----------
 net/hsr/hsr_device.h                         |  1 -
 net/hsr/hsr_forward.c                        | 27 ++++++++++++++++++++++++---
 net/hsr/hsr_forward.h                        |  1 +
 net/hsr/hsr_framereg.c                       |  2 ++
 net/hsr/hsr_main.c                           | 11 +++++++++++
 net/hsr/hsr_main.h                           |  8 +-------
 net/hsr/hsr_slave.c                          | 10 ++++++----
 12 files changed, 109 insertions(+), 26 deletions(-)
 create mode 100644 include/linux/if_hsr.h

(limited to 'include/linux')

diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst
index a2d7d7160e39..d7b15bb64deb 100644
--- a/Documentation/networking/netdev-features.rst
+++ b/Documentation/networking/netdev-features.rst
@@ -182,3 +182,24 @@ stricter than Hardware LRO.  A packet stream merged by Hardware GRO must
 be re-segmentable by GSO or TSO back to the exact original packet stream.
 Hardware GRO is dependent on RXCSUM since every packet successfully merged
 by hardware must also have the checksum verified by hardware.
+
+* hsr-tag-ins-offload
+
+This should be set for devices which insert an HSR (High-availability Seamless
+Redundancy) or PRP (Parallel Redundancy Protocol) tag automatically.
+
+* hsr-tag-rm-offload
+
+This should be set for devices which remove HSR (High-availability Seamless
+Redundancy) or PRP (Parallel Redundancy Protocol) tags automatically.
+
+* hsr-fwd-offload
+
+This should be set for devices which forward HSR (High-availability Seamless
+Redundancy) frames from one port to another in hardware.
+
+* hsr-dup-offload
+
+This should be set for devices which duplicate outgoing HSR (High-availability
+Seamless Redundancy) or PRP (Parallel Redundancy Protocol) tags automatically
+frames in hardware.
diff --git a/include/linux/if_hsr.h b/include/linux/if_hsr.h
new file mode 100644
index 000000000000..38bbc537d4e4
--- /dev/null
+++ b/include/linux/if_hsr.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IF_HSR_H_
+#define _LINUX_IF_HSR_H_
+
+/* used to differentiate various protocols */
+enum hsr_version {
+	HSR_V0 = 0,
+	HSR_V1,
+	PRP_V1,
+};
+
+#if IS_ENABLED(CONFIG_HSR)
+extern bool is_hsr_master(struct net_device *dev);
+extern int hsr_get_version(struct net_device *dev, enum hsr_version *ver);
+#else
+static inline bool is_hsr_master(struct net_device *dev)
+{
+	return false;
+}
+static inline int hsr_get_version(struct net_device *dev,
+				  enum hsr_version *ver)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_HSR */
+
+#endif /*_LINUX_IF_HSR_H_*/
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index c06d6aaba9df..3de38d6a0aea 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -86,6 +86,11 @@ enum {
 	NETIF_F_HW_MACSEC_BIT,		/* Offload MACsec operations */
 	NETIF_F_GRO_UDP_FWD_BIT,	/* Allow UDP GRO for forwarding */
 
+	NETIF_F_HW_HSR_TAG_INS_BIT,	/* Offload HSR tag insertion */
+	NETIF_F_HW_HSR_TAG_RM_BIT,	/* Offload HSR tag removal */
+	NETIF_F_HW_HSR_FWD_BIT,		/* Offload HSR forwarding */
+	NETIF_F_HW_HSR_DUP_BIT,		/* Offload HSR duplication */
+
 	/*
 	 * Add your fresh new feature above and remember to update
 	 * netdev_features_strings[] in net/core/ethtool.c and maybe
@@ -159,6 +164,10 @@ enum {
 #define NETIF_F_GSO_FRAGLIST	__NETIF_F(GSO_FRAGLIST)
 #define NETIF_F_HW_MACSEC	__NETIF_F(HW_MACSEC)
 #define NETIF_F_GRO_UDP_FWD	__NETIF_F(GRO_UDP_FWD)
+#define NETIF_F_HW_HSR_TAG_INS	__NETIF_F(HW_HSR_TAG_INS)
+#define NETIF_F_HW_HSR_TAG_RM	__NETIF_F(HW_HSR_TAG_RM)
+#define NETIF_F_HW_HSR_FWD	__NETIF_F(HW_HSR_FWD)
+#define NETIF_F_HW_HSR_DUP	__NETIF_F(HW_HSR_DUP)
 
 /* Finds the next feature with the highest number of the range of start till 0.
  */
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 835b9bba3e7e..c6a383dfd6c2 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -69,6 +69,10 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
 	[NETIF_F_GRO_FRAGLIST_BIT] =	 "rx-gro-list",
 	[NETIF_F_HW_MACSEC_BIT] =	 "macsec-hw-offload",
 	[NETIF_F_GRO_UDP_FWD_BIT] =	 "rx-udp-gro-forwarding",
+	[NETIF_F_HW_HSR_TAG_INS_BIT] =	 "hsr-tag-ins-offload",
+	[NETIF_F_HW_HSR_TAG_RM_BIT] =	 "hsr-tag-rm-offload",
+	[NETIF_F_HW_HSR_FWD_BIT] =	 "hsr-fwd-offload",
+	[NETIF_F_HW_HSR_DUP_BIT] =	 "hsr-dup-offload",
 };
 
 const char
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index ec6a68b403d5..7444ec6e298e 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -417,6 +417,7 @@ static struct hsr_proto_ops hsr_ops = {
 	.send_sv_frame = send_hsr_supervision_frame,
 	.create_tagged_frame = hsr_create_tagged_frame,
 	.get_untagged_frame = hsr_get_untagged_frame,
+	.drop_frame = hsr_drop_frame,
 	.fill_frame_info = hsr_fill_frame_info,
 	.invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame,
 };
@@ -464,10 +465,11 @@ void hsr_dev_setup(struct net_device *dev)
 
 /* Return true if dev is a HSR master; return false otherwise.
  */
-inline bool is_hsr_master(struct net_device *dev)
+bool is_hsr_master(struct net_device *dev)
 {
 	return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit);
 }
+EXPORT_SYMBOL(is_hsr_master);
 
 /* Default multicast address for HSR Supervision frames */
 static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
@@ -520,16 +522,6 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
 
 	hsr->prot_version = protocol_version;
 
-	/* FIXME: should I modify the value of these?
-	 *
-	 * - hsr_dev->flags - i.e.
-	 *			IFF_MASTER/SLAVE?
-	 * - hsr_dev->priv_flags - i.e.
-	 *			IFF_EBRIDGE?
-	 *			IFF_TX_SKB_SHARING?
-	 *			IFF_HSR_MASTER/SLAVE?
-	 */
-
 	/* Make sure the 1st call to netif_carrier_on() gets through */
 	netif_carrier_off(hsr_dev);
 
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
index 868373822ee4..9060c92168f9 100644
--- a/net/hsr/hsr_device.h
+++ b/net/hsr/hsr_device.h
@@ -19,6 +19,5 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
 		     unsigned char multicast_spec, u8 protocol_version,
 		     struct netlink_ext_ack *extack);
 void hsr_check_carrier_and_operstate(struct hsr_priv *hsr);
-bool is_hsr_master(struct net_device *dev);
 int hsr_get_max_mtu(struct hsr_priv *hsr);
 #endif /* __HSR_DEVICE_H */
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index d32cd87d5c5b..ed82a470b6e1 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -249,6 +249,8 @@ struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame,
 		/* set the lane id properly */
 		hsr_set_path_id(hsr_ethhdr, port);
 		return skb_clone(frame->skb_hsr, GFP_ATOMIC);
+	} else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) {
+		return skb_clone(frame->skb_std, GFP_ATOMIC);
 	}
 
 	/* Create the new skb with enough headroom to fit the HSR tag */
@@ -291,6 +293,8 @@ struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame,
 			return NULL;
 		}
 		return skb_clone(frame->skb_prp, GFP_ATOMIC);
+	} else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) {
+		return skb_clone(frame->skb_std, GFP_ATOMIC);
 	}
 
 	skb = skb_copy_expand(frame->skb_std, 0,
@@ -343,6 +347,14 @@ bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port)
 		 port->type ==  HSR_PT_SLAVE_A));
 }
 
+bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port)
+{
+	if (port->dev->features & NETIF_F_HW_HSR_FWD)
+		return prp_drop_frame(frame, port);
+
+	return false;
+}
+
 /* Forward the frame through all devices except:
  * - Back through the receiving device
  * - If it's a HSR frame: through a device where it has passed before
@@ -359,6 +371,7 @@ static void hsr_forward_do(struct hsr_frame_info *frame)
 {
 	struct hsr_port *port;
 	struct sk_buff *skb;
+	bool sent = false;
 
 	hsr_for_each_port(frame->port_rcv->hsr, port) {
 		struct hsr_priv *hsr = port->hsr;
@@ -374,6 +387,12 @@ static void hsr_forward_do(struct hsr_frame_info *frame)
 		if (port->type != HSR_PT_MASTER && frame->is_local_exclusive)
 			continue;
 
+		/* If hardware duplicate generation is enabled, only send out
+		 * one port.
+		 */
+		if ((port->dev->features & NETIF_F_HW_HSR_DUP) && sent)
+			continue;
+
 		/* Don't send frame over port where it has been sent before.
 		 * Also fro SAN, this shouldn't be done.
 		 */
@@ -405,10 +424,12 @@ static void hsr_forward_do(struct hsr_frame_info *frame)
 		}
 
 		skb->dev = port->dev;
-		if (port->type == HSR_PT_MASTER)
+		if (port->type == HSR_PT_MASTER) {
 			hsr_deliver_master(skb, port->dev, frame->node_src);
-		else
-			hsr_xmit(skb, port, frame);
+		} else {
+			if (!hsr_xmit(skb, port, frame))
+				sent = true;
+		}
 	}
 }
 
diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h
index 618140d484ad..b6acaafa83fc 100644
--- a/net/hsr/hsr_forward.h
+++ b/net/hsr/hsr_forward.h
@@ -23,6 +23,7 @@ struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame,
 struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame,
 				       struct hsr_port *port);
 bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port);
+bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port);
 void prp_fill_frame_info(__be16 proto, struct sk_buff *skb,
 			 struct hsr_frame_info *frame);
 void hsr_fill_frame_info(__be16 proto, struct sk_buff *skb,
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 5c97de459905..f9a8cc82ae2e 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -277,6 +277,8 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
 		skb = frame->skb_hsr;
 	else if (frame->skb_prp)
 		skb = frame->skb_prp;
+	else if (frame->skb_std)
+		skb = frame->skb_std;
 	if (!skb)
 		return;
 
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
index 2fd1976e5b1c..f7e284f23b1f 100644
--- a/net/hsr/hsr_main.c
+++ b/net/hsr/hsr_main.c
@@ -131,6 +131,17 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt)
 	return NULL;
 }
 
+int hsr_get_version(struct net_device *dev, enum hsr_version *ver)
+{
+	struct hsr_priv *hsr;
+
+	hsr = netdev_priv(dev);
+	*ver = hsr->prot_version;
+
+	return 0;
+}
+EXPORT_SYMBOL(hsr_get_version);
+
 static struct notifier_block hsr_nb = {
 	.notifier_call = hsr_netdev_notify,	/* Slave event notifications */
 };
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index a9c30a608e35..a169808ee78a 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -13,6 +13,7 @@
 #include <linux/netdevice.h>
 #include <linux/list.h>
 #include <linux/if_vlan.h>
+#include <linux/if_hsr.h>
 
 /* Time constants as specified in the HSR specification (IEC-62439-3 2010)
  * Table 8.
@@ -171,13 +172,6 @@ struct hsr_port {
 	enum hsr_port_type	type;
 };
 
-/* used by driver internally to differentiate various protocols */
-enum hsr_version {
-	HSR_V0 = 0,
-	HSR_V1,
-	PRP_V1,
-};
-
 struct hsr_frame_info;
 struct hsr_node;
 
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 36d5fcf09c61..c5227d42faf5 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -48,12 +48,14 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
 		goto finish_consume;
 	}
 
-	/* For HSR, only tagged frames are expected, but for PRP
-	 * there could be non tagged frames as well from Single
-	 * attached nodes (SANs).
+	/* For HSR, only tagged frames are expected (unless the device offloads
+	 * HSR tag removal), but for PRP there could be non tagged frames as
+	 * well from Single attached nodes (SANs).
 	 */
 	protocol = eth_hdr(skb)->h_proto;
-	if (hsr->proto_ops->invalid_dan_ingress_frame &&
+
+	if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) &&
+	    hsr->proto_ops->invalid_dan_ingress_frame &&
 	    hsr->proto_ops->invalid_dan_ingress_frame(protocol))
 		goto finish_pass;
 
-- 
cgit v1.2.3


From efbbdaa22bb78761bff8dfdde027ad04bedd47ce Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 15 Oct 2020 23:55:07 +0900
Subject: tracing: Show real address for trace event arguments

To help debugging kernel, show real address for trace event arguments
in tracefs/trace{,pipe} instead of hashed pointer value.

Since ftrace human-readable format uses vsprintf(), all %p are
translated to hash values instead of pointer address.

However, when debugging the kernel, raw address value gives a
hint when comparing with the memory mapping in the kernel.
(Those are sometimes used with crash log, which is not hashed too)
So converting %p with %px when calling trace_seq_printf().

Moreover, this is not improving the security because the tracefs
can be used only by root user and the raw address values are readable
from tracefs/percpu/cpu*/trace_pipe_raw file.

Link: https://lkml.kernel.org/r/160277370703.29307.5134475491761971203.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  4 +++
 include/trace/trace_events.h |  2 +-
 kernel/trace/trace.c         | 71 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.h         |  2 ++
 kernel/trace/trace_output.c  | 12 +++++++-
 5 files changed, 88 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 5d1eeac4bfbe..7077fec653bb 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -55,6 +55,8 @@ struct trace_event;
 
 int trace_raw_output_prep(struct trace_iterator *iter,
 			  struct trace_event *event);
+extern __printf(2, 3)
+void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...);
 
 /*
  * The trace entry - the most basic unit of tracing. This is what
@@ -87,6 +89,8 @@ struct trace_iterator {
 	unsigned long		iter_flags;
 	void			*temp;	/* temp holder */
 	unsigned int		temp_size;
+	char			*fmt;	/* modified format holder */
+	unsigned int		fmt_size;
 
 	/* trace_seq for __print_flags() and __print_symbolic() etc. */
 	struct trace_seq	tmp_seq;
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 7785961d82ba..e6a8b3febc49 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -364,7 +364,7 @@ trace_raw_output_##call(struct trace_iterator *iter, int flags,		\
 	if (ret != TRACE_TYPE_HANDLED)					\
 		return ret;						\
 									\
-	trace_seq_printf(s, print);					\
+	trace_event_printf(iter, print);				\
 									\
 	return trace_handle_return(s);					\
 }									\
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b79bcacdd6f9..39f8a537196e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3530,6 +3530,62 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
 	return next;
 }
 
+#define STATIC_FMT_BUF_SIZE	128
+static char static_fmt_buf[STATIC_FMT_BUF_SIZE];
+
+static char *trace_iter_expand_format(struct trace_iterator *iter)
+{
+	char *tmp;
+
+	if (iter->fmt == static_fmt_buf)
+		return NULL;
+
+	tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE,
+		       GFP_KERNEL);
+	if (tmp) {
+		iter->fmt_size += STATIC_FMT_BUF_SIZE;
+		iter->fmt = tmp;
+	}
+
+	return tmp;
+}
+
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
+{
+	const char *p, *new_fmt;
+	char *q;
+
+	if (WARN_ON_ONCE(!fmt))
+		return fmt;
+
+	p = fmt;
+	new_fmt = q = iter->fmt;
+	while (*p) {
+		if (unlikely(q - new_fmt + 3 > iter->fmt_size)) {
+			if (!trace_iter_expand_format(iter))
+				return fmt;
+
+			q += iter->fmt - new_fmt;
+			new_fmt = iter->fmt;
+		}
+
+		*q++ = *p++;
+
+		/* Replace %p with %px */
+		if (p[-1] == '%') {
+			if (p[0] == '%') {
+				*q++ = *p++;
+			} else if (p[0] == 'p' && !isalnum(p[1])) {
+				*q++ = *p++;
+				*q++ = 'x';
+			}
+		}
+	}
+	*q = '\0';
+
+	return new_fmt;
+}
+
 #define STATIC_TEMP_BUF_SIZE	128
 static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4);
 
@@ -4322,6 +4378,16 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->temp)
 		iter->temp_size = 128;
 
+	/*
+	 * trace_event_printf() may need to modify given format
+	 * string to replace %p with %px so that it shows real address
+	 * instead of hash value. However, that is only for the event
+	 * tracing, other tracer may not need. Defer the allocation
+	 * until it is needed.
+	 */
+	iter->fmt = NULL;
+	iter->fmt_size = 0;
+
 	/*
 	 * We make a copy of the current tracer to avoid concurrent
 	 * changes on it while we are reading.
@@ -4473,6 +4539,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 
 	mutex_destroy(&iter->mutex);
 	free_cpumask_var(iter->started);
+	kfree(iter->fmt);
 	kfree(iter->temp);
 	kfree(iter->trace);
 	kfree(iter->buffer_iter);
@@ -9331,9 +9398,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 
 	/* Simulate the iterator */
 	trace_init_global_iter(&iter);
-	/* Can not use kmalloc for iter.temp */
+	/* Can not use kmalloc for iter.temp and iter.fmt */
 	iter.temp = static_temp_buf;
 	iter.temp_size = STATIC_TEMP_BUF_SIZE;
+	iter.fmt = static_fmt_buf;
+	iter.fmt_size = STATIC_FMT_BUF_SIZE;
 
 	for_each_tracing_cpu(cpu) {
 		atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a9e13bd5a41b..6c3ea6f95e68 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -581,6 +581,8 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
 					struct ring_buffer_event *event);
 
+const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+
 int trace_empty(struct trace_iterator *iter);
 
 void *trace_find_next_entry_inc(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 92b1575ae0ca..61255bad7e01 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -312,13 +312,23 @@ int trace_raw_output_prep(struct trace_iterator *iter,
 }
 EXPORT_SYMBOL(trace_raw_output_prep);
 
+void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap);
+	va_end(ap);
+}
+EXPORT_SYMBOL(trace_event_printf);
+
 static int trace_output_raw(struct trace_iterator *iter, char *name,
 			    char *fmt, va_list ap)
 {
 	struct trace_seq *s = &iter->seq;
 
 	trace_seq_printf(s, "%s: ", name);
-	trace_seq_vprintf(s, fmt, ap);
+	trace_seq_vprintf(s, trace_event_format(iter, fmt), ap);
 
 	return trace_handle_return(s);
 }
-- 
cgit v1.2.3


From 4c236d5dc8b86222dc155cd68e7934624264150f Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Thu, 11 Feb 2021 21:28:27 +0530
Subject: octeontx2-pf: cn10k: Use LMTST lines for NPA/NIX operations

This patch adds support to use new LMTST lines for NPA batch free
and burst SQE flush. Adds new dev_hw_ops structure to hold platform
specific functions and create new files cn10k.c and cn10k.h.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/marvell/octeontx2/nic/Makefile    |   2 +-
 drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c | 181 +++++++++++++++++++++
 drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h |  17 ++
 .../ethernet/marvell/octeontx2/nic/otx2_common.c   |  84 ++++------
 .../ethernet/marvell/octeontx2/nic/otx2_common.h   |  68 +++++++-
 .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c   |  36 +---
 .../net/ethernet/marvell/octeontx2/nic/otx2_reg.h  |   1 +
 .../net/ethernet/marvell/octeontx2/nic/otx2_txrx.c |  39 ++---
 .../net/ethernet/marvell/octeontx2/nic/otx2_txrx.h |   7 +
 .../net/ethernet/marvell/octeontx2/nic/otx2_vf.c   |  28 +---
 include/linux/soc/marvell/octeontx2/asm.h          |   8 +
 11 files changed, 336 insertions(+), 135 deletions(-)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
index 29c82b94b2dc..745aa8a19499 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_OCTEONTX2_PF) += rvu_nicpf.o
 obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o
 
 rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \
-		     otx2_ptp.o otx2_flows.o
+		     otx2_ptp.o otx2_flows.o cn10k.o
 rvu_nicvf-y := otx2_vf.o
 
 ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
new file mode 100644
index 000000000000..d6ca809edaed
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell OcteonTx2 RVU Physcial Function ethernet driver
+ *
+ * Copyright (C) 2020 Marvell.
+ */
+
+#include "cn10k.h"
+#include "otx2_reg.h"
+#include "otx2_struct.h"
+
+static struct dev_hw_ops	otx2_hw_ops = {
+	.sq_aq_init = otx2_sq_aq_init,
+	.sqe_flush = otx2_sqe_flush,
+	.aura_freeptr = otx2_aura_freeptr,
+	.refill_pool_ptrs = otx2_refill_pool_ptrs,
+};
+
+static struct dev_hw_ops cn10k_hw_ops = {
+	.sq_aq_init = cn10k_sq_aq_init,
+	.sqe_flush = cn10k_sqe_flush,
+	.aura_freeptr = cn10k_aura_freeptr,
+	.refill_pool_ptrs = cn10k_refill_pool_ptrs,
+};
+
+int cn10k_pf_lmtst_init(struct otx2_nic *pf)
+{
+	int size, num_lines;
+	u64 base;
+
+	if (!test_bit(CN10K_LMTST, &pf->hw.cap_flag)) {
+		pf->hw_ops = &otx2_hw_ops;
+		return 0;
+	}
+
+	pf->hw_ops = &cn10k_hw_ops;
+	base = pci_resource_start(pf->pdev, PCI_MBOX_BAR_NUM) +
+		       (MBOX_SIZE * (pf->total_vfs + 1));
+
+	size = pci_resource_len(pf->pdev, PCI_MBOX_BAR_NUM) -
+	       (MBOX_SIZE * (pf->total_vfs + 1));
+
+	pf->hw.lmt_base = ioremap(base, size);
+
+	if (!pf->hw.lmt_base) {
+		dev_err(pf->dev, "Unable to map PF LMTST region\n");
+		return -ENOMEM;
+	}
+
+	/* FIXME: Get the num of LMTST lines from LMT table */
+	pf->tot_lmt_lines = size / LMT_LINE_SIZE;
+	num_lines = (pf->tot_lmt_lines - NIX_LMTID_BASE) /
+			    pf->hw.tx_queues;
+	/* Number of LMT lines per SQ queues */
+	pf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines;
+
+	pf->nix_lmt_size = pf->nix_lmt_lines * LMT_LINE_SIZE;
+	return 0;
+}
+
+int cn10k_vf_lmtst_init(struct otx2_nic *vf)
+{
+	int size, num_lines;
+
+	if (!test_bit(CN10K_LMTST, &vf->hw.cap_flag)) {
+		vf->hw_ops = &otx2_hw_ops;
+		return 0;
+	}
+
+	vf->hw_ops = &cn10k_hw_ops;
+	size = pci_resource_len(vf->pdev, PCI_MBOX_BAR_NUM);
+	vf->hw.lmt_base = ioremap_wc(pci_resource_start(vf->pdev,
+							PCI_MBOX_BAR_NUM),
+				     size);
+	if (!vf->hw.lmt_base) {
+		dev_err(vf->dev, "Unable to map VF LMTST region\n");
+		return -ENOMEM;
+	}
+
+	vf->tot_lmt_lines = size / LMT_LINE_SIZE;
+	/* LMTST lines per SQ */
+	num_lines = (vf->tot_lmt_lines - NIX_LMTID_BASE) /
+			    vf->hw.tx_queues;
+	vf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines;
+	vf->nix_lmt_size = vf->nix_lmt_lines * LMT_LINE_SIZE;
+	return 0;
+}
+EXPORT_SYMBOL(cn10k_vf_lmtst_init);
+
+int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura)
+{
+	struct nix_cn10k_aq_enq_req *aq;
+	struct otx2_nic *pfvf = dev;
+	struct otx2_snd_queue *sq;
+
+	sq = &pfvf->qset.sq[qidx];
+	sq->lmt_addr = (__force u64 *)((u64)pfvf->hw.nix_lmt_base +
+			       (qidx * pfvf->nix_lmt_size));
+
+	/* Get memory to put this msg */
+	aq = otx2_mbox_alloc_msg_nix_cn10k_aq_enq(&pfvf->mbox);
+	if (!aq)
+		return -ENOMEM;
+
+	aq->sq.cq = pfvf->hw.rx_queues + qidx;
+	aq->sq.max_sqe_size = NIX_MAXSQESZ_W16; /* 128 byte */
+	aq->sq.cq_ena = 1;
+	aq->sq.ena = 1;
+	/* Only one SMQ is allocated, map all SQ's to that SMQ  */
+	aq->sq.smq = pfvf->hw.txschq_list[NIX_TXSCH_LVL_SMQ][0];
+	/* FIXME: set based on NIX_AF_DWRR_RPM_MTU*/
+	aq->sq.smq_rr_weight = OTX2_MAX_MTU;
+	aq->sq.default_chan = pfvf->hw.tx_chan_base;
+	aq->sq.sqe_stype = NIX_STYPE_STF; /* Cache SQB */
+	aq->sq.sqb_aura = sqb_aura;
+	aq->sq.sq_int_ena = NIX_SQINT_BITS;
+	aq->sq.qint_idx = 0;
+	/* Due pipelining impact minimum 2000 unused SQ CQE's
+	 * need to maintain to avoid CQ overflow.
+	 */
+	aq->sq.cq_limit = ((SEND_CQ_SKID * 256) / (pfvf->qset.sqe_cnt));
+
+	/* Fill AQ info */
+	aq->qidx = qidx;
+	aq->ctype = NIX_AQ_CTYPE_SQ;
+	aq->op = NIX_AQ_INSTOP_INIT;
+
+	return otx2_sync_mbox_msg(&pfvf->mbox);
+}
+
+#define NPA_MAX_BURST 16
+void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq)
+{
+	struct otx2_nic *pfvf = dev;
+	u64 ptrs[NPA_MAX_BURST];
+	int num_ptrs = 1;
+	dma_addr_t bufptr;
+
+	/* Refill pool with new buffers */
+	while (cq->pool_ptrs) {
+		if (otx2_alloc_buffer(pfvf, cq, &bufptr)) {
+			if (num_ptrs--)
+				__cn10k_aura_freeptr(pfvf, cq->cq_idx, ptrs,
+						     num_ptrs,
+						     cq->rbpool->lmt_addr);
+			break;
+		}
+		cq->pool_ptrs--;
+		ptrs[num_ptrs] = (u64)bufptr + OTX2_HEAD_ROOM;
+		num_ptrs++;
+		if (num_ptrs == NPA_MAX_BURST || cq->pool_ptrs == 0) {
+			__cn10k_aura_freeptr(pfvf, cq->cq_idx, ptrs,
+					     num_ptrs,
+					     cq->rbpool->lmt_addr);
+			num_ptrs = 1;
+		}
+	}
+}
+
+void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx)
+{
+	struct otx2_nic *pfvf = dev;
+	int lmt_id = NIX_LMTID_BASE + (qidx * pfvf->nix_lmt_lines);
+	u64 val = 0, tar_addr = 0;
+
+	/* FIXME: val[0:10] LMT_ID.
+	 * [12:15] no of LMTST - 1 in the burst.
+	 * [19:63] data size of each LMTST in the burst except first.
+	 */
+	val = (lmt_id & 0x7FF);
+	/* Target address for LMTST flush tells HW how many 128bit
+	 * words are present.
+	 * tar_addr[6:4] size of first LMTST - 1 in units of 128b.
+	 */
+	tar_addr |= sq->io_addr | (((size / 16) - 1) & 0x7) << 4;
+	dma_wmb();
+	memcpy(sq->lmt_addr, sq->sqe_base, size);
+	cn10k_lmt_flush(val, tar_addr);
+
+	sq->head++;
+	sq->head &= (sq->sqe_cnt - 1);
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h
new file mode 100644
index 000000000000..e0bc595cbb78
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Marvell OcteonTx2 RVU Ethernet driver
+ *
+ * Copyright (C) 2020 Marvell.
+ */
+
+#ifndef CN10K_H
+#define CN10K_H
+
+#include "otx2_common.h"
+
+void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq);
+void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx);
+int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura);
+int cn10k_pf_lmtst_init(struct otx2_nic *pf);
+int cn10k_vf_lmtst_init(struct otx2_nic *vf);
+#endif /* CN10K_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
index dbbdc3453f1a..2779802eed84 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
@@ -15,6 +15,7 @@
 #include "otx2_reg.h"
 #include "otx2_common.h"
 #include "otx2_struct.h"
+#include "cn10k.h"
 
 static void otx2_nix_rq_op_stats(struct queue_stats *stats,
 				 struct otx2_nic *pfvf, int qidx)
@@ -526,6 +527,26 @@ static int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
 	return ret;
 }
 
+int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq,
+		      dma_addr_t *dma)
+{
+	if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, dma))) {
+		struct refill_work *work;
+		struct delayed_work *dwork;
+
+		work = &pfvf->refill_wrk[cq->cq_idx];
+		dwork = &work->pool_refill_work;
+		/* Schedule a task if no other task is running */
+		if (!cq->refill_task_sched) {
+			cq->refill_task_sched = true;
+			schedule_delayed_work(dwork,
+					      msecs_to_jiffies(100));
+		}
+		return -ENOMEM;
+	}
+	return 0;
+}
+
 void otx2_tx_timeout(struct net_device *netdev, unsigned int txq)
 {
 	struct otx2_nic *pfvf = netdev_priv(netdev);
@@ -728,9 +749,6 @@ void otx2_sqb_flush(struct otx2_nic *pfvf)
 #define RQ_PASS_LVL_AURA (255 - ((95 * 256) / 100)) /* RED when 95% is full */
 #define RQ_DROP_LVL_AURA (255 - ((99 * 256) / 100)) /* Drop when 99% is full */
 
-/* Send skid of 2000 packets required for CQ size of 4K CQEs. */
-#define SEND_CQ_SKID	2000
-
 static int otx2_rq_init(struct otx2_nic *pfvf, u16 qidx, u16 lpb_aura)
 {
 	struct otx2_qset *qset = &pfvf->qset;
@@ -764,45 +782,14 @@ static int otx2_rq_init(struct otx2_nic *pfvf, u16 qidx, u16 lpb_aura)
 	return otx2_sync_mbox_msg(&pfvf->mbox);
 }
 
-static int cn10k_sq_aq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura)
-{
-	struct nix_cn10k_aq_enq_req *aq;
-
-	/* Get memory to put this msg */
-	aq = otx2_mbox_alloc_msg_nix_cn10k_aq_enq(&pfvf->mbox);
-	if (!aq)
-		return -ENOMEM;
-
-	aq->sq.cq = pfvf->hw.rx_queues + qidx;
-	aq->sq.max_sqe_size = NIX_MAXSQESZ_W16; /* 128 byte */
-	aq->sq.cq_ena = 1;
-	aq->sq.ena = 1;
-	/* Only one SMQ is allocated, map all SQ's to that SMQ  */
-	aq->sq.smq = pfvf->hw.txschq_list[NIX_TXSCH_LVL_SMQ][0];
-	/* FIXME: set based on NIX_AF_DWRR_RPM_MTU*/
-	aq->sq.smq_rr_weight = OTX2_MAX_MTU;
-	aq->sq.default_chan = pfvf->hw.tx_chan_base;
-	aq->sq.sqe_stype = NIX_STYPE_STF; /* Cache SQB */
-	aq->sq.sqb_aura = sqb_aura;
-	aq->sq.sq_int_ena = NIX_SQINT_BITS;
-	aq->sq.qint_idx = 0;
-	/* Due pipelining impact minimum 2000 unused SQ CQE's
-	 * need to maintain to avoid CQ overflow.
-	 */
-	aq->sq.cq_limit = ((SEND_CQ_SKID * 256) / (pfvf->qset.sqe_cnt));
-
-	/* Fill AQ info */
-	aq->qidx = qidx;
-	aq->ctype = NIX_AQ_CTYPE_SQ;
-	aq->op = NIX_AQ_INSTOP_INIT;
-
-	return otx2_sync_mbox_msg(&pfvf->mbox);
-}
-
-static int otx2_sq_aq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura)
+int otx2_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura)
 {
+	struct otx2_nic *pfvf = dev;
+	struct otx2_snd_queue *sq;
 	struct nix_aq_enq_req *aq;
 
+	sq = &pfvf->qset.sq[qidx];
+	sq->lmt_addr = (__force u64 *)(pfvf->reg_base + LMT_LF_LMTLINEX(qidx));
 	/* Get memory to put this msg */
 	aq = otx2_mbox_alloc_msg_nix_aq_enq(&pfvf->mbox);
 	if (!aq)
@@ -873,16 +860,12 @@ static int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura)
 	sq->sqe_thresh = ((sq->num_sqbs * sq->sqe_per_sqb) * 10) / 100;
 	sq->aura_id = sqb_aura;
 	sq->aura_fc_addr = pool->fc_addr->base;
-	sq->lmt_addr = (__force u64 *)(pfvf->reg_base + LMT_LF_LMTLINEX(qidx));
 	sq->io_addr = (__force u64)otx2_get_regaddr(pfvf, NIX_LF_OP_SENDX(0));
 
 	sq->stats.bytes = 0;
 	sq->stats.pkts = 0;
 
-	if (is_dev_otx2(pfvf->pdev))
-		return otx2_sq_aq_init(pfvf, qidx, sqb_aura);
-	else
-		return cn10k_sq_aq_init(pfvf, qidx, sqb_aura);
+	return pfvf->hw_ops->sq_aq_init(pfvf, qidx, sqb_aura);
 
 }
 
@@ -987,7 +970,7 @@ static void otx2_pool_refill_task(struct work_struct *work)
 			}
 			return;
 		}
-		otx2_aura_freeptr(pfvf, qidx, bufptr + OTX2_HEAD_ROOM);
+		pfvf->hw_ops->aura_freeptr(pfvf, qidx, bufptr + OTX2_HEAD_ROOM);
 		cq->pool_ptrs--;
 	}
 	cq->refill_task_sched = false;
@@ -1231,6 +1214,11 @@ static int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
 
 	pool->rbsize = buf_size;
 
+	/* Set LMTST addr for NPA batch free */
+	if (test_bit(CN10K_LMTST, &pfvf->hw.cap_flag))
+		pool->lmt_addr = (__force u64 *)((u64)pfvf->hw.npa_lmt_base +
+						 (pool_id * LMT_LINE_SIZE));
+
 	/* Initialize this pool's context via AF */
 	aq = otx2_mbox_alloc_msg_npa_aq_enq(&pfvf->mbox);
 	if (!aq) {
@@ -1319,7 +1307,7 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf)
 		for (ptr = 0; ptr < num_sqbs; ptr++) {
 			if (otx2_alloc_rbuf(pfvf, pool, &bufptr))
 				return -ENOMEM;
-			otx2_aura_freeptr(pfvf, pool_id, bufptr);
+			pfvf->hw_ops->aura_freeptr(pfvf, pool_id, bufptr);
 			sq->sqb_ptrs[sq->sqb_count++] = (u64)bufptr;
 		}
 	}
@@ -1369,8 +1357,8 @@ int otx2_rq_aura_pool_init(struct otx2_nic *pfvf)
 		for (ptr = 0; ptr < num_ptrs; ptr++) {
 			if (otx2_alloc_rbuf(pfvf, pool, &bufptr))
 				return -ENOMEM;
-			otx2_aura_freeptr(pfvf, pool_id,
-					  bufptr + OTX2_HEAD_ROOM);
+			pfvf->hw_ops->aura_freeptr(pfvf, pool_id,
+						   bufptr + OTX2_HEAD_ROOM);
 		}
 	}
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
index 9ac9b420dd95..51aaa6ae0fd3 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
@@ -50,6 +50,9 @@ enum arua_mapped_qtypes {
 #define NIX_LF_ERR_VEC				0x81
 #define NIX_LF_POISON_VEC			0x82
 
+/* Send skid of 2000 packets required for CQ size of 4K CQEs. */
+#define SEND_CQ_SKID	2000
+
 /* RSS configuration */
 struct otx2_rss_ctx {
 	u8  ind_tbl[MAX_RSS_INDIR_TBL_SIZE];
@@ -275,9 +278,18 @@ struct otx2_flow_config {
 	struct list_head	flow_list;
 };
 
+struct dev_hw_ops {
+	int	(*sq_aq_init)(void *dev, u16 qidx, u16 sqb_aura);
+	void	(*sqe_flush)(void *dev, struct otx2_snd_queue *sq,
+			     int size, int qidx);
+	void	(*refill_pool_ptrs)(void *dev, struct otx2_cq_queue *cq);
+	void	(*aura_freeptr)(void *dev, int aura, u64 buf);
+};
+
 struct otx2_nic {
 	void __iomem		*reg_base;
 	struct net_device	*netdev;
+	struct dev_hw_ops	*hw_ops;
 	void			*iommu_domain;
 	u16			max_frs;
 	u16			rbsize; /* Receive buffer size */
@@ -507,10 +519,51 @@ static inline u64 otx2_atomic64_add(u64 incr, u64 *ptr)
 }
 
 #else
-#define otx2_write128(lo, hi, addr)
+#define otx2_write128(lo, hi, addr)		writeq((hi) | (lo), addr)
 #define otx2_atomic64_add(incr, ptr)		({ *ptr += incr; })
 #endif
 
+static inline void __cn10k_aura_freeptr(struct otx2_nic *pfvf, u64 aura,
+					u64 *ptrs, u64 num_ptrs,
+					u64 *lmt_addr)
+{
+	u64 size = 0, count_eot = 0;
+	u64 tar_addr, val = 0;
+
+	tar_addr = (__force u64)otx2_get_regaddr(pfvf, NPA_LF_AURA_BATCH_FREE0);
+	/* LMTID is same as AURA Id */
+	val = (aura & 0x7FF) | BIT_ULL(63);
+	/* Set if [127:64] of last 128bit word has a valid pointer */
+	count_eot = (num_ptrs % 2) ? 0ULL : 1ULL;
+	/* Set AURA ID to free pointer */
+	ptrs[0] = (count_eot << 32) | (aura & 0xFFFFF);
+	/* Target address for LMTST flush tells HW how many 128bit
+	 * words are valid from NPA_LF_AURA_BATCH_FREE0.
+	 *
+	 * tar_addr[6:4] is LMTST size-1 in units of 128b.
+	 */
+	if (num_ptrs > 2) {
+		size = (sizeof(u64) * num_ptrs) / 16;
+		if (!count_eot)
+			size++;
+		tar_addr |=  ((size - 1) & 0x7) << 4;
+	}
+	memcpy(lmt_addr, ptrs, sizeof(u64) * num_ptrs);
+	/* Perform LMTST flush */
+	cn10k_lmt_flush(val, tar_addr);
+}
+
+static inline void cn10k_aura_freeptr(void *dev, int aura, u64 buf)
+{
+	struct otx2_nic *pfvf = dev;
+	struct otx2_pool *pool;
+	u64 ptrs[2];
+
+	pool = &pfvf->qset.pool[aura];
+	ptrs[1] = buf;
+	__cn10k_aura_freeptr(pfvf, aura, ptrs, 2, pool->lmt_addr);
+}
+
 /* Alloc pointer from pool/aura */
 static inline u64 otx2_aura_allocptr(struct otx2_nic *pfvf, int aura)
 {
@@ -522,11 +575,12 @@ static inline u64 otx2_aura_allocptr(struct otx2_nic *pfvf, int aura)
 }
 
 /* Free pointer to a pool/aura */
-static inline void otx2_aura_freeptr(struct otx2_nic *pfvf,
-				     int aura, u64 buf)
+static inline void otx2_aura_freeptr(void *dev, int aura, u64 buf)
 {
-	otx2_write128(buf, (u64)aura | BIT_ULL(63),
-		      otx2_get_regaddr(pfvf, NPA_LF_AURA_OP_FREE0));
+	struct otx2_nic *pfvf = dev;
+	void __iomem *addr = otx2_get_regaddr(pfvf, NPA_LF_AURA_OP_FREE0);
+
+	otx2_write128(buf, (u64)aura | BIT_ULL(63), addr);
 }
 
 static inline int otx2_get_pool_idx(struct otx2_nic *pfvf, int type, int idx)
@@ -681,6 +735,10 @@ void otx2_ctx_disable(struct mbox *mbox, int type, bool npa);
 int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable);
 void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq);
 void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq);
+int otx2_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura);
+int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura);
+int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq,
+		      dma_addr_t *dma);
 
 /* RSS configuration APIs*/
 int otx2_rss_init(struct otx2_nic *pfvf);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
index afafaec88c8b..f87cfcfc2832 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
@@ -22,6 +22,7 @@
 #include "otx2_txrx.h"
 #include "otx2_struct.h"
 #include "otx2_ptp.h"
+#include "cn10k.h"
 #include <rvu_trace.h>
 
 #define DRV_NAME	"rvu_nicpf"
@@ -46,39 +47,6 @@ enum {
 static int otx2_config_hw_tx_tstamp(struct otx2_nic *pfvf, bool enable);
 static int otx2_config_hw_rx_tstamp(struct otx2_nic *pfvf, bool enable);
 
-static int cn10k_lmtst_init(struct otx2_nic *pf)
-{
-	int size, num_lines;
-	u64 base;
-
-	if (!test_bit(CN10K_LMTST, &pf->hw.cap_flag))
-		return 0;
-
-	base = pci_resource_start(pf->pdev, PCI_MBOX_BAR_NUM) +
-		       (MBOX_SIZE * (pf->total_vfs + 1));
-
-	size = pci_resource_len(pf->pdev, PCI_MBOX_BAR_NUM) -
-	       (MBOX_SIZE * (pf->total_vfs + 1));
-
-	pf->hw.lmt_base = ioremap(base, size);
-
-	if (!pf->hw.lmt_base) {
-		dev_err(pf->dev, "Unable to map PF LMTST region\n");
-		return -ENOMEM;
-	}
-
-	/* FIXME: Get the num of LMTST lines from LMT table */
-	pf->tot_lmt_lines = size / LMT_LINE_SIZE;
-	num_lines = (pf->tot_lmt_lines - NIX_LMTID_BASE) /
-			    pf->hw.tx_queues;
-	/* Number of LMT lines per SQ queues */
-	pf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines;
-
-	pf->nix_lmt_size = pf->nix_lmt_lines * LMT_LINE_SIZE;
-
-	return 0;
-}
-
 static int otx2_change_mtu(struct net_device *netdev, int new_mtu)
 {
 	bool if_up = netif_running(netdev);
@@ -2404,7 +2372,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto err_detach_rsrc;
 
-	err = cn10k_lmtst_init(pf);
+	err = cn10k_pf_lmtst_init(pf);
 	if (err)
 		goto err_detach_rsrc;
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
index 1e052d76a580..21b811c6ee0f 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
@@ -94,6 +94,7 @@
 #define NPA_LF_QINTX_INT_W1S(a)         (NPA_LFBASE | 0x318 | (a) << 12)
 #define NPA_LF_QINTX_ENA_W1S(a)         (NPA_LFBASE | 0x320 | (a) << 12)
 #define NPA_LF_QINTX_ENA_W1C(a)         (NPA_LFBASE | 0x330 | (a) << 12)
+#define NPA_LF_AURA_BATCH_FREE0         (NPA_LFBASE | 0x400)
 
 /* NIX LF registers */
 #define	NIX_LFBASE			(BLKTYPE_NIX << RVU_FUNC_BLKADDR_SHIFT)
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
index 68f80e75c8c7..59a7bd88d907 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
@@ -17,6 +17,7 @@
 #include "otx2_struct.h"
 #include "otx2_txrx.h"
 #include "otx2_ptp.h"
+#include "cn10k.h"
 
 #define CQE_ADDR(CQ, idx) ((CQ)->cqe_base + ((CQ)->cqe_size * (idx)))
 
@@ -199,7 +200,8 @@ static void otx2_free_rcv_seg(struct otx2_nic *pfvf, struct nix_cqe_rx_s *cqe,
 		sg = (struct nix_rx_sg_s *)start;
 		seg_addr = &sg->seg_addr;
 		for (seg = 0; seg < sg->segs; seg++, seg_addr++)
-			otx2_aura_freeptr(pfvf, qidx, *seg_addr & ~0x07ULL);
+			pfvf->hw_ops->aura_freeptr(pfvf, qidx,
+						   *seg_addr & ~0x07ULL);
 		start += sizeof(*sg);
 	}
 }
@@ -304,7 +306,6 @@ static int otx2_rx_napi_handler(struct otx2_nic *pfvf,
 {
 	struct nix_cqe_rx_s *cqe;
 	int processed_cqe = 0;
-	dma_addr_t bufptr;
 
 	while (likely(processed_cqe < budget)) {
 		cqe = (struct nix_cqe_rx_s *)CQE_ADDR(cq, cq->cq_head);
@@ -330,28 +331,23 @@ static int otx2_rx_napi_handler(struct otx2_nic *pfvf,
 
 	if (unlikely(!cq->pool_ptrs))
 		return 0;
-
 	/* Refill pool with new buffers */
+	pfvf->hw_ops->refill_pool_ptrs(pfvf, cq);
+
+	return processed_cqe;
+}
+
+void otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq)
+{
+	struct otx2_nic *pfvf = dev;
+	dma_addr_t bufptr;
+
 	while (cq->pool_ptrs) {
-		if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, &bufptr))) {
-			struct refill_work *work;
-			struct delayed_work *dwork;
-
-			work = &pfvf->refill_wrk[cq->cq_idx];
-			dwork = &work->pool_refill_work;
-			/* Schedule a task if no other task is running */
-			if (!cq->refill_task_sched) {
-				cq->refill_task_sched = true;
-				schedule_delayed_work(dwork,
-						      msecs_to_jiffies(100));
-			}
+		if (otx2_alloc_buffer(pfvf, cq, &bufptr))
 			break;
-		}
 		otx2_aura_freeptr(pfvf, cq->cq_idx, bufptr + OTX2_HEAD_ROOM);
 		cq->pool_ptrs--;
 	}
-
-	return processed_cqe;
 }
 
 static int otx2_tx_napi_handler(struct otx2_nic *pfvf,
@@ -438,7 +434,8 @@ int otx2_napi_handler(struct napi_struct *napi, int budget)
 	return workdone;
 }
 
-static void otx2_sqe_flush(struct otx2_snd_queue *sq, int size)
+void otx2_sqe_flush(void *dev, struct otx2_snd_queue *sq,
+		    int size, int qidx)
 {
 	u64 status;
 
@@ -796,7 +793,7 @@ static void otx2_sq_append_tso(struct otx2_nic *pfvf, struct otx2_snd_queue *sq,
 		sqe_hdr->sizem1 = (offset / 16) - 1;
 
 		/* Flush SQE to HW */
-		otx2_sqe_flush(sq, offset);
+		pfvf->hw_ops->sqe_flush(pfvf, sq, offset, qidx);
 	}
 }
 
@@ -915,7 +912,7 @@ bool otx2_sq_append_skb(struct net_device *netdev, struct otx2_snd_queue *sq,
 	netdev_tx_sent_queue(txq, skb->len);
 
 	/* Flush SQE to HW */
-	otx2_sqe_flush(sq, offset);
+	pfvf->hw_ops->sqe_flush(pfvf, sq, offset, qidx);
 
 	return true;
 }
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h
index 73af15685657..d2b26b3357f3 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h
@@ -114,6 +114,7 @@ struct otx2_cq_poll {
 struct otx2_pool {
 	struct qmem		*stack;
 	struct qmem		*fc_addr;
+	u64			*lmt_addr;
 	u16			rbsize;
 };
 
@@ -156,4 +157,10 @@ static inline u64 otx2_iova_to_phys(void *iommu_domain, dma_addr_t dma_addr)
 int otx2_napi_handler(struct napi_struct *napi, int budget);
 bool otx2_sq_append_skb(struct net_device *netdev, struct otx2_snd_queue *sq,
 			struct sk_buff *skb, u16 qidx);
+void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq,
+		     int size, int qidx);
+void otx2_sqe_flush(void *dev, struct otx2_snd_queue *sq,
+		    int size, int qidx);
+void otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq);
+void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq);
 #endif /* OTX2_TXRX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
index 9ed850b75d59..31e03253e612 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
@@ -7,6 +7,7 @@
 
 #include "otx2_common.h"
 #include "otx2_reg.h"
+#include "cn10k.h"
 
 #define DRV_NAME	"rvu_nicvf"
 #define DRV_STRING	"Marvell RVU NIC Virtual Function Driver"
@@ -27,31 +28,6 @@ enum {
 	RVU_VF_INT_VEC_MBOX = 0x0,
 };
 
-static int cn10k_lmtst_init(struct otx2_nic *vf)
-{
-	int size, num_lines;
-
-	if (!test_bit(CN10K_LMTST, &vf->hw.cap_flag))
-		return 0;
-
-	size = pci_resource_len(vf->pdev, PCI_MBOX_BAR_NUM);
-	vf->hw.lmt_base = ioremap_wc(pci_resource_start(vf->pdev,
-							PCI_MBOX_BAR_NUM),
-				     size);
-	if (!vf->hw.lmt_base) {
-		dev_err(vf->dev, "Unable to map VF LMTST region\n");
-		return -ENOMEM;
-	}
-
-	vf->tot_lmt_lines = size / LMT_LINE_SIZE;
-	/* LMTST lines per SQ */
-	num_lines = (vf->tot_lmt_lines - NIX_LMTID_BASE) /
-			    vf->hw.tx_queues;
-	vf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines;
-	vf->nix_lmt_size = vf->nix_lmt_lines * LMT_LINE_SIZE;
-	return 0;
-}
-
 static void otx2vf_process_vfaf_mbox_msg(struct otx2_nic *vf,
 					 struct mbox_msghdr *msg)
 {
@@ -585,7 +561,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto err_detach_rsrc;
 
-	err = cn10k_lmtst_init(vf);
+	err = cn10k_vf_lmtst_init(vf);
 	if (err)
 		goto err_detach_rsrc;
 
diff --git a/include/linux/soc/marvell/octeontx2/asm.h b/include/linux/soc/marvell/octeontx2/asm.h
index ae2279fe830a..28c04d918f0f 100644
--- a/include/linux/soc/marvell/octeontx2/asm.h
+++ b/include/linux/soc/marvell/octeontx2/asm.h
@@ -22,8 +22,16 @@
 			 : [rs]"r" (ioaddr));           \
 	(result);                                       \
 })
+#define cn10k_lmt_flush(val, addr)			\
+({							\
+	__asm__ volatile(".cpu  generic+lse\n"		\
+			 "steor %x[rf],[%[rs]]"		\
+			 : [rf]"+r"(val)		\
+			 : [rs]"r"(addr));		\
+})
 #else
 #define otx2_lmt_flush(ioaddr)          ({ 0; })
+#define cn10k_lmt_flush(val, addr)	({ addr = val; })
 #endif
 
 #endif /* __SOC_OTX2_ASM_H */
-- 
cgit v1.2.3


From c5dbb89fc2ac013afe67b9e4fcb3743c02b567cd Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Wed, 10 Feb 2021 12:14:03 +0100
Subject: bpf: Expose bpf_get_socket_cookie to tracing programs

This needs a new helper that:
- can work in a sleepable context (using sock_gen_cookie)
- takes a struct sock pointer and checks that it's not NULL

Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210210111406.785541-2-revest@chromium.org
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       |  8 ++++++++
 kernel/trace/bpf_trace.c       |  2 ++
 net/core/filter.c              | 12 ++++++++++++
 tools/include/uapi/linux/bpf.h |  8 ++++++++
 5 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1c8ea682c007..cccaef1088ea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1885,6 +1885,7 @@ extern const struct bpf_func_proto bpf_per_cpu_ptr_proto;
 extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
 extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
 extern const struct bpf_func_proto bpf_sock_from_file_proto;
+extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dbf10bf08582..07cc2e404291 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1673,6 +1673,14 @@ union bpf_attr {
  * 	Return
  * 		A 8-byte long unique number.
  *
+ * u64 bpf_get_socket_cookie(struct sock *sk)
+ * 	Description
+ * 		Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
+ * 		*sk*, but gets socket from a BTF **struct sock**. This helper
+ * 		also works for sleepable programs.
+ * 	Return
+ * 		A 8-byte long unique number or 0 if *sk* is NULL.
+ *
  * u32 bpf_get_socket_uid(struct sk_buff *skb)
  * 	Return
  * 		The owner UID of the socket associated to *skb*. If the socket
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6c0018abe68a..845b2168e006 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1760,6 +1760,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_storage_delete_tracing_proto;
 	case BPF_FUNC_sock_from_file:
 		return &bpf_sock_from_file_proto;
+	case BPF_FUNC_get_socket_cookie:
+		return &bpf_get_socket_ptr_cookie_proto;
 #endif
 	case BPF_FUNC_seq_printf:
 		return prog->expected_attach_type == BPF_TRACE_ITER ?
diff --git a/net/core/filter.c b/net/core/filter.c
index e15d4741719a..57aaed478362 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4631,6 +4631,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
+{
+	return sk ? sock_gen_cookie(sk) : 0;
+}
+
+const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
+	.func		= bpf_get_socket_ptr_cookie,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
+
 BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
 {
 	return __sock_gen_cookie(ctx->sk);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index dbf10bf08582..07cc2e404291 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1673,6 +1673,14 @@ union bpf_attr {
  * 	Return
  * 		A 8-byte long unique number.
  *
+ * u64 bpf_get_socket_cookie(struct sock *sk)
+ * 	Description
+ * 		Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
+ * 		*sk*, but gets socket from a BTF **struct sock**. This helper
+ * 		also works for sleepable programs.
+ * 	Return
+ * 		A 8-byte long unique number or 0 if *sk* is NULL.
+ *
  * u32 bpf_get_socket_uid(struct sk_buff *skb)
  * 	Return
  * 		The owner UID of the socket associated to *skb*. If the socket
-- 
cgit v1.2.3


From 3b23a32a63219f51a5298bc55a65ecee866e79d0 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Thu, 11 Feb 2021 11:34:10 -0800
Subject: net: fix dev_ifsioc_locked() race condition

dev_ifsioc_locked() is called with only RCU read lock, so when
there is a parallel writer changing the mac address, it could
get a partially updated mac address, as shown below:

Thread 1			Thread 2
// eth_commit_mac_addr_change()
memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
				// dev_ifsioc_locked()
				memcpy(ifr->ifr_hwaddr.sa_data,
					dev->dev_addr,...);

Close this race condition by guarding them with a RW semaphore,
like netdev_get_name(). We can not use seqlock here as it does not
allow blocking. The writers already take RTNL anyway, so this does
not affect the slow path. To avoid bothering existing
dev_set_mac_address() callers in drivers, introduce a new wrapper
just for user-facing callers on ioctl and rtnetlink paths.

Note, bonding also changes slave mac addresses but that requires
a separate patch due to the complexity of bonding code.

Fixes: 3710becf8a58 ("net: RCU locking for simple ioctl()")
Reported-by: "Gong, Sishuai" <sishuai@purdue.edu>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tap.c         |  7 +++----
 drivers/net/tun.c         |  5 ++---
 include/linux/netdevice.h |  3 +++
 net/core/dev.c            | 42 ++++++++++++++++++++++++++++++++++++++++++
 net/core/dev_ioctl.c      | 20 +++++++-------------
 net/core/rtnetlink.c      |  2 +-
 6 files changed, 58 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index ff4aa35979a1..8e3a28ba6b28 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1090,10 +1090,9 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 			return -ENOLINK;
 		}
 		ret = 0;
-		u = tap->dev->type;
+		dev_get_mac_address(&sa, dev_net(tap->dev), tap->dev->name);
 		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
-		    copy_to_user(&ifr->ifr_hwaddr.sa_data, tap->dev->dev_addr, ETH_ALEN) ||
-		    put_user(u, &ifr->ifr_hwaddr.sa_family))
+		    copy_to_user(&ifr->ifr_hwaddr, &sa, sizeof(sa)))
 			ret = -EFAULT;
 		tap_put_tap_dev(tap);
 		rtnl_unlock();
@@ -1108,7 +1107,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 			rtnl_unlock();
 			return -ENOLINK;
 		}
-		ret = dev_set_mac_address(tap->dev, &sa, NULL);
+		ret = dev_set_mac_address_user(tap->dev, &sa, NULL);
 		tap_put_tap_dev(tap);
 		rtnl_unlock();
 		return ret;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 62690baa19bc..fc86da7f1628 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3107,15 +3107,14 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 
 	case SIOCGIFHWADDR:
 		/* Get hw address */
-		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
-		ifr.ifr_hwaddr.sa_family = tun->dev->type;
+		dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name);
 		if (copy_to_user(argp, &ifr, ifreq_len))
 			ret = -EFAULT;
 		break;
 
 	case SIOCSIFHWADDR:
 		/* Set hw address */
-		ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr, NULL);
+		ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL);
 		break;
 
 	case TUNGETSNDBUF:
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a20310ff5083..bfadf3b82f9c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3902,6 +3902,9 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 			      struct netlink_ext_ack *extack);
 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 			struct netlink_ext_ack *extack);
+int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+			     struct netlink_ext_ack *extack);
+int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
 int dev_change_carrier(struct net_device *, bool new_carrier);
 int dev_get_phys_port_id(struct net_device *dev,
 			 struct netdev_phys_item_id *ppid);
diff --git a/net/core/dev.c b/net/core/dev.c
index 321d41a110e7..ce6291bc2e16 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8937,6 +8937,48 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 }
 EXPORT_SYMBOL(dev_set_mac_address);
 
+static DECLARE_RWSEM(dev_addr_sem);
+
+int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+			     struct netlink_ext_ack *extack)
+{
+	int ret;
+
+	down_write(&dev_addr_sem);
+	ret = dev_set_mac_address(dev, sa, extack);
+	up_write(&dev_addr_sem);
+	return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address_user);
+
+int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
+{
+	size_t size = sizeof(sa->sa_data);
+	struct net_device *dev;
+	int ret = 0;
+
+	down_read(&dev_addr_sem);
+	rcu_read_lock();
+
+	dev = dev_get_by_name_rcu(net, dev_name);
+	if (!dev) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+	if (!dev->addr_len)
+		memset(sa->sa_data, 0, size);
+	else
+		memcpy(sa->sa_data, dev->dev_addr,
+		       min_t(size_t, size, dev->addr_len));
+	sa->sa_family = dev->type;
+
+unlock:
+	rcu_read_unlock();
+	up_read(&dev_addr_sem);
+	return ret;
+}
+EXPORT_SYMBOL(dev_get_mac_address);
+
 /**
  *	dev_change_carrier - Change device carrier
  *	@dev: device
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index db8a0ff86f36..478d032f34ac 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -123,17 +123,6 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
 		ifr->ifr_mtu = dev->mtu;
 		return 0;
 
-	case SIOCGIFHWADDR:
-		if (!dev->addr_len)
-			memset(ifr->ifr_hwaddr.sa_data, 0,
-			       sizeof(ifr->ifr_hwaddr.sa_data));
-		else
-			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
-			       min(sizeof(ifr->ifr_hwaddr.sa_data),
-				   (size_t)dev->addr_len));
-		ifr->ifr_hwaddr.sa_family = dev->type;
-		return 0;
-
 	case SIOCGIFSLAVE:
 		err = -EINVAL;
 		break;
@@ -274,7 +263,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 	case SIOCSIFHWADDR:
 		if (dev->addr_len > sizeof(struct sockaddr))
 			return -EINVAL;
-		return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL);
+		return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL);
 
 	case SIOCSIFHWBROADCAST:
 		if (ifr->ifr_hwaddr.sa_family != dev->type)
@@ -418,6 +407,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
 	 */
 
 	switch (cmd) {
+	case SIOCGIFHWADDR:
+		dev_load(net, ifr->ifr_name);
+		ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name);
+		if (colon)
+			*colon = ':';
+		return ret;
 	/*
 	 *	These ioctl calls:
 	 *	- can be done by all.
@@ -427,7 +422,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
 	case SIOCGIFFLAGS:
 	case SIOCGIFMETRIC:
 	case SIOCGIFMTU:
-	case SIOCGIFHWADDR:
 	case SIOCGIFSLAVE:
 	case SIOCGIFMAP:
 	case SIOCGIFINDEX:
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c313aaf2bce1..0edc0b2baaa4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2660,7 +2660,7 @@ static int do_setlink(const struct sk_buff *skb,
 		sa->sa_family = dev->type;
 		memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
 		       dev->addr_len);
-		err = dev_set_mac_address(dev, sa, extack);
+		err = dev_set_mac_address_user(dev, sa, extack);
 		kfree(sa);
 		if (err)
 			goto errout;
-- 
cgit v1.2.3


From c294554111a835598b557db789d9ad2379b512a2 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Fri, 12 Feb 2021 10:00:23 +0200
Subject: regulator: bd718x7, bd71828, Fix dvs voltage levels

The ROHM BD718x7 and BD71828 drivers support setting HW state
specific voltages from device-tree. This is used also by various
in-tree DTS files.

These drivers do incorrectly try to compose bit-map using enum
values. By a chance this works for first two valid levels having
values 1 and 2 - but setting values for the rest of the levels
do indicate capability of setting values for first levels as
well. Luckily the regulators which support setting values for
SUSPEND/LPSR do usually also support setting values for RUN
and IDLE too - thus this has not been such a fatal issue.

Fix this by defining the old enum values as bits and fixing the
parsing code. This allows keeping existing IC specific drivers
intact and only slightly changing the rohm-regulator.c

Fixes: 21b72156ede8b ("regulator: bd718x7: Split driver to common and bd718x7 specific parts")
Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20210212080023.GA880728@localhost.localdomain
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/rohm-regulator.c |  9 ++++++---
 include/linux/mfd/rohm-generic.h   | 14 ++++++--------
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/rohm-regulator.c b/drivers/regulator/rohm-regulator.c
index 399002383b28..5c558b153d55 100644
--- a/drivers/regulator/rohm-regulator.c
+++ b/drivers/regulator/rohm-regulator.c
@@ -52,9 +52,12 @@ int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dvs,
 	char *prop;
 	unsigned int reg, mask, omask, oreg = desc->enable_reg;
 
-	for (i = 0; i < ROHM_DVS_LEVEL_MAX && !ret; i++) {
-		if (dvs->level_map & (1 << i)) {
-			switch (i + 1) {
+	for (i = 0; i < ROHM_DVS_LEVEL_VALID_AMOUNT && !ret; i++) {
+		int bit;
+
+		bit = BIT(i);
+		if (dvs->level_map & bit) {
+			switch (bit) {
 			case ROHM_DVS_LEVEL_RUN:
 				prop = "rohm,dvs-run-voltage";
 				reg = dvs->run_reg;
diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
index 4283b5b33e04..2b85b9deb03a 100644
--- a/include/linux/mfd/rohm-generic.h
+++ b/include/linux/mfd/rohm-generic.h
@@ -20,14 +20,12 @@ struct rohm_regmap_dev {
 	struct regmap *regmap;
 };
 
-enum {
-	ROHM_DVS_LEVEL_UNKNOWN,
-	ROHM_DVS_LEVEL_RUN,
-	ROHM_DVS_LEVEL_IDLE,
-	ROHM_DVS_LEVEL_SUSPEND,
-	ROHM_DVS_LEVEL_LPSR,
-	ROHM_DVS_LEVEL_MAX = ROHM_DVS_LEVEL_LPSR,
-};
+#define ROHM_DVS_LEVEL_RUN		BIT(0)
+#define ROHM_DVS_LEVEL_IDLE		BIT(1)
+#define ROHM_DVS_LEVEL_SUSPEND		BIT(2)
+#define ROHM_DVS_LEVEL_LPSR		BIT(3)
+#define ROHM_DVS_LEVEL_VALID_AMOUNT	4
+#define ROHM_DVS_LEVEL_UNKNOWN		0
 
 /**
  * struct rohm_dvs_config - dynamic voltage scaling register descriptions
-- 
cgit v1.2.3


From 38f3885edbef8a77b25c4d13f3de06a7b93d02de Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 12 Feb 2021 16:11:17 +0200
Subject: ACPI: property: Remove dead code

After the commit 3a7a2ab839ad couple of functions became a dead code.
Moreover, for all these years nobody used them. Remove.

Fixes: 3a7a2ab839ad ("ACPI / property: Extend fwnode_property_* to data-only subnodes")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c | 20 --------------------
 include/linux/acpi.h    | 21 ---------------------
 2 files changed, 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 16b28084c1ca..22ccab4e7c6d 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -841,20 +841,6 @@ static int acpi_data_prop_read_single(const struct acpi_device_data *data,
 	return ret;
 }
 
-int acpi_dev_prop_read_single(struct acpi_device *adev, const char *propname,
-			      enum dev_prop_type proptype, void *val)
-{
-	int ret;
-
-	if (!adev || !val)
-		return -EINVAL;
-
-	ret = acpi_data_prop_read_single(&adev->data, propname, proptype, val);
-	if (ret < 0 || proptype != ACPI_TYPE_STRING)
-		return ret;
-	return 0;
-}
-
 static int acpi_copy_property_array_u8(const union acpi_object *items, u8 *val,
 				       size_t nval)
 {
@@ -995,12 +981,6 @@ static int acpi_data_prop_read(const struct acpi_device_data *data,
 	return ret;
 }
 
-int acpi_dev_prop_read(const struct acpi_device *adev, const char *propname,
-		       enum dev_prop_type proptype, void *val, size_t nval)
-{
-	return adev ? acpi_data_prop_read(&adev->data, propname, proptype, val, nval) : -EINVAL;
-}
-
 /**
  * acpi_node_prop_read - retrieve the value of an ACPI property with given name.
  * @fwnode: Firmware node to get the property from.
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 053bf05fb1f7..a47eaf448131 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1114,14 +1114,9 @@ acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
 
 int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
 		       void **valptr);
-int acpi_dev_prop_read_single(struct acpi_device *adev,
-			      const char *propname, enum dev_prop_type proptype,
-			      void *val);
 int acpi_node_prop_read(const struct fwnode_handle *fwnode,
 			const char *propname, enum dev_prop_type proptype,
 			void *val, size_t nval);
-int acpi_dev_prop_read(const struct acpi_device *adev, const char *propname,
-		       enum dev_prop_type proptype, void *val, size_t nval);
 
 struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode,
 					    struct fwnode_handle *child);
@@ -1223,14 +1218,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode,
 	return -ENXIO;
 }
 
-static inline int acpi_dev_prop_read_single(const struct acpi_device *adev,
-					    const char *propname,
-					    enum dev_prop_type proptype,
-					    void *val)
-{
-	return -ENXIO;
-}
-
 static inline int acpi_node_prop_read(const struct fwnode_handle *fwnode,
 				      const char *propname,
 				      enum dev_prop_type proptype,
@@ -1239,14 +1226,6 @@ static inline int acpi_node_prop_read(const struct fwnode_handle *fwnode,
 	return -ENXIO;
 }
 
-static inline int acpi_dev_prop_read(const struct acpi_device *adev,
-				     const char *propname,
-				     enum dev_prop_type proptype,
-				     void *val, size_t nval)
-{
-	return -ENXIO;
-}
-
 static inline struct fwnode_handle *
 acpi_get_next_subnode(const struct fwnode_handle *fwnode,
 		      struct fwnode_handle *child)
-- 
cgit v1.2.3


From 325aa816143228a0b3472074ffb50d55ac3f04fe Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 12 Feb 2021 16:11:18 +0200
Subject: ACPI: property: Make acpi_node_prop_read() static

There is no users outside of property.c. No need to export
acpi_node_prop_read(), hence make it static.

Fixes: 3708184afc77 ("device property: Move FW type specific functionality to FW specific files")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c |  6 +++---
 include/linux/acpi.h    | 11 -----------
 2 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 22ccab4e7c6d..2b65ad9b4c0d 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -993,9 +993,9 @@ static int acpi_data_prop_read(const struct acpi_device_data *data,
  * of the property.  Otherwise, read at most @nval values to the array at the
  * location pointed to by @val.
  */
-int acpi_node_prop_read(const struct fwnode_handle *fwnode,
-			const char *propname, enum dev_prop_type proptype,
-			void *val, size_t nval)
+static int acpi_node_prop_read(const struct fwnode_handle *fwnode,
+			       const char *propname, enum dev_prop_type proptype,
+			       void *val, size_t nval)
 {
 	return acpi_data_prop_read(acpi_device_data_of_node(fwnode),
 				   propname, proptype, val, nval);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a47eaf448131..dc6e1f39dc6f 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1114,9 +1114,6 @@ acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
 
 int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
 		       void **valptr);
-int acpi_node_prop_read(const struct fwnode_handle *fwnode,
-			const char *propname, enum dev_prop_type proptype,
-			void *val, size_t nval);
 
 struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode,
 					    struct fwnode_handle *child);
@@ -1218,14 +1215,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode,
 	return -ENXIO;
 }
 
-static inline int acpi_node_prop_read(const struct fwnode_handle *fwnode,
-				      const char *propname,
-				      enum dev_prop_type proptype,
-				      void *val, size_t nval)
-{
-	return -ENXIO;
-}
-
 static inline struct fwnode_handle *
 acpi_get_next_subnode(const struct fwnode_handle *fwnode,
 		      struct fwnode_handle *child)
-- 
cgit v1.2.3


From f9ab49184af093f0bf6c0e6583f5b25da2c09ff5 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat, 23 Jan 2021 21:10:27 +0100
Subject: blk-mq: Use llist_head for blk_cpu_done

With llist_head it is possible to avoid the locking (the irq-off region)
when items are added. This makes it possible to add items on a remote
CPU without additional locking.
llist_add() returns true if the list was previously empty. This can be
used to invoke the SMP function call / raise sofirq only if the first
item was added (otherwise it is already pending).
This simplifies the code a little and reduces the IRQ-off regions.

blk_mq_raise_softirq() needs a preempt-disable section to ensure the
request is enqueued on the same CPU as the softirq is raised.
Some callers (USB-storage) invoke this path in preemptible context.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 97 +++++++++++++++++++++-----------------------------
 include/linux/blkdev.h |  2 +-
 2 files changed, 42 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 90348ae51846..463de2981df8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -41,7 +41,7 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
 
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
@@ -567,68 +567,29 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
 }
 EXPORT_SYMBOL(blk_mq_end_request);
 
-/*
- * Softirq action handler - move entries to local list and loop over them
- * while passing them to the queue registered handler.
- */
-static __latent_entropy void blk_done_softirq(struct softirq_action *h)
+static void blk_complete_reqs(struct llist_head *list)
 {
-	struct list_head *cpu_list, local_list;
-
-	local_irq_disable();
-	cpu_list = this_cpu_ptr(&blk_cpu_done);
-	list_replace_init(cpu_list, &local_list);
-	local_irq_enable();
-
-	while (!list_empty(&local_list)) {
-		struct request *rq;
+	struct llist_node *entry = llist_reverse_order(llist_del_all(list));
+	struct request *rq, *next;
 
-		rq = list_entry(local_list.next, struct request, ipi_list);
-		list_del_init(&rq->ipi_list);
+	llist_for_each_entry_safe(rq, next, entry, ipi_list)
 		rq->q->mq_ops->complete(rq);
-	}
 }
 
-static void blk_mq_trigger_softirq(struct request *rq)
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
 {
-	struct list_head *list;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	list = this_cpu_ptr(&blk_cpu_done);
-	list_add_tail(&rq->ipi_list, list);
-
-	/*
-	 * If the list only contains our just added request, signal a raise of
-	 * the softirq.  If there are already entries there, someone already
-	 * raised the irq but it hasn't run yet.
-	 */
-	if (list->next == &rq->ipi_list)
-		raise_softirq_irqoff(BLOCK_SOFTIRQ);
-	local_irq_restore(flags);
+	blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
 }
 
 static int blk_softirq_cpu_dead(unsigned int cpu)
 {
-	/*
-	 * If a CPU goes away, splice its entries to the current CPU
-	 * and trigger a run of the softirq
-	 */
-	local_irq_disable();
-	list_splice_init(&per_cpu(blk_cpu_done, cpu),
-			 this_cpu_ptr(&blk_cpu_done));
-	raise_softirq_irqoff(BLOCK_SOFTIRQ);
-	local_irq_enable();
-
+	blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
 	return 0;
 }
 
-
 static void __blk_mq_complete_request_remote(void *data)
 {
-	struct request *rq = data;
-
-	blk_mq_trigger_softirq(rq);
+	__raise_softirq_irqoff(BLOCK_SOFTIRQ);
 }
 
 static inline bool blk_mq_complete_need_ipi(struct request *rq)
@@ -657,6 +618,30 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
 	return cpu_online(rq->mq_ctx->cpu);
 }
 
+static void blk_mq_complete_send_ipi(struct request *rq)
+{
+	struct llist_head *list;
+	unsigned int cpu;
+
+	cpu = rq->mq_ctx->cpu;
+	list = &per_cpu(blk_cpu_done, cpu);
+	if (llist_add(&rq->ipi_list, list)) {
+		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
+		smp_call_function_single_async(cpu, &rq->csd);
+	}
+}
+
+static void blk_mq_raise_softirq(struct request *rq)
+{
+	struct llist_head *list;
+
+	preempt_disable();
+	list = this_cpu_ptr(&blk_cpu_done);
+	if (llist_add(&rq->ipi_list, list))
+		raise_softirq(BLOCK_SOFTIRQ);
+	preempt_enable();
+}
+
 bool blk_mq_complete_request_remote(struct request *rq)
 {
 	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
@@ -669,15 +654,15 @@ bool blk_mq_complete_request_remote(struct request *rq)
 		return false;
 
 	if (blk_mq_complete_need_ipi(rq)) {
-		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
-		smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
-	} else {
-		if (rq->q->nr_hw_queues > 1)
-			return false;
-		blk_mq_trigger_softirq(rq);
+		blk_mq_complete_send_ipi(rq);
+		return true;
 	}
 
-	return true;
+	if (rq->q->nr_hw_queues == 1) {
+		blk_mq_raise_softirq(rq);
+		return true;
+	}
+	return false;
 }
 EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
 
@@ -3892,7 +3877,7 @@ static int __init blk_mq_init(void)
 	int i;
 
 	for_each_possible_cpu(i)
-		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+		init_llist_head(&per_cpu(blk_cpu_done, i));
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
 
 	cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f94ee3089e01..89a444c5a583 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -153,7 +153,7 @@ struct request {
 	 */
 	union {
 		struct hlist_node hash;	/* merge hash */
-		struct list_head ipi_list;
+		struct llist_node ipi_list;
 	};
 
 	/*
-- 
cgit v1.2.3


From 731ecea3e5495aa6bd3cb8587f5267cf5e4220e2 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Fri, 18 Dec 2020 14:07:58 +0000
Subject: mm: Remove arch_remap() and mm-arch-hooks.h

powerpc was the last provider of arch_remap() and the last
user of mm-arch-hooks.h.

Since commit 526a9c4a7234 ("powerpc/vdso: Provide vdso_remap()"),
arch_remap() hence mm-arch-hooks.h are not used anymore.

Remove them.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/um/include/asm/Kbuild          |  1 -
 include/asm-generic/Kbuild          |  1 -
 include/asm-generic/mm-arch-hooks.h | 16 ----------------
 include/linux/mm-arch-hooks.h       | 22 ----------------------
 mm/mremap.c                         |  3 ---
 5 files changed, 43 deletions(-)
 delete mode 100644 include/asm-generic/mm-arch-hooks.h
 delete mode 100644 include/linux/mm-arch-hooks.h

(limited to 'include/linux')

diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 1c63b260ecc4..314979467db1 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -14,7 +14,6 @@ generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h
 generic-y += mcs_spinlock.h
-generic-y += mm-arch-hooks.h
 generic-y += mmiowb.h
 generic-y += module.lds.h
 generic-y += param.h
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 267f6dfb8960..7282c0f50c85 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -35,7 +35,6 @@ mandatory-y += kprobes.h
 mandatory-y += linkage.h
 mandatory-y += local.h
 mandatory-y += local64.h
-mandatory-y += mm-arch-hooks.h
 mandatory-y += mmiowb.h
 mandatory-y += mmu.h
 mandatory-y += mmu_context.h
diff --git a/include/asm-generic/mm-arch-hooks.h b/include/asm-generic/mm-arch-hooks.h
deleted file mode 100644
index 5ff0e5193f85..000000000000
--- a/include/asm-generic/mm-arch-hooks.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Architecture specific mm hooks
- */
-
-#ifndef _ASM_GENERIC_MM_ARCH_HOOKS_H
-#define _ASM_GENERIC_MM_ARCH_HOOKS_H
-
-/*
- * This file should be included through arch/../include/asm/Kbuild for
- * the architecture which doesn't need specific mm hooks.
- *
- * In that case, the generic hooks defined in include/linux/mm-arch-hooks.h
- * are used.
- */
-
-#endif /* _ASM_GENERIC_MM_ARCH_HOOKS_H */
diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
deleted file mode 100644
index 9c4bedc95504..000000000000
--- a/include/linux/mm-arch-hooks.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Generic mm no-op hooks.
- *
- * Copyright (C) 2015, IBM Corporation
- * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
- */
-#ifndef _LINUX_MM_ARCH_HOOKS_H
-#define _LINUX_MM_ARCH_HOOKS_H
-
-#include <asm/mm-arch-hooks.h>
-
-#ifndef arch_remap
-static inline void arch_remap(struct mm_struct *mm,
-			      unsigned long old_start, unsigned long old_end,
-			      unsigned long new_start, unsigned long new_end)
-{
-}
-#define arch_remap arch_remap
-#endif
-
-#endif /* _LINUX_MM_ARCH_HOOKS_H */
diff --git a/mm/mremap.c b/mm/mremap.c
index f554320281cc..a488eb3d1447 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,7 +22,6 @@
 #include <linux/syscalls.h>
 #include <linux/mmu_notifier.h>
 #include <linux/uaccess.h>
-#include <linux/mm-arch-hooks.h>
 #include <linux/userfaultfd_k.h>
 
 #include <asm/cacheflush.h>
@@ -562,8 +561,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		new_addr = err;
 	} else {
 		mremap_userfaultfd_prep(new_vma, uf);
-		arch_remap(mm, old_addr, old_addr + old_len,
-			   new_addr, new_addr + new_len);
 	}
 
 	/* Conceal VM_ACCOUNT so old reservation is not undone */
-- 
cgit v1.2.3


From 5f7d57280c1982d993d5f4ff0edac310f820f607 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 9 Feb 2021 14:38:29 +0100
Subject: bpf: Drop MTU check when doing TC-BPF redirect to ingress

The use-case for dropping the MTU check when TC-BPF does redirect to
ingress, is described by Eyal Birger in email[0]. The summary is the
ability to increase packet size (e.g. with IPv6 headers for NAT64) and
ingress redirect packet and let normal netstack fragment packet as needed.

[0] https://lore.kernel.org/netdev/CAHsH6Gug-hsLGHQ6N0wtixdOa85LDZ3HNRHVd0opR=19Qo4W4Q@mail.gmail.com/

V15:
 - missing static for function declaration

V9:
 - Make net_device "up" (IFF_UP) check explicit in skb_do_redirect

V4:
 - Keep net_device "up" (IFF_UP) check.
 - Adjustment to handle bpf_redirect_peer() helper

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/161287790971.790810.11785274340154740591.stgit@firesoul
---
 include/linux/netdevice.h | 32 ++++++++++++++++++++++++++++++--
 net/core/dev.c            | 32 ++++++++++++++------------------
 net/core/filter.c         |  6 +++---
 3 files changed, 47 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ef517254367d..b9bcbfde7849 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3931,14 +3931,42 @@ int xdp_umem_query(struct net_device *dev, u16 queue_id);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
+int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
 			const struct sk_buff *skb);
 
+static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
+						 const struct sk_buff *skb,
+						 const bool check_mtu)
+{
+	const u32 vlan_hdr_len = 4; /* VLAN_HLEN */
+	unsigned int len;
+
+	if (!(dev->flags & IFF_UP))
+		return false;
+
+	if (!check_mtu)
+		return true;
+
+	len = dev->mtu + dev->hard_header_len + vlan_hdr_len;
+	if (skb->len <= len)
+		return true;
+
+	/* if TSO is enabled, we don't care about the length as the packet
+	 * could be forwarded without being segmented before
+	 */
+	if (skb_is_gso(skb))
+		return true;
+
+	return false;
+}
+
 static __always_inline int ____dev_forward_skb(struct net_device *dev,
-					       struct sk_buff *skb)
+					       struct sk_buff *skb,
+					       const bool check_mtu)
 {
 	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
-	    unlikely(!is_skb_forwardable(dev, skb))) {
+	    unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) {
 		atomic_long_inc(&dev->rx_dropped);
 		kfree_skb(skb);
 		return NET_RX_DROP;
diff --git a/net/core/dev.c b/net/core/dev.c
index 07a0347c33fb..8c820fe0eff3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2194,28 +2194,14 @@ static inline void net_timestamp_set(struct sk_buff *skb)
 
 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 {
-	unsigned int len;
-
-	if (!(dev->flags & IFF_UP))
-		return false;
-
-	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
-	if (skb->len <= len)
-		return true;
-
-	/* if TSO is enabled, we don't care about the length as the packet
-	 * could be forwarded without being segmented before
-	 */
-	if (skb_is_gso(skb))
-		return true;
-
-	return false;
+	return __is_skb_forwardable(dev, skb, true);
 }
 EXPORT_SYMBOL_GPL(is_skb_forwardable);
 
-int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
+			      bool check_mtu)
 {
-	int ret = ____dev_forward_skb(dev, skb);
+	int ret = ____dev_forward_skb(dev, skb, check_mtu);
 
 	if (likely(!ret)) {
 		skb->protocol = eth_type_trans(skb, dev);
@@ -2224,6 +2210,11 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 
 	return ret;
 }
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+	return __dev_forward_skb2(dev, skb, true);
+}
 EXPORT_SYMBOL_GPL(__dev_forward_skb);
 
 /**
@@ -2250,6 +2241,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(dev_forward_skb);
 
+int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
+{
+	return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
+}
+
 static inline int deliver_skb(struct sk_buff *skb,
 			      struct packet_type *pt_prev,
 			      struct net_device *orig_dev)
diff --git a/net/core/filter.c b/net/core/filter.c
index 439f43f00483..7059cf604d94 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2083,13 +2083,13 @@ static const struct bpf_func_proto bpf_csum_level_proto = {
 
 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
 {
-	return dev_forward_skb(dev, skb);
+	return dev_forward_skb_nomtu(dev, skb);
 }
 
 static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
 				      struct sk_buff *skb)
 {
-	int ret = ____dev_forward_skb(dev, skb);
+	int ret = ____dev_forward_skb(dev, skb, false);
 
 	if (likely(!ret)) {
 		skb->dev = dev;
@@ -2480,7 +2480,7 @@ int skb_do_redirect(struct sk_buff *skb)
 			goto out_drop;
 		dev = ops->ndo_get_peer_dev(dev);
 		if (unlikely(!dev ||
-			     !is_skb_forwardable(dev, skb) ||
+			     !(dev->flags & IFF_UP) ||
 			     net_eq(net, dev_net(dev))))
 			goto out_drop;
 		skb->dev = dev;
-- 
cgit v1.2.3


From 83c4a4eec06a8fc46fc68c437424f9c89e4d9c72 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 11 Feb 2021 17:27:44 -0600
Subject: of: Remove of_dev_{get,put}()

of_dev_get() and of_dev_put are just wrappers for get_device()/put_device()
on a platform_device. There's also already platform_device_{get,put}()
wrappers for this purpose. Let's update the few users and remove
of_dev_{get,put}().

Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Patrice Chotard <patrice.chotard@st.com>
Cc: Felipe Balbi <balbi@kernel.org>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Gilles Muller <Gilles.Muller@inria.fr>
Cc: Nicolas Palix <nicolas.palix@imag.fr>
Cc: Michal Marek <michal.lkml@markovi.net>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: netdev@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-usb@vger.kernel.org
Cc: cocci@systeme.lip6.fr
Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20210211232745.1498137-2-robh@kernel.org
---
 arch/powerpc/platforms/pseries/ibmebus.c |  4 ++--
 drivers/net/ethernet/ibm/emac/core.c     | 15 ++++++++-------
 drivers/of/device.c                      | 21 ---------------------
 drivers/of/platform.c                    |  4 ++--
 drivers/of/unittest.c                    |  2 +-
 drivers/usb/dwc3/dwc3-st.c               |  2 +-
 include/linux/of_device.h                |  3 ---
 scripts/coccinelle/free/put_device.cocci |  1 -
 8 files changed, 14 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c
index 8c6e509f6967..a15ab33646b3 100644
--- a/arch/powerpc/platforms/pseries/ibmebus.c
+++ b/arch/powerpc/platforms/pseries/ibmebus.c
@@ -355,12 +355,12 @@ static int ibmebus_bus_device_probe(struct device *dev)
 	if (!drv->probe)
 		return error;
 
-	of_dev_get(of_dev);
+	get_device(dev);
 
 	if (of_driver_match_device(dev, dev->driver))
 		error = drv->probe(of_dev);
 	if (error)
-		of_dev_put(of_dev);
+		put_device(dev);
 
 	return error;
 }
diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index c00b9097eeea..471be6ec7e8a 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -38,6 +38,7 @@
 #include <linux/of_irq.h>
 #include <linux/of_net.h>
 #include <linux/of_mdio.h>
+#include <linux/platform_device.h>
 #include <linux/slab.h>
 
 #include <asm/processor.h>
@@ -2390,11 +2391,11 @@ static int emac_check_deps(struct emac_instance *dev,
 
 static void emac_put_deps(struct emac_instance *dev)
 {
-	of_dev_put(dev->mal_dev);
-	of_dev_put(dev->zmii_dev);
-	of_dev_put(dev->rgmii_dev);
-	of_dev_put(dev->mdio_dev);
-	of_dev_put(dev->tah_dev);
+	platform_device_put(dev->mal_dev);
+	platform_device_put(dev->zmii_dev);
+	platform_device_put(dev->rgmii_dev);
+	platform_device_put(dev->mdio_dev);
+	platform_device_put(dev->tah_dev);
 }
 
 static int emac_of_bus_notify(struct notifier_block *nb, unsigned long action,
@@ -2435,7 +2436,7 @@ static int emac_wait_deps(struct emac_instance *dev)
 	for (i = 0; i < EMAC_DEP_COUNT; i++) {
 		of_node_put(deps[i].node);
 		if (err)
-			of_dev_put(deps[i].ofdev);
+			platform_device_put(deps[i].ofdev);
 	}
 	if (err == 0) {
 		dev->mal_dev = deps[EMAC_DEP_MAL_IDX].ofdev;
@@ -2444,7 +2445,7 @@ static int emac_wait_deps(struct emac_instance *dev)
 		dev->tah_dev = deps[EMAC_DEP_TAH_IDX].ofdev;
 		dev->mdio_dev = deps[EMAC_DEP_MDIO_IDX].ofdev;
 	}
-	of_dev_put(deps[EMAC_DEP_PREV_IDX].ofdev);
+	platform_device_put(deps[EMAC_DEP_PREV_IDX].ofdev);
 	return err;
 }
 
diff --git a/drivers/of/device.c b/drivers/of/device.c
index aedfaaafd3e7..9a748855b39d 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -33,27 +33,6 @@ const struct of_device_id *of_match_device(const struct of_device_id *matches,
 }
 EXPORT_SYMBOL(of_match_device);
 
-struct platform_device *of_dev_get(struct platform_device *dev)
-{
-	struct device *tmp;
-
-	if (!dev)
-		return NULL;
-	tmp = get_device(&dev->dev);
-	if (tmp)
-		return to_platform_device(tmp);
-	else
-		return NULL;
-}
-EXPORT_SYMBOL(of_dev_get);
-
-void of_dev_put(struct platform_device *dev)
-{
-	if (dev)
-		put_device(&dev->dev);
-}
-EXPORT_SYMBOL(of_dev_put);
-
 int of_device_add(struct platform_device *ofdev)
 {
 	BUG_ON(ofdev->dev.of_node == NULL);
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 79bd5f5a1bf1..020bf860c72c 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -687,7 +687,7 @@ static int of_platform_notify(struct notifier_block *nb,
 		pdev_parent = of_find_device_by_node(rd->dn->parent);
 		pdev = of_platform_device_create(rd->dn, NULL,
 				pdev_parent ? &pdev_parent->dev : NULL);
-		of_dev_put(pdev_parent);
+		platform_device_put(pdev_parent);
 
 		if (pdev == NULL) {
 			pr_err("%s: failed to create for '%pOF'\n",
@@ -712,7 +712,7 @@ static int of_platform_notify(struct notifier_block *nb,
 		of_platform_device_destroy(&pdev->dev, &children_left);
 
 		/* and put the reference of the find */
-		of_dev_put(pdev);
+		platform_device_put(pdev);
 		break;
 	}
 
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index eb51bc147440..eb100627c186 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -1286,7 +1286,7 @@ static void __init of_unittest_platform_populate(void)
 			unittest(pdev,
 				 "Could not create device for node '%pOFn'\n",
 				 grandchild);
-			of_dev_put(pdev);
+			platform_device_put(pdev);
 		}
 	}
 
diff --git a/drivers/usb/dwc3/dwc3-st.c b/drivers/usb/dwc3/dwc3-st.c
index e733be840545..b06b7092b1a2 100644
--- a/drivers/usb/dwc3/dwc3-st.c
+++ b/drivers/usb/dwc3/dwc3-st.c
@@ -274,7 +274,7 @@ static int st_dwc3_probe(struct platform_device *pdev)
 
 	dwc3_data->dr_mode = usb_get_dr_mode(&child_pdev->dev);
 	of_node_put(child);
-	of_dev_put(child_pdev);
+	platform_device_put(child_pdev);
 
 	/*
 	 * Configure the USB port as device or host according to the static
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 937f32f6aecb..d7a407dfeecb 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -26,9 +26,6 @@ static inline int of_driver_match_device(struct device *dev,
 	return of_match_device(drv->of_match_table, dev) != NULL;
 }
 
-extern struct platform_device *of_dev_get(struct platform_device *dev);
-extern void of_dev_put(struct platform_device *dev);
-
 extern int of_device_add(struct platform_device *pdev);
 extern int of_device_register(struct platform_device *ofdev);
 extern void of_device_unregister(struct platform_device *ofdev);
diff --git a/scripts/coccinelle/free/put_device.cocci b/scripts/coccinelle/free/put_device.cocci
index 120921366e84..f09f1e79bfa6 100644
--- a/scripts/coccinelle/free/put_device.cocci
+++ b/scripts/coccinelle/free/put_device.cocci
@@ -21,7 +21,6 @@ id = of_find_device_by_node@p1(x)
 if (id == NULL || ...) { ... return ...; }
 ... when != put_device(&id->dev)
     when != platform_device_put(id)
-    when != of_dev_put(id)
     when != if (id) { ... put_device(&id->dev) ... }
     when != e1 = (T)id
     when != e1 = (T)(&id->dev)
-- 
cgit v1.2.3


From cb8be8b4b27f6eea88268d6991175df1a27e557e Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 11 Feb 2021 17:27:45 -0600
Subject: driver core: platform: Drop of_device_node_put() wrapper

of_device_node_put() is just a wrapper for of_node_put(). The platform
driver core is already polluted with of_node pointers and the only 'get'
already uses of_node_get() (though typically the get would happen in
of_device_alloc()).

Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20210211232745.1498137-3-robh@kernel.org
---
 drivers/base/platform.c   | 2 +-
 include/linux/of_device.h | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 95fd1549f87d..9d5171e5f967 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -571,7 +571,7 @@ static void platform_device_release(struct device *dev)
 	struct platform_object *pa = container_of(dev, struct platform_object,
 						  pdev.dev);
 
-	of_device_node_put(&pa->pdev.dev);
+	of_node_put(pa->pdev.dev.of_node);
 	kfree(pa->pdev.dev.platform_data);
 	kfree(pa->pdev.mfd_cell);
 	kfree(pa->pdev.resource);
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index d7a407dfeecb..1d7992a02e36 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -38,11 +38,6 @@ extern int of_device_request_module(struct device *dev);
 extern void of_device_uevent(struct device *dev, struct kobj_uevent_env *env);
 extern int of_device_uevent_modalias(struct device *dev, struct kobj_uevent_env *env);
 
-static inline void of_device_node_put(struct device *dev)
-{
-	of_node_put(dev->of_node);
-}
-
 static inline struct device_node *of_cpu_device_node_get(int cpu)
 {
 	struct device *cpu_dev;
@@ -94,8 +89,6 @@ static inline int of_device_uevent_modalias(struct device *dev,
 	return -ENODEV;
 }
 
-static inline void of_device_node_put(struct device *dev) { }
-
 static inline const struct of_device_id *of_match_device(
 		const struct of_device_id *matches, const struct device *dev)
 {
-- 
cgit v1.2.3


From e5069b9c23b3857db986c58801bebe450cff3392 Mon Sep 17 00:00:00 2001
From: Dmitrii Banshchikov <me@ubique.spb.ru>
Date: Sat, 13 Feb 2021 00:56:41 +0400
Subject: bpf: Support pointers in global func args

Add an ability to pass a pointer to a type with known size in arguments
of a global function. Such pointers may be used to overcome the limit on
the maximum number of arguments, avoid expensive and tricky workarounds
and to have multiple output arguments.

A referenced type may contain pointers but indirect access through them
isn't supported.

The implementation consists of two parts.  If a global function has an
argument that is a pointer to a type with known size then:

  1) In btf_check_func_arg_match(): check that the corresponding
register points to NULL or to a valid memory region that is large enough
to contain the expected argument's type.

  2) In btf_prepare_func_args(): set the corresponding register type to
PTR_TO_MEM_OR_NULL and its size to the size of the expected type.

Only global functions are supported because allowance of pointers for
static functions might break validation. Consider the following
scenario. A static function has a pointer argument. A caller passes
pointer to its stack memory. Because the callee can change referenced
memory verifier cannot longer assume any particular slot type of the
caller's stack memory hence the slot type is changed to SLOT_MISC.  If
there is an operation that relies on slot type other than SLOT_MISC then
verifier won't be able to infer safety of the operation.

When verifier sees a static function that has a pointer argument
different from PTR_TO_CTX then it skips arguments check and continues
with "inline" validation with more information available. The operation
that relies on the particular slot type now succeeds.

Because global functions were not allowed to have pointer arguments
different from PTR_TO_CTX it's not possible to break existing and valid
code.

Signed-off-by: Dmitrii Banshchikov <me@ubique.spb.ru>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210212205642.620788-4-me@ubique.spb.ru
---
 include/linux/bpf_verifier.h |  2 ++
 kernel/bpf/btf.c             | 55 ++++++++++++++++++++++++++++++++++++--------
 kernel/bpf/verifier.c        | 30 ++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 532c97836d0d..971b33aca13d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -471,6 +471,8 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);
 
 int check_ctx_reg(struct bpf_verifier_env *env,
 		  const struct bpf_reg_state *reg, int regno);
+int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+		   u32 regno, u32 mem_size);
 
 /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
 static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index bd5d2c563693..2efeb5f4b343 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5297,9 +5297,10 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 	struct bpf_prog *prog = env->prog;
 	struct btf *btf = prog->aux->btf;
 	const struct btf_param *args;
-	const struct btf_type *t;
-	u32 i, nargs, btf_id;
+	const struct btf_type *t, *ref_t;
+	u32 i, nargs, btf_id, type_size;
 	const char *tname;
+	bool is_global;
 
 	if (!prog->aux->func_info)
 		return -EINVAL;
@@ -5333,6 +5334,8 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 		bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs);
 		goto out;
 	}
+
+	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
 	/* check that BTF function arguments match actual types that the
 	 * verifier sees.
 	 */
@@ -5349,10 +5352,6 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 			goto out;
 		}
 		if (btf_type_is_ptr(t)) {
-			if (reg->type == SCALAR_VALUE) {
-				bpf_log(log, "R%d is not a pointer\n", i + 1);
-				goto out;
-			}
 			/* If function expects ctx type in BTF check that caller
 			 * is passing PTR_TO_CTX.
 			 */
@@ -5367,6 +5366,25 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 					goto out;
 				continue;
 			}
+
+			if (!is_global)
+				goto out;
+
+			t = btf_type_skip_modifiers(btf, t->type, NULL);
+
+			ref_t = btf_resolve_size(btf, t, &type_size);
+			if (IS_ERR(ref_t)) {
+				bpf_log(log,
+				    "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+				    i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
+					PTR_ERR(ref_t));
+				goto out;
+			}
+
+			if (check_mem_reg(env, reg, i + 1, type_size))
+				goto out;
+
+			continue;
 		}
 		bpf_log(log, "Unrecognized arg#%d type %s\n",
 			i, btf_kind_str[BTF_INFO_KIND(t->info)]);
@@ -5397,7 +5415,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 	enum bpf_prog_type prog_type = prog->type;
 	struct btf *btf = prog->aux->btf;
 	const struct btf_param *args;
-	const struct btf_type *t;
+	const struct btf_type *t, *ref_t;
 	u32 i, nargs, btf_id;
 	const char *tname;
 
@@ -5470,9 +5488,26 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			reg->type = SCALAR_VALUE;
 			continue;
 		}
-		if (btf_type_is_ptr(t) &&
-		    btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
-			reg->type = PTR_TO_CTX;
+		if (btf_type_is_ptr(t)) {
+			if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+				reg->type = PTR_TO_CTX;
+				continue;
+			}
+
+			t = btf_type_skip_modifiers(btf, t->type, NULL);
+
+			ref_t = btf_resolve_size(btf, t, &reg->mem_size);
+			if (IS_ERR(ref_t)) {
+				bpf_log(log,
+				    "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+				    i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
+					PTR_ERR(ref_t));
+				return -EINVAL;
+			}
+
+			reg->type = PTR_TO_MEM_OR_NULL;
+			reg->id = ++env->id_gen;
+
 			continue;
 		}
 		bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a9f75bd7f8d3..11a242932a2c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4272,6 +4272,29 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	}
 }
 
+int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+		   u32 regno, u32 mem_size)
+{
+	if (register_is_null(reg))
+		return 0;
+
+	if (reg_type_may_be_null(reg->type)) {
+		/* Assuming that the register contains a value check if the memory
+		 * access is safe. Temporarily save and restore the register's state as
+		 * the conversion shouldn't be visible to a caller.
+		 */
+		const struct bpf_reg_state saved_reg = *reg;
+		int rv;
+
+		mark_ptr_not_null_reg(reg);
+		rv = check_helper_mem_access(env, regno, mem_size, true, NULL);
+		*reg = saved_reg;
+		return rv;
+	}
+
+	return check_helper_mem_access(env, regno, mem_size, true, NULL);
+}
+
 /* Implementation details:
  * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
  * Two bpf_map_lookups (even with the same key) will have different reg->id.
@@ -11960,6 +11983,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 				mark_reg_known_zero(env, regs, i);
 			else if (regs[i].type == SCALAR_VALUE)
 				mark_reg_unknown(env, regs, i);
+			else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
+				const u32 mem_size = regs[i].mem_size;
+
+				mark_reg_known_zero(env, regs, i);
+				regs[i].mem_size = mem_size;
+				regs[i].id = ++env->id_gen;
+			}
 		}
 	} else {
 		/* 1st arg to a function */
-- 
cgit v1.2.3


From 6d4e9a8efe3d59f31367d79e970c2f328da139a4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 10 Feb 2021 10:56:39 +0100
Subject: driver core: lift dma_default_coherent into common code

Lift the dma_default_coherent variable from the mips architecture code
to the driver core.  This allows an architecture to sdefault all device
to be DMA coherent at run time, even if the kernel is build with support
for DMA noncoherent device.  By allowing device_initialize to set the
->dma_coherent field to this default the amount of arch hooks required
for this behavior can be greatly reduced.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/Kconfig                     |  1 -
 arch/mips/alchemy/common/setup.c      |  2 +-
 arch/mips/include/asm/dma-coherence.h | 22 ----------------------
 arch/mips/kernel/setup.c              |  4 ----
 arch/mips/mm/c-r4k.c                  |  2 +-
 arch/mips/mm/dma-noncoherent.c        |  1 -
 arch/mips/mti-malta/malta-setup.c     |  2 +-
 arch/mips/pci/pci-alchemy.c           |  2 +-
 arch/mips/pistachio/init.c            |  1 -
 drivers/base/core.c                   |  6 ++++++
 include/linux/dma-map-ops.h           |  5 ++---
 kernel/dma/Kconfig                    |  3 ---
 kernel/dma/mapping.c                  |  2 ++
 13 files changed, 14 insertions(+), 39 deletions(-)
 delete mode 100644 arch/mips/include/asm/dma-coherence.h

(limited to 'include/linux')

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 57b06798706c..a6d73c763be1 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1163,7 +1163,6 @@ config ARCH_SUPPORTS_UPROBES
 	bool
 
 config DMA_MAYBE_COHERENT
-	select ARCH_HAS_DMA_COHERENCE_H
 	select DMA_NONCOHERENT
 	bool
 
diff --git a/arch/mips/alchemy/common/setup.c b/arch/mips/alchemy/common/setup.c
index 39e5b9cd882b..2388d68786f4 100644
--- a/arch/mips/alchemy/common/setup.c
+++ b/arch/mips/alchemy/common/setup.c
@@ -28,8 +28,8 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/mm.h>
+#include <linux/dma-map-ops.h> /* for dma_default_coherent */
 
-#include <asm/dma-coherence.h>
 #include <asm/mipsregs.h>
 
 #include <au1000.h>
diff --git a/arch/mips/include/asm/dma-coherence.h b/arch/mips/include/asm/dma-coherence.h
deleted file mode 100644
index 846c5ade30d1..000000000000
--- a/arch/mips/include/asm/dma-coherence.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2006  Ralf Baechle <ralf@linux-mips.org>
- *
- */
-#ifndef __ASM_DMA_COHERENCE_H
-#define __ASM_DMA_COHERENCE_H
-
-#ifdef CONFIG_DMA_MAYBE_COHERENT
-extern bool dma_default_coherent;
-static inline bool dev_is_dma_coherent(struct device *dev)
-{
-	return dma_default_coherent;
-}
-#else
-#define dma_default_coherent	(!IS_ENABLED(CONFIG_DMA_NONCOHERENT))
-#endif
-
-#endif
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 6008f45ad081..f0f533294311 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -36,7 +36,6 @@
 #include <asm/cdmm.h>
 #include <asm/cpu.h>
 #include <asm/debug.h>
-#include <asm/dma-coherence.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
 #include <asm/smp-ops.h>
@@ -803,9 +802,6 @@ arch_initcall(debugfs_mips);
 #endif
 
 #ifdef CONFIG_DMA_MAYBE_COHERENT
-bool dma_default_coherent;
-EXPORT_SYMBOL_GPL(dma_default_coherent);
-
 static int __init setcoherentio(char *str)
 {
 	dma_default_coherent = true;
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index bbfab94194b9..74b09e801c3a 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h>
 #include <linux/export.h>
 #include <linux/bitops.h>
+#include <linux/dma-map-ops.h> /* for dma_default_coherent */
 
 #include <asm/bcache.h>
 #include <asm/bootinfo.h>
@@ -35,7 +36,6 @@
 #include <asm/war.h>
 #include <asm/cacheflush.h> /* for run_uncached() */
 #include <asm/traps.h>
-#include <asm/dma-coherence.h>
 #include <asm/mips-cps.h>
 
 /*
diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c
index 38d3d9143b47..90b562753eb8 100644
--- a/arch/mips/mm/dma-noncoherent.c
+++ b/arch/mips/mm/dma-noncoherent.c
@@ -10,7 +10,6 @@
 
 #include <asm/cache.h>
 #include <asm/cpu-type.h>
-#include <asm/dma-coherence.h>
 #include <asm/io.h>
 
 /*
diff --git a/arch/mips/mti-malta/malta-setup.c b/arch/mips/mti-malta/malta-setup.c
index 1cdcb76d393e..21cb3ac1237b 100644
--- a/arch/mips/mti-malta/malta-setup.c
+++ b/arch/mips/mti-malta/malta-setup.c
@@ -13,8 +13,8 @@
 #include <linux/pci.h>
 #include <linux/screen_info.h>
 #include <linux/time.h>
+#include <linux/dma-map-ops.h> /* for dma_default_coherent */
 
-#include <asm/dma-coherence.h>
 #include <asm/fw/fw.h>
 #include <asm/mips-cps.h>
 #include <asm/mips-boards/generic.h>
diff --git a/arch/mips/pci/pci-alchemy.c b/arch/mips/pci/pci-alchemy.c
index 54c86b40d304..1c722dd0c130 100644
--- a/arch/mips/pci/pci-alchemy.c
+++ b/arch/mips/pci/pci-alchemy.c
@@ -17,8 +17,8 @@
 #include <linux/init.h>
 #include <linux/syscore_ops.h>
 #include <linux/vmalloc.h>
+#include <linux/dma-map-ops.h> /* for dma_default_coherent */
 
-#include <asm/dma-coherence.h>
 #include <asm/mach-au1x00/au1000.h>
 #include <asm/tlbmisc.h>
 
diff --git a/arch/mips/pistachio/init.c b/arch/mips/pistachio/init.c
index 8e83262e9da8..e0bacfc3c6b4 100644
--- a/arch/mips/pistachio/init.c
+++ b/arch/mips/pistachio/init.c
@@ -13,7 +13,6 @@
 #include <linux/of_fdt.h>
 
 #include <asm/cacheflush.h>
-#include <asm/dma-coherence.h>
 #include <asm/fw/fw.h>
 #include <asm/mips-boards/generic.h>
 #include <asm/mips-cps.h>
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 25e08e5f40bd..fb2988f955cb 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -28,6 +28,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/sysfs.h>
+#include <linux/dma-map-ops.h> /* for dma_default_coherent */
 
 #include "base.h"
 #include "power/power.h"
@@ -2585,6 +2586,11 @@ void device_initialize(struct device *dev)
 	INIT_LIST_HEAD(&dev->links.suppliers);
 	INIT_LIST_HEAD(&dev->links.defer_sync);
 	dev->links.status = DL_DEV_NO_DRIVER;
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
+	dev->dma_coherent = dma_default_coherent;
+#endif
 }
 EXPORT_SYMBOL_GPL(device_initialize);
 
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 70fcd0f610ea..1e98b8c1e055 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -229,11 +229,10 @@ bool dma_free_from_pool(struct device *dev, void *start, size_t size);
 int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
 		dma_addr_t dma_start, u64 size);
 
-#ifdef CONFIG_ARCH_HAS_DMA_COHERENCE_H
-#include <asm/dma-coherence.h>
-#elif defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
 	defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
 	defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
+extern bool dma_default_coherent;
 static inline bool dev_is_dma_coherent(struct device *dev)
 {
 	return dev->dma_coherent;
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 479fc145acfc..77b405508743 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -33,9 +33,6 @@ config NEED_DMA_MAP_STATE
 config ARCH_DMA_ADDR_T_64BIT
 	def_bool 64BIT || PHYS_ADDR_T_64BIT
 
-config ARCH_HAS_DMA_COHERENCE_H
-	bool
-
 config ARCH_HAS_DMA_SET_MASK
 	bool
 
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f87a89d08654..84de6b1c5fab 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -16,6 +16,8 @@
 #include "debug.h"
 #include "direct.h"
 
+bool dma_default_coherent;
+
 /*
  * Managed DMA API
  */
-- 
cgit v1.2.3


From 96c0a6a72d181a330db6dc9848ff2e6584b1aa5b Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Wed, 10 Feb 2021 21:51:02 +0100
Subject: s390,alpha: switch to 64-bit ino_t

s390 and alpha are the only 64 bit architectures with a 32-bit ino_t.
Since this is quite unusual this causes bugs from time to time.

See e.g. commit ebce3eb2f7ef ("ceph: fix inode number handling on
arches with 32-bit ino_t") for an example.

This (obviously) also prevents s390 and alpha to use 64-bit ino_t for
tmpfs. See commit b85a7a8bb573 ("tmpfs: disallow CONFIG_TMPFS_INODE64
on s390").

Therefore switch both s390 and alpha to 64-bit ino_t. This should only
have an effect on the ustat system call. To prevent ABI breakage
define struct ustat compatible to the old layout and change
sys_ustat() accordingly.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/Kconfig          | 4 ++++
 arch/alpha/Kconfig    | 1 +
 arch/s390/Kconfig     | 1 +
 fs/statfs.c           | 5 ++++-
 include/linux/types.h | 8 ++++++--
 5 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 24862d15f3a3..383c98e86a70 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -327,6 +327,10 @@ config ARCH_32BIT_OFF_T
 	  still support 32-bit off_t. This option is enabled for all such
 	  architectures explicitly.
 
+# Selected by 64 bit architectures which have a 32 bit f_tinode in struct ustat
+config ARCH_32BIT_USTAT_F_TINODE
+	bool
+
 config HAVE_ASM_MODVERSIONS
 	bool
 	help
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 1f51437d5765..96ce6565890e 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -2,6 +2,7 @@
 config ALPHA
 	bool
 	default y
+	select ARCH_32BIT_USTAT_F_TINODE
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select ARCH_NO_PREEMPT
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 5de9f409e4d0..b68e6952866d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -58,6 +58,7 @@ config S390
 	# Note: keep this list sorted alphabetically
 	#
 	imply IMA_SECURE_AND_OR_TRUSTED_BOOT
+	select ARCH_32BIT_USTAT_F_TINODE
 	select ARCH_BINFMT_ELF_STATE
 	select ARCH_HAS_DEBUG_VM_PGTABLE
 	select ARCH_HAS_DEBUG_WX
diff --git a/fs/statfs.c b/fs/statfs.c
index 68cb07788750..0ba34c135593 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -255,7 +255,10 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 
 	memset(&tmp,0,sizeof(struct ustat));
 	tmp.f_tfree = sbuf.f_bfree;
-	tmp.f_tinode = sbuf.f_ffree;
+	if (IS_ENABLED(CONFIG_ARCH_32BIT_USTAT_F_TINODE))
+		tmp.f_tinode = min_t(u64, sbuf.f_ffree, UINT_MAX);
+	else
+		tmp.f_tinode = sbuf.f_ffree;
 
 	return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
 }
diff --git a/include/linux/types.h b/include/linux/types.h
index a147977602b5..ac825ad90e44 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -14,7 +14,7 @@ typedef u32 __kernel_dev_t;
 
 typedef __kernel_fd_set		fd_set;
 typedef __kernel_dev_t		dev_t;
-typedef __kernel_ino_t		ino_t;
+typedef __kernel_ulong_t	ino_t;
 typedef __kernel_mode_t		mode_t;
 typedef unsigned short		umode_t;
 typedef u32			nlink_t;
@@ -189,7 +189,11 @@ struct hlist_node {
 
 struct ustat {
 	__kernel_daddr_t	f_tfree;
-	__kernel_ino_t		f_tinode;
+#ifdef CONFIG_ARCH_32BIT_USTAT_F_TINODE
+	unsigned int		f_tinode;
+#else
+	unsigned long		f_tinode;
+#endif
 	char			f_fname[6];
 	char			f_fpack[6];
 };
-- 
cgit v1.2.3


From fec6e49b63989657bc4076dad99fa51d5ece34da Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Sat, 13 Feb 2021 14:12:02 +0000
Subject: skbuff: remove __kfree_skb_flush()

This function isn't much needed as NAPI skb queue gets bulk-freed
anyway when there's no more room, and even may reduce the efficiency
of bulk operations.
It will be even less needed after reusing skb cache on allocation path,
so remove it and this way lighten network softirqs a bit.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  1 -
 net/core/dev.c         |  7 +------
 net/core/skbuff.c      | 12 ------------
 3 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0a4e91a2f873..0e0707296098 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2919,7 +2919,6 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
 }
 void napi_consume_skb(struct sk_buff *skb, int budget);
 
-void __kfree_skb_flush(void);
 void __kfree_skb_defer(struct sk_buff *skb);
 
 /**
diff --git a/net/core/dev.c b/net/core/dev.c
index ce6291bc2e16..631807c196ad 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4944,8 +4944,6 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 			else
 				__kfree_skb_defer(skb);
 		}
-
-		__kfree_skb_flush();
 	}
 
 	if (sd->output_queue) {
@@ -7012,7 +7010,6 @@ static int napi_threaded_poll(void *data)
 			__napi_poll(napi, &repoll);
 			netpoll_poll_unlock(have);
 
-			__kfree_skb_flush();
 			local_bh_enable();
 
 			if (!repoll)
@@ -7042,7 +7039,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 
 		if (list_empty(&list)) {
 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
-				goto out;
+				return;
 			break;
 		}
 
@@ -7069,8 +7066,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 	net_rps_action_and_irq_enable(sd);
-out:
-	__kfree_skb_flush();
 }
 
 struct netdev_adjacent {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1c6f6ef70339..4be2bb969535 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -838,18 +838,6 @@ void __consume_stateless_skb(struct sk_buff *skb)
 	kfree_skbmem(skb);
 }
 
-void __kfree_skb_flush(void)
-{
-	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
-
-	/* flush skb_cache if containing objects */
-	if (nc->skb_count) {
-		kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
-				     nc->skb_cache);
-		nc->skb_count = 0;
-	}
-}
-
 static inline void _kfree_skb_defer(struct sk_buff *skb)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
-- 
cgit v1.2.3


From f450d539c05a14c103dd174718f81bb2fe65cb4b Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Sat, 13 Feb 2021 14:12:25 +0000
Subject: skbuff: introduce {,__}napi_build_skb() which reuses NAPI cache heads

Instead of just bulk-flushing skbuff_heads queued up through
napi_consume_skb() or __kfree_skb_defer(), try to reuse them
on allocation path.
If the cache is empty on allocation, bulk-allocate the first
16 elements, which is more efficient than per-skb allocation.
If the cache is full on freeing, bulk-wipe the second half of
the cache (32 elements).
This also includes custom KASAN poisoning/unpoisoning to be
double sure there are no use-after-free cases.

To not change current behaviour, introduce a new function,
napi_build_skb(), to optionally use a new approach later
in drivers.

Note on selected bulk size, 16:
 - this equals to XDP_BULK_QUEUE_SIZE, DEV_MAP_BULK_SIZE
   and especially VETH_XDP_BATCH, which is also used to
   bulk-allocate skbuff_heads and was tested on powerful
   setups;
 - this also showed the best performance in the actual
   test series (from the array of {8, 16, 32}).

Suggested-by: Edward Cree <ecree.xilinx@gmail.com> # Divide on two halves
Suggested-by: Eric Dumazet <edumazet@google.com>   # KASAN poisoning
Cc: Dmitry Vyukov <dvyukov@google.com>             # Help with KASAN
Cc: Paolo Abeni <pabeni@redhat.com>                # Reduced batch size
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 ++
 net/core/skbuff.c      | 94 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 83 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0e0707296098..906122eac82a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1087,6 +1087,8 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size);
 struct sk_buff *build_skb_around(struct sk_buff *skb,
 				 void *data, unsigned int frag_size);
 
+struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
+
 /**
  * alloc_skb - allocate a network buffer
  * @size: size to allocate
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 860a9d4f752f..9e1a8ded4acc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -120,6 +120,8 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
 }
 
 #define NAPI_SKB_CACHE_SIZE	64
+#define NAPI_SKB_CACHE_BULK	16
+#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)
 
 struct napi_alloc_cache {
 	struct page_frag_cache page;
@@ -164,6 +166,25 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 }
 EXPORT_SYMBOL(__netdev_alloc_frag_align);
 
+static struct sk_buff *napi_skb_cache_get(void)
+{
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct sk_buff *skb;
+
+	if (unlikely(!nc->skb_count))
+		nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
+						      GFP_ATOMIC,
+						      NAPI_SKB_CACHE_BULK,
+						      nc->skb_cache);
+	if (unlikely(!nc->skb_count))
+		return NULL;
+
+	skb = nc->skb_cache[--nc->skb_count];
+	kasan_unpoison_object_data(skbuff_head_cache, skb);
+
+	return skb;
+}
+
 /* Caller must provide SKB that is memset cleared */
 static void __build_skb_around(struct sk_buff *skb, void *data,
 			       unsigned int frag_size)
@@ -265,6 +286,53 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(build_skb_around);
 
+/**
+ * __napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Version of __build_skb() that uses NAPI percpu caches to obtain
+ * skbuff_head instead of inplace allocation.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
+{
+	struct sk_buff *skb;
+
+	skb = napi_skb_cache_get();
+	if (unlikely(!skb))
+		return NULL;
+
+	memset(skb, 0, offsetof(struct sk_buff, tail));
+	__build_skb_around(skb, data, frag_size);
+
+	return skb;
+}
+
+/**
+ * napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Version of __napi_build_skb() that takes care of skb->head_frag
+ * and skb->pfmemalloc when the data is a page or page fragment.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
+{
+	struct sk_buff *skb = __napi_build_skb(data, frag_size);
+
+	if (likely(skb) && frag_size) {
+		skb->head_frag = 1;
+		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
+	}
+
+	return skb;
+}
+EXPORT_SYMBOL(napi_build_skb);
+
 /*
  * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
  * the caller if emergency pfmemalloc reserves are being used. If it is and
@@ -838,31 +906,31 @@ void __consume_stateless_skb(struct sk_buff *skb)
 	kfree_skbmem(skb);
 }
 
-static inline void _kfree_skb_defer(struct sk_buff *skb)
+static void napi_skb_cache_put(struct sk_buff *skb)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	u32 i;
 
 	/* drop skb->head and call any destructors for packet */
 	skb_release_all(skb);
 
-	/* record skb to CPU local list */
+	kasan_poison_object_data(skbuff_head_cache, skb);
 	nc->skb_cache[nc->skb_count++] = skb;
 
-#ifdef CONFIG_SLUB
-	/* SLUB writes into objects when freeing */
-	prefetchw(skb);
-#endif
-
-	/* flush skb_cache if it is filled */
 	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
-		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
-				     nc->skb_cache);
-		nc->skb_count = 0;
+		for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
+			kasan_unpoison_object_data(skbuff_head_cache,
+						   nc->skb_cache[i]);
+
+		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
+				     nc->skb_cache + NAPI_SKB_CACHE_HALF);
+		nc->skb_count = NAPI_SKB_CACHE_HALF;
 	}
 }
+
 void __kfree_skb_defer(struct sk_buff *skb)
 {
-	_kfree_skb_defer(skb);
+	napi_skb_cache_put(skb);
 }
 
 void napi_consume_skb(struct sk_buff *skb, int budget)
@@ -887,7 +955,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	_kfree_skb_defer(skb);
+	napi_skb_cache_put(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
 
-- 
cgit v1.2.3


From 9243adfc311a20371c3f4d8eaf0af4b135e6fac3 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Sat, 13 Feb 2021 14:13:09 +0000
Subject: skbuff: queue NAPI_MERGED_FREE skbs into NAPI cache instead of
 freeing

napi_frags_finish() and napi_skb_finish() can only be called inside
NAPI Rx context, so we can feed NAPI cache with skbuff_heads that
got NAPI_MERGED_FREE verdict instead of immediate freeing.
Replace __kfree_skb() with __kfree_skb_defer() in napi_skb_finish()
and move napi_skb_free_stolen_head() to skbuff.c, so it can drop skbs
to NAPI cache.
As many drivers call napi_alloc_skb()/napi_get_frags() on their
receive path, this becomes especially useful.

Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  1 +
 net/core/dev.c         |  9 +--------
 net/core/skbuff.c      | 12 +++++++++---
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 906122eac82a..6d0a33d1c0db 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2921,6 +2921,7 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
 }
 void napi_consume_skb(struct sk_buff *skb, int budget);
 
+void napi_skb_free_stolen_head(struct sk_buff *skb);
 void __kfree_skb_defer(struct sk_buff *skb);
 
 /**
diff --git a/net/core/dev.c b/net/core/dev.c
index 631807c196ad..ea9b46318d23 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6095,13 +6095,6 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
 }
 EXPORT_SYMBOL(gro_find_complete_by_type);
 
-static void napi_skb_free_stolen_head(struct sk_buff *skb)
-{
-	skb_dst_drop(skb);
-	skb_ext_put(skb);
-	kmem_cache_free(skbuff_head_cache, skb);
-}
-
 static gro_result_t napi_skb_finish(struct napi_struct *napi,
 				    struct sk_buff *skb,
 				    gro_result_t ret)
@@ -6115,7 +6108,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 			napi_skb_free_stolen_head(skb);
 		else
-			__kfree_skb(skb);
+			__kfree_skb_defer(skb);
 		break;
 
 	case GRO_HELD:
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 875e1a453f7e..545a472273a5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -916,9 +916,6 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 	u32 i;
 
-	/* drop skb->head and call any destructors for packet */
-	skb_release_all(skb);
-
 	kasan_poison_object_data(skbuff_head_cache, skb);
 	nc->skb_cache[nc->skb_count++] = skb;
 
@@ -935,6 +932,14 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 
 void __kfree_skb_defer(struct sk_buff *skb)
 {
+	skb_release_all(skb);
+	napi_skb_cache_put(skb);
+}
+
+void napi_skb_free_stolen_head(struct sk_buff *skb)
+{
+	skb_dst_drop(skb);
+	skb_ext_put(skb);
 	napi_skb_cache_put(skb);
 }
 
@@ -960,6 +965,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
+	skb_release_all(skb);
 	napi_skb_cache_put(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
-- 
cgit v1.2.3


From 258e0815e2b1706e87c0d874211097aa8a7aa52f Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Sun, 14 Feb 2021 17:16:33 +0000
Subject: percpu: fix clang modpost section mismatch

pcpu_build_alloc_info() is an __init function that makes a call to
cpumask_clear_cpu(). With CONFIG_GCOV_PROFILE_ALL enabled, the inline
heuristics are modified and such cpumask_clear_cpu() which is marked
inline doesn't get inlined. Because it works on mask in __initdata,
modpost throws a section mismatch error.

Arnd sent a patch with the flatten attribute as an alternative [2]. I've
added it to compiler_attributes.h.

modpost complaint:
  WARNING: modpost: vmlinux.o(.text+0x735425): Section mismatch in reference from the function cpumask_clear_cpu() to the variable .init.data:pcpu_build_alloc_info.mask
  The function cpumask_clear_cpu() references
  the variable __initdata pcpu_build_alloc_info.mask.
  This is often because cpumask_clear_cpu lacks a __initdata
  annotation or the annotation of pcpu_build_alloc_info.mask is wrong.

clang output:
  mm/percpu.c:2724:5: remark: cpumask_clear_cpu not inlined into pcpu_build_alloc_info because too costly to inline (cost=725, threshold=325) [-Rpass-missed=inline]

[1] https://lore.kernel.org/linux-mm/202012220454.9F6Bkz9q-lkp@intel.com/
[2] https://lore.kernel.org/lkml/CAK8P3a2ZWfNeXKSm8K_SUhhwkor17jFo3xApLXjzfPqX0eUDUA@mail.gmail.com/

Reported-by: kernel test robot <lkp@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
---
 include/linux/compiler_attributes.h | 6 ++++++
 mm/percpu.c                         | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index ea5e04e75845..c043b8d2b17b 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -210,6 +210,12 @@
 # define fallthrough                    do {} while (0)  /* fallthrough */
 #endif
 
+/*
+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#Common-Function-Attributes
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#flatten
+ */
+# define __flatten			__attribute__((flatten))
+
 /*
  * Note the missing underscores.
  *
diff --git a/mm/percpu.c b/mm/percpu.c
index 80f8f885a990..6596a0a4286e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2663,7 +2663,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
  * On success, pointer to the new allocation_info is returned.  On
  * failure, ERR_PTR value is returned.
  */
-static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
 				size_t reserved_size, size_t dyn_size,
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
-- 
cgit v1.2.3


From 0b9266d295cee170509539635b8d572abe5267af Mon Sep 17 00:00:00 2001
From: Daniel Palmer <daniel@0x0f.com>
Date: Thu, 11 Feb 2021 14:22:02 +0900
Subject: clk: fixed: add devm helper for clk_hw_register_fixed_factor()

Add a devm helper for clk_hw_register_fixed_factor() so that drivers that internally
register fixed factor clocks for things like dividers don't need to manually unregister
them on remove or if probe fails.

Signed-off-by: Daniel Palmer <daniel@0x0f.com>
Link: https://lore.kernel.org/r/20210211052206.2955988-4-daniel@0x0f.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-fixed-factor.c | 39 +++++++++++++++++++++++++++++++++------
 include/linux/clk-provider.h   |  4 +++-
 2 files changed, 36 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-fixed-factor.c b/drivers/clk/clk-fixed-factor.c
index 910e6e74ae90..4f7bf3929d6d 100644
--- a/drivers/clk/clk-fixed-factor.c
+++ b/drivers/clk/clk-fixed-factor.c
@@ -64,10 +64,16 @@ const struct clk_ops clk_fixed_factor_ops = {
 };
 EXPORT_SYMBOL_GPL(clk_fixed_factor_ops);
 
+static void devm_clk_hw_register_fixed_factor_release(struct device *dev, void *res)
+{
+	clk_hw_unregister_fixed_factor(&((struct clk_fixed_factor *)res)->hw);
+}
+
 static struct clk_hw *
 __clk_hw_register_fixed_factor(struct device *dev, struct device_node *np,
 		const char *name, const char *parent_name, int index,
-		unsigned long flags, unsigned int mult, unsigned int div)
+		unsigned long flags, unsigned int mult, unsigned int div,
+		bool devm)
 {
 	struct clk_fixed_factor *fix;
 	struct clk_init_data init = { };
@@ -75,7 +81,15 @@ __clk_hw_register_fixed_factor(struct device *dev, struct device_node *np,
 	struct clk_hw *hw;
 	int ret;
 
-	fix = kmalloc(sizeof(*fix), GFP_KERNEL);
+	/* You can't use devm without a dev */
+	if (devm && !dev)
+		return ERR_PTR(-EINVAL);
+
+	if (devm)
+		fix = devres_alloc(devm_clk_hw_register_fixed_factor_release,
+				sizeof(*fix), GFP_KERNEL);
+	else
+		fix = kmalloc(sizeof(*fix), GFP_KERNEL);
 	if (!fix)
 		return ERR_PTR(-ENOMEM);
 
@@ -99,9 +113,13 @@ __clk_hw_register_fixed_factor(struct device *dev, struct device_node *np,
 	else
 		ret = of_clk_hw_register(np, hw);
 	if (ret) {
-		kfree(fix);
+		if (devm)
+			devres_free(fix);
+		else
+			kfree(fix);
 		hw = ERR_PTR(ret);
-	}
+	} else if (devm)
+		devres_add(dev, fix);
 
 	return hw;
 }
@@ -111,7 +129,7 @@ struct clk_hw *clk_hw_register_fixed_factor(struct device *dev,
 		unsigned int mult, unsigned int div)
 {
 	return __clk_hw_register_fixed_factor(dev, NULL, name, parent_name, -1,
-					      flags, mult, div);
+					      flags, mult, div, false);
 }
 EXPORT_SYMBOL_GPL(clk_hw_register_fixed_factor);
 
@@ -153,6 +171,15 @@ void clk_hw_unregister_fixed_factor(struct clk_hw *hw)
 }
 EXPORT_SYMBOL_GPL(clk_hw_unregister_fixed_factor);
 
+struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev,
+		const char *name, const char *parent_name, unsigned long flags,
+		unsigned int mult, unsigned int div)
+{
+	return __clk_hw_register_fixed_factor(dev, NULL, name, parent_name, -1,
+			flags, mult, div, true);
+}
+EXPORT_SYMBOL_GPL(devm_clk_hw_register_fixed_factor);
+
 #ifdef CONFIG_OF
 static const struct of_device_id set_rate_parent_matches[] = {
 	{ .compatible = "allwinner,sun4i-a10-pll3-2x-clk" },
@@ -185,7 +212,7 @@ static struct clk_hw *_of_fixed_factor_clk_setup(struct device_node *node)
 		flags |= CLK_SET_RATE_PARENT;
 
 	hw = __clk_hw_register_fixed_factor(NULL, node, clk_name, NULL, 0,
-					    flags, mult, div);
+					    flags, mult, div, false);
 	if (IS_ERR(hw)) {
 		/*
 		 * Clear OF_POPULATED flag so that clock registration can be
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index e4316890661a..58f6fe866ae9 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -941,7 +941,9 @@ struct clk_hw *clk_hw_register_fixed_factor(struct device *dev,
 		const char *name, const char *parent_name, unsigned long flags,
 		unsigned int mult, unsigned int div);
 void clk_hw_unregister_fixed_factor(struct clk_hw *hw);
-
+struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev,
+		const char *name, const char *parent_name, unsigned long flags,
+		unsigned int mult, unsigned int div);
 /**
  * struct clk_fractional_divider - adjustable fractional divider clock
  *
-- 
cgit v1.2.3


From 40d3f295b5feda409784e569550057b5fbc2a295 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 14 Feb 2021 00:37:56 +0200
Subject: net: mscc: ocelot: use common tag parsing code with DSA

The Injection Frame Header and Extraction Frame Header that the switch
prepends to frames over the NPI port is also prepended to frames
delivered over the CPU port module's queues.

Let's unify the handling of the frame headers by making the ocelot
driver call some helpers exported by the DSA tagger. Among other things,
this allows us to get rid of the strange cpu_to_be32 when transmitting
the Injection Frame Header on ocelot, since the packing API uses
network byte order natively (when "quirks" is 0).

The comments above ocelot_gen_ifh talk about setting pop_cnt to 3, and
the cpu extraction queue mask to something, but the code doesn't do it,
so we don't do it either.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c             |   6 +-
 drivers/net/dsa/ocelot/felix_vsc9959.c     |   2 +-
 drivers/net/dsa/ocelot/seville_vsc9953.c   |   2 +-
 drivers/net/ethernet/mscc/ocelot.c         |  38 ++----
 drivers/net/ethernet/mscc/ocelot.h         |   9 --
 drivers/net/ethernet/mscc/ocelot_vsc7514.c |  54 +++-----
 include/linux/dsa/ocelot.h                 | 208 +++++++++++++++++++++++++++++
 include/soc/mscc/ocelot.h                  |   7 -
 net/dsa/tag_ocelot.c                       | 147 ++------------------
 9 files changed, 246 insertions(+), 227 deletions(-)
 create mode 100644 include/linux/dsa/ocelot.h

(limited to 'include/linux')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index d3180b0f2307..60e0d354adea 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -14,8 +14,8 @@
 #include <soc/mscc/ocelot_ptp.h>
 #include <soc/mscc/ocelot.h>
 #include <linux/dsa/8021q.h>
+#include <linux/dsa/ocelot.h>
 #include <linux/platform_device.h>
-#include <linux/packing.h>
 #include <linux/module.h>
 #include <linux/of_net.h>
 #include <linux/pci.h>
@@ -1161,9 +1161,9 @@ static int felix_hwtstamp_set(struct dsa_switch *ds, int port,
 static bool felix_rxtstamp(struct dsa_switch *ds, int port,
 			   struct sk_buff *skb, unsigned int type)
 {
+	u8 *extraction = skb->data - ETH_HLEN - OCELOT_TAG_LEN;
 	struct skb_shared_hwtstamps *shhwtstamps;
 	struct ocelot *ocelot = ds->priv;
-	u8 *extraction = skb->data - ETH_HLEN - OCELOT_TAG_LEN;
 	u32 tstamp_lo, tstamp_hi;
 	struct timespec64 ts;
 	u64 tstamp, val;
@@ -1171,7 +1171,7 @@ static bool felix_rxtstamp(struct dsa_switch *ds, int port,
 	ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
 	tstamp = ktime_set(ts.tv_sec, ts.tv_nsec);
 
-	packing(extraction, &val,  116, 85, OCELOT_TAG_LEN, UNPACK, 0);
+	ocelot_xfh_get_rew_val(extraction, &val);
 	tstamp_lo = (u32)val;
 
 	tstamp_hi = tstamp >> 32;
diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index e944868cc120..cacc6f9c0113 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -8,7 +8,7 @@
 #include <soc/mscc/ocelot_ptp.h>
 #include <soc/mscc/ocelot_sys.h>
 #include <soc/mscc/ocelot.h>
-#include <linux/packing.h>
+#include <linux/dsa/ocelot.h>
 #include <linux/pcs-lynx.h>
 #include <net/pkt_sched.h>
 #include <linux/iopoll.h>
diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c
index 512f677a6c1c..d7348ea4831e 100644
--- a/drivers/net/dsa/ocelot/seville_vsc9953.c
+++ b/drivers/net/dsa/ocelot/seville_vsc9953.c
@@ -8,7 +8,7 @@
 #include <soc/mscc/ocelot.h>
 #include <linux/of_platform.h>
 #include <linux/pcs-lynx.h>
-#include <linux/packing.h>
+#include <linux/dsa/ocelot.h>
 #include <linux/iopoll.h>
 #include "felix.h"
 
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 7106d9ee534a..699b0c1c1780 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -4,6 +4,7 @@
  *
  * Copyright (c) 2017 Microsemi Corporation
  */
+#include <linux/dsa/ocelot.h>
 #include <linux/if_bridge.h>
 #include <soc/mscc/ocelot_vcap.h>
 #include "ocelot.h"
@@ -628,26 +629,6 @@ void ocelot_get_txtstamp(struct ocelot *ocelot)
 }
 EXPORT_SYMBOL(ocelot_get_txtstamp);
 
-/* Generate the IFH for frame injection
- *
- * The IFH is a 128bit-value
- * bit 127: bypass the analyzer processing
- * bit 56-67: destination mask
- * bit 28-29: pop_cnt: 3 disables all rewriting of the frame
- * bit 20-27: cpu extraction queue mask
- * bit 16: tag type 0: C-tag, 1: S-tag
- * bit 0-11: VID
- */
-static int ocelot_gen_ifh(u32 *ifh, struct frame_info *info)
-{
-	ifh[0] = IFH_INJ_BYPASS | ((0x1ff & info->rew_op) << 21);
-	ifh[1] = (0xf00 & info->port) >> 8;
-	ifh[2] = (0xff & info->port) << 24;
-	ifh[3] = (info->tag_type << 16) | info->vid;
-
-	return 0;
-}
-
 bool ocelot_can_inject(struct ocelot *ocelot, int grp)
 {
 	u32 val = ocelot_read(ocelot, QS_INJ_STATUS);
@@ -664,23 +645,20 @@ EXPORT_SYMBOL(ocelot_can_inject);
 void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
 			      u32 rew_op, struct sk_buff *skb)
 {
-	struct frame_info info = {};
-	u32 ifh[OCELOT_TAG_LEN / 4];
+	u32 ifh[OCELOT_TAG_LEN / 4] = {0};
 	unsigned int i, count, last;
 
 	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
 			 QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp);
 
-	info.port = BIT(port);
-	info.tag_type = IFH_TAG_TYPE_C;
-	info.vid = skb_vlan_tag_get(skb);
-	info.rew_op = rew_op;
-
-	ocelot_gen_ifh(ifh, &info);
+	ocelot_ifh_set_bypass(ifh, 1);
+	ocelot_ifh_set_dest(ifh, BIT(port));
+	ocelot_ifh_set_tag_type(ifh, IFH_TAG_TYPE_C);
+	ocelot_ifh_set_vid(ifh, skb_vlan_tag_get(skb));
+	ocelot_ifh_set_rew_op(ifh, rew_op);
 
 	for (i = 0; i < OCELOT_TAG_LEN / 4; i++)
-		ocelot_write_rix(ocelot, (__force u32)cpu_to_be32(ifh[i]),
-				 QS_INJ_WR, grp);
+		ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp);
 
 	count = DIV_ROUND_UP(skb->len, 4);
 	last = skb->len % 4;
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index c485795c606b..db6b1a4c3926 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -32,15 +32,6 @@
 
 #define OCELOT_PTP_QUEUE_SZ	128
 
-struct frame_info {
-	u32 len;
-	u16 port;
-	u16 vid;
-	u8 tag_type;
-	u16 rew_op;
-	u32 timestamp;	/* rew_val */
-};
-
 struct ocelot_port_tc {
 	bool block_shared;
 	unsigned long offload_cnt;
diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
index b075dc13354a..fe0f8d6a32ce 100644
--- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
+++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
@@ -4,6 +4,7 @@
  *
  * Copyright (c) 2017 Microsemi Corporation
  */
+#include <linux/dsa/ocelot.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/of_net.h>
@@ -18,8 +19,6 @@
 #include <soc/mscc/ocelot_hsio.h>
 #include "ocelot.h"
 
-#define IFH_EXTRACT_BITFIELD64(x, o, w) (((x) >> (o)) & GENMASK_ULL((w) - 1, 0))
-
 static const u32 ocelot_ana_regmap[] = {
 	REG(ANA_ADVLEARN,				0x009000),
 	REG(ANA_VLANMASK,				0x009004),
@@ -532,29 +531,6 @@ static int ocelot_chip_init(struct ocelot *ocelot, const struct ocelot_ops *ops)
 	return 0;
 }
 
-static int ocelot_parse_ifh(u32 *_ifh, struct frame_info *info)
-{
-	u8 llen, wlen;
-	u64 ifh[2];
-
-	ifh[0] = be64_to_cpu(((__force __be64 *)_ifh)[0]);
-	ifh[1] = be64_to_cpu(((__force __be64 *)_ifh)[1]);
-
-	wlen = IFH_EXTRACT_BITFIELD64(ifh[0], 7,  8);
-	llen = IFH_EXTRACT_BITFIELD64(ifh[0], 15,  6);
-
-	info->len = OCELOT_BUFFER_CELL_SZ * wlen + llen - 80;
-
-	info->timestamp = IFH_EXTRACT_BITFIELD64(ifh[0], 21, 32);
-
-	info->port = IFH_EXTRACT_BITFIELD64(ifh[1], 43, 4);
-
-	info->tag_type = IFH_EXTRACT_BITFIELD64(ifh[1], 16,  1);
-	info->vid = IFH_EXTRACT_BITFIELD64(ifh[1], 0,  12);
-
-	return 0;
-}
-
 static int ocelot_rx_frame_word(struct ocelot *ocelot, u8 grp, bool ifh,
 				u32 *rval)
 {
@@ -609,20 +585,20 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
 		struct ocelot_port_private *priv;
 		struct ocelot_port *ocelot_port;
 		u64 tod_in_ns, full_ts_in_ns;
-		struct frame_info info = {};
+		u64 src_port, len, timestamp;
 		struct net_device *dev;
-		u32 ifh[4], val, *buf;
+		u32 xfh[4], val, *buf;
 		struct timespec64 ts;
-		int sz, len, buf_len;
 		struct sk_buff *skb;
+		int sz, buf_len;
 
 		for (i = 0; i < OCELOT_TAG_LEN / 4; i++) {
-			err = ocelot_rx_frame_word(ocelot, grp, true, &ifh[i]);
+			err = ocelot_rx_frame_word(ocelot, grp, true, &xfh[i]);
 			if (err != 4)
 				goto out;
 		}
 
-		/* At this point the IFH was read correctly, so it is safe to
+		/* At this point the XFH was read correctly, so it is safe to
 		 * presume that there is no error. The err needs to be reset
 		 * otherwise a frame could come in CPU queue between the while
 		 * condition and the check for error later on. And in that case
@@ -630,21 +606,23 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
 		 */
 		err = 0;
 
-		ocelot_parse_ifh(ifh, &info);
+		ocelot_xfh_get_src_port(xfh, &src_port);
+		ocelot_xfh_get_len(xfh, &len);
+		ocelot_xfh_get_rew_val(xfh, &timestamp);
 
-		ocelot_port = ocelot->ports[info.port];
+		ocelot_port = ocelot->ports[src_port];
 		priv = container_of(ocelot_port, struct ocelot_port_private,
 				    port);
 		dev = priv->dev;
 
-		skb = netdev_alloc_skb(dev, info.len);
+		skb = netdev_alloc_skb(dev, len);
 
 		if (unlikely(!skb)) {
 			netdev_err(dev, "Unable to allocate sk_buff\n");
 			err = -ENOMEM;
 			goto out;
 		}
-		buf_len = info.len - ETH_FCS_LEN;
+		buf_len = len - ETH_FCS_LEN;
 		buf = (u32 *)skb_put(skb, buf_len);
 
 		len = 0;
@@ -677,12 +655,12 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
 			ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
 
 			tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
-			if ((tod_in_ns & 0xffffffff) < info.timestamp)
+			if ((tod_in_ns & 0xffffffff) < timestamp)
 				full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
-						info.timestamp;
+						timestamp;
 			else
 				full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
-						info.timestamp;
+						timestamp;
 
 			shhwtstamps = skb_hwtstamps(skb);
 			memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
@@ -692,7 +670,7 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
 		/* Everything we see on an interface that is in the HW bridge
 		 * has already been forwarded.
 		 */
-		if (ocelot->bridge_mask & BIT(info.port))
+		if (ocelot->bridge_mask & BIT(src_port))
 			skb->offload_fwd_mark = 1;
 
 		skb->protocol = eth_type_trans(skb, dev);
diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
new file mode 100644
index 000000000000..763dbbfaff7a
--- /dev/null
+++ b/include/linux/dsa/ocelot.h
@@ -0,0 +1,208 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright 2019-2021 NXP Semiconductors
+ */
+
+#ifndef _NET_DSA_TAG_OCELOT_H
+#define _NET_DSA_TAG_OCELOT_H
+
+#include <linux/packing.h>
+
+#define OCELOT_TAG_LEN			16
+#define OCELOT_SHORT_PREFIX_LEN		4
+#define OCELOT_LONG_PREFIX_LEN		16
+#define OCELOT_TOTAL_TAG_LEN	(OCELOT_SHORT_PREFIX_LEN + OCELOT_TAG_LEN)
+
+/* The CPU injection header and the CPU extraction header can have 3 types of
+ * prefixes: long, short and no prefix. The format of the header itself is the
+ * same in all 3 cases.
+ *
+ * Extraction with long prefix:
+ *
+ * +-------------------+-------------------+------+------+------------+-------+
+ * | ff:ff:ff:ff:ff:ff | fe:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame |
+ * |                   |                   |      |      |   header   |       |
+ * +-------------------+-------------------+------+------+------------+-------+
+ *        48 bits             48 bits      16 bits 16 bits  128 bits
+ *
+ * Extraction with short prefix:
+ *
+ *                                         +------+------+------------+-------+
+ *                                         | 8880 | 000a | extraction | frame |
+ *                                         |      |      |   header   |       |
+ *                                         +------+------+------------+-------+
+ *                                         16 bits 16 bits  128 bits
+ *
+ * Extraction with no prefix:
+ *
+ *                                                       +------------+-------+
+ *                                                       | extraction | frame |
+ *                                                       |   header   |       |
+ *                                                       +------------+-------+
+ *                                                          128 bits
+ *
+ *
+ * Injection with long prefix:
+ *
+ * +-------------------+-------------------+------+------+------------+-------+
+ * |      any dmac     |      any smac     | 8880 | 000a | injection  | frame |
+ * |                   |                   |      |      |   header   |       |
+ * +-------------------+-------------------+------+------+------------+-------+
+ *        48 bits             48 bits      16 bits 16 bits  128 bits
+ *
+ * Injection with short prefix:
+ *
+ *                                         +------+------+------------+-------+
+ *                                         | 8880 | 000a | injection  | frame |
+ *                                         |      |      |   header   |       |
+ *                                         +------+------+------------+-------+
+ *                                         16 bits 16 bits  128 bits
+ *
+ * Injection with no prefix:
+ *
+ *                                                       +------------+-------+
+ *                                                       | injection  | frame |
+ *                                                       |   header   |       |
+ *                                                       +------------+-------+
+ *                                                          128 bits
+ *
+ * The injection header looks like this (network byte order, bit 127
+ * is part of lowest address byte in memory, bit 0 is part of highest
+ * address byte):
+ *
+ *         +------+------+------+------+------+------+------+------+
+ * 127:120 |BYPASS| MASQ |          MASQ_PORT        |REW_OP|REW_OP|
+ *         +------+------+------+------+------+------+------+------+
+ * 119:112 |                         REW_OP                        |
+ *         +------+------+------+------+------+------+------+------+
+ * 111:104 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ * 103: 96 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  95: 88 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  87: 80 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  79: 72 |                          RSV                          |
+ *         +------+------+------+------+------+------+------+------+
+ *  71: 64 |            RSV            |           DEST            |
+ *         +------+------+------+------+------+------+------+------+
+ *  63: 56 |                         DEST                          |
+ *         +------+------+------+------+------+------+------+------+
+ *  55: 48 |                          RSV                          |
+ *         +------+------+------+------+------+------+------+------+
+ *  47: 40 |  RSV |         SRC_PORT          |     RSV     |TFRM_TIMER|
+ *         +------+------+------+------+------+------+------+------+
+ *  39: 32 |     TFRM_TIMER     |               RSV                |
+ *         +------+------+------+------+------+------+------+------+
+ *  31: 24 |  RSV |  DP  |   POP_CNT   |           CPUQ            |
+ *         +------+------+------+------+------+------+------+------+
+ *  23: 16 |           CPUQ            |      QOS_CLASS     |TAG_TYPE|
+ *         +------+------+------+------+------+------+------+------+
+ *  15:  8 |         PCP        |  DEI |            VID            |
+ *         +------+------+------+------+------+------+------+------+
+ *   7:  0 |                          VID                          |
+ *         +------+------+------+------+------+------+------+------+
+ *
+ * And the extraction header looks like this:
+ *
+ *         +------+------+------+------+------+------+------+------+
+ * 127:120 |  RSV |                  REW_OP                        |
+ *         +------+------+------+------+------+------+------+------+
+ * 119:112 |       REW_OP       |              REW_VAL             |
+ *         +------+------+------+------+------+------+------+------+
+ * 111:104 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ * 103: 96 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  95: 88 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  87: 80 |       REW_VAL      |               LLEN               |
+ *         +------+------+------+------+------+------+------+------+
+ *  79: 72 | LLEN |                      WLEN                      |
+ *         +------+------+------+------+------+------+------+------+
+ *  71: 64 | WLEN |                      RSV                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  63: 56 |                          RSV                          |
+ *         +------+------+------+------+------+------+------+------+
+ *  55: 48 |                          RSV                          |
+ *         +------+------+------+------+------+------+------+------+
+ *  47: 40 | RSV  |          SRC_PORT         |       ACL_ID       |
+ *         +------+------+------+------+------+------+------+------+
+ *  39: 32 |       ACL_ID       |  RSV |         SFLOW_ID          |
+ *         +------+------+------+------+------+------+------+------+
+ *  31: 24 |ACL_HIT| DP  |  LRN_FLAGS  |           CPUQ            |
+ *         +------+------+------+------+------+------+------+------+
+ *  23: 16 |           CPUQ            |      QOS_CLASS     |TAG_TYPE|
+ *         +------+------+------+------+------+------+------+------+
+ *  15:  8 |         PCP        |  DEI |            VID            |
+ *         +------+------+------+------+------+------+------+------+
+ *   7:  0 |                          VID                          |
+ *         +------+------+------+------+------+------+------+------+
+ */
+
+static inline void ocelot_xfh_get_rew_val(void *extraction, u64 *rew_val)
+{
+	packing(extraction, rew_val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0);
+}
+
+static inline void ocelot_xfh_get_len(void *extraction, u64 *len)
+{
+	u64 llen, wlen;
+
+	packing(extraction, &llen, 84, 79, OCELOT_TAG_LEN, UNPACK, 0);
+	packing(extraction, &wlen, 78, 71, OCELOT_TAG_LEN, UNPACK, 0);
+
+	*len = 60 * wlen + llen - 80;
+}
+
+static inline void ocelot_xfh_get_src_port(void *extraction, u64 *src_port)
+{
+	packing(extraction, src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0);
+}
+
+static inline void ocelot_xfh_get_qos_class(void *extraction, u64 *qos_class)
+{
+	packing(extraction, qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0);
+}
+
+static inline void ocelot_xfh_get_tag_type(void *extraction, u64 *tag_type)
+{
+	packing(extraction, tag_type, 16, 16, OCELOT_TAG_LEN, UNPACK, 0);
+}
+
+static inline void ocelot_xfh_get_vlan_tci(void *extraction, u64 *vlan_tci)
+{
+	packing(extraction, vlan_tci, 15, 0, OCELOT_TAG_LEN, UNPACK, 0);
+}
+
+static inline void ocelot_ifh_set_bypass(void *injection, u64 bypass)
+{
+	packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0);
+}
+
+static inline void ocelot_ifh_set_rew_op(void *injection, u64 rew_op)
+{
+	packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0);
+}
+
+static inline void ocelot_ifh_set_dest(void *injection, u64 dest)
+{
+	packing(injection, &dest, 67, 56, OCELOT_TAG_LEN, PACK, 0);
+}
+
+static inline void ocelot_ifh_set_qos_class(void *injection, u64 qos_class)
+{
+	packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0);
+}
+
+static inline void ocelot_ifh_set_tag_type(void *injection, u64 tag_type)
+{
+	packing(injection, &tag_type, 16, 16, OCELOT_TAG_LEN, PACK, 0);
+}
+
+static inline void ocelot_ifh_set_vid(void *injection, u64 vid)
+{
+	packing(injection, &vid, 11, 0, OCELOT_TAG_LEN, PACK, 0);
+}
+
+#endif
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index a22aa944e921..11cae2e968b5 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -87,9 +87,6 @@
 /* Source PGIDs, one per physical port */
 #define PGID_SRC			80
 
-#define IFH_INJ_BYPASS			BIT(31)
-#define IFH_INJ_POP_CNT_DISABLE		(3 << 28)
-
 #define IFH_TAG_TYPE_C			0
 #define IFH_TAG_TYPE_S			1
 
@@ -100,10 +97,6 @@
 #define IFH_REW_OP_ORIGIN_PTP		0x5
 
 #define OCELOT_NUM_TC			8
-#define OCELOT_TAG_LEN			16
-#define OCELOT_SHORT_PREFIX_LEN		4
-#define OCELOT_LONG_PREFIX_LEN		16
-#define OCELOT_TOTAL_TAG_LEN	(OCELOT_SHORT_PREFIX_LEN + OCELOT_TAG_LEN)
 
 #define OCELOT_SPEED_2500		0
 #define OCELOT_SPEED_1000		1
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 225b145fd131..8ce0b26f3520 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -1,138 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright 2019 NXP Semiconductors
  */
+#include <linux/dsa/ocelot.h>
 #include <soc/mscc/ocelot.h>
-#include <linux/packing.h>
 #include "dsa_priv.h"
 
-/* The CPU injection header and the CPU extraction header can have 3 types of
- * prefixes: long, short and no prefix. The format of the header itself is the
- * same in all 3 cases.
- *
- * Extraction with long prefix:
- *
- * +-------------------+-------------------+------+------+------------+-------+
- * | ff:ff:ff:ff:ff:ff | ff:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame |
- * |                   |                   |      |      |   header   |       |
- * +-------------------+-------------------+------+------+------------+-------+
- *        48 bits             48 bits      16 bits 16 bits  128 bits
- *
- * Extraction with short prefix:
- *
- *                                         +------+------+------------+-------+
- *                                         | 8880 | 000a | extraction | frame |
- *                                         |      |      |   header   |       |
- *                                         +------+------+------------+-------+
- *                                         16 bits 16 bits  128 bits
- *
- * Extraction with no prefix:
- *
- *                                                       +------------+-------+
- *                                                       | extraction | frame |
- *                                                       |   header   |       |
- *                                                       +------------+-------+
- *                                                          128 bits
- *
- *
- * Injection with long prefix:
- *
- * +-------------------+-------------------+------+------+------------+-------+
- * |      any dmac     |      any smac     | 8880 | 000a | injection  | frame |
- * |                   |                   |      |      |   header   |       |
- * +-------------------+-------------------+------+------+------------+-------+
- *        48 bits             48 bits      16 bits 16 bits  128 bits
- *
- * Injection with short prefix:
- *
- *                                         +------+------+------------+-------+
- *                                         | 8880 | 000a | injection  | frame |
- *                                         |      |      |   header   |       |
- *                                         +------+------+------------+-------+
- *                                         16 bits 16 bits  128 bits
- *
- * Injection with no prefix:
- *
- *                                                       +------------+-------+
- *                                                       | injection  | frame |
- *                                                       |   header   |       |
- *                                                       +------------+-------+
- *                                                          128 bits
- *
- * The injection header looks like this (network byte order, bit 127
- * is part of lowest address byte in memory, bit 0 is part of highest
- * address byte):
- *
- *         +------+------+------+------+------+------+------+------+
- * 127:120 |BYPASS| MASQ |          MASQ_PORT        |REW_OP|REW_OP|
- *         +------+------+------+------+------+------+------+------+
- * 119:112 |                         REW_OP                        |
- *         +------+------+------+------+------+------+------+------+
- * 111:104 |                         REW_VAL                       |
- *         +------+------+------+------+------+------+------+------+
- * 103: 96 |                         REW_VAL                       |
- *         +------+------+------+------+------+------+------+------+
- *  95: 88 |                         REW_VAL                       |
- *         +------+------+------+------+------+------+------+------+
- *  87: 80 |                         REW_VAL                       |
- *         +------+------+------+------+------+------+------+------+
- *  79: 72 |                          RSV                          |
- *         +------+------+------+------+------+------+------+------+
- *  71: 64 |            RSV            |           DEST            |
- *         +------+------+------+------+------+------+------+------+
- *  63: 56 |                         DEST                          |
- *         +------+------+------+------+------+------+------+------+
- *  55: 48 |                          RSV                          |
- *         +------+------+------+------+------+------+------+------+
- *  47: 40 |  RSV |         SRC_PORT          |     RSV     |TFRM_TIMER|
- *         +------+------+------+------+------+------+------+------+
- *  39: 32 |     TFRM_TIMER     |               RSV                |
- *         +------+------+------+------+------+------+------+------+
- *  31: 24 |  RSV |  DP  |   POP_CNT   |           CPUQ            |
- *         +------+------+------+------+------+------+------+------+
- *  23: 16 |           CPUQ            |      QOS_CLASS     |TAG_TYPE|
- *         +------+------+------+------+------+------+------+------+
- *  15:  8 |         PCP        |  DEI |            VID            |
- *         +------+------+------+------+------+------+------+------+
- *   7:  0 |                          VID                          |
- *         +------+------+------+------+------+------+------+------+
- *
- * And the extraction header looks like this:
- *
- *         +------+------+------+------+------+------+------+------+
- * 127:120 |  RSV |                  REW_OP                        |
- *         +------+------+------+------+------+------+------+------+
- * 119:112 |       REW_OP       |              REW_VAL             |
- *         +------+------+------+------+------+------+------+------+
- * 111:104 |                         REW_VAL                       |
- *         +------+------+------+------+------+------+------+------+
- * 103: 96 |                         REW_VAL                       |
- *         +------+------+------+------+------+------+------+------+
- *  95: 88 |                         REW_VAL                       |
- *         +------+------+------+------+------+------+------+------+
- *  87: 80 |       REW_VAL      |               LLEN               |
- *         +------+------+------+------+------+------+------+------+
- *  79: 72 | LLEN |                      WLEN                      |
- *         +------+------+------+------+------+------+------+------+
- *  71: 64 | WLEN |                      RSV                       |
- *         +------+------+------+------+------+------+------+------+
- *  63: 56 |                          RSV                          |
- *         +------+------+------+------+------+------+------+------+
- *  55: 48 |                          RSV                          |
- *         +------+------+------+------+------+------+------+------+
- *  47: 40 | RSV  |          SRC_PORT         |       ACL_ID       |
- *         +------+------+------+------+------+------+------+------+
- *  39: 32 |       ACL_ID       |  RSV |         SFLOW_ID          |
- *         +------+------+------+------+------+------+------+------+
- *  31: 24 |ACL_HIT| DP  |  LRN_FLAGS  |           CPUQ            |
- *         +------+------+------+------+------+------+------+------+
- *  23: 16 |           CPUQ            |      QOS_CLASS     |TAG_TYPE|
- *         +------+------+------+------+------+------+------+------+
- *  15:  8 |         PCP        |  DEI |            VID            |
- *         +------+------+------+------+------+------+------+------+
- *   7:  0 |                          VID                          |
- *         +------+------+------+------+------+------+------+------+
- */
-
 static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 				   struct net_device *netdev)
 {
@@ -142,7 +14,6 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 	struct ocelot *ocelot = ds->priv;
 	struct ocelot_port *ocelot_port;
 	u8 *prefix, *injection;
-	u64 qos_class, rew_op;
 
 	ocelot_port = ocelot->ports[dp->index];
 
@@ -155,19 +26,19 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 	/* Fix up the fields which are not statically determined
 	 * in the template
 	 */
-	qos_class = skb->priority;
-	packing(injection, &qos_class, 19,  17, OCELOT_TAG_LEN, PACK, 0);
+	ocelot_ifh_set_qos_class(injection, skb->priority);
 
 	/* TX timestamping was requested */
 	if (clone) {
-		rew_op = ocelot_port->ptp_cmd;
+		u64 rew_op = ocelot_port->ptp_cmd;
+
 		/* Retrieve timestamp ID populated inside skb->cb[0] of the
 		 * clone by ocelot_port_add_txtstamp_skb
 		 */
 		if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP)
 			rew_op |= clone->cb[0] << 3;
 
-		packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0);
+		ocelot_ifh_set_rew_op(injection, rew_op);
 	}
 
 	return skb;
@@ -208,10 +79,10 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	/* Remove from inet csum the extraction header */
 	skb_postpull_rcsum(skb, start, OCELOT_TOTAL_TAG_LEN);
 
-	packing(extraction, &src_port,  46, 43, OCELOT_TAG_LEN, UNPACK, 0);
-	packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0);
-	packing(extraction, &tag_type,  16, 16, OCELOT_TAG_LEN, UNPACK, 0);
-	packing(extraction, &vlan_tci,  15,  0, OCELOT_TAG_LEN, UNPACK, 0);
+	ocelot_xfh_get_src_port(extraction, &src_port);
+	ocelot_xfh_get_qos_class(extraction, &qos_class);
+	ocelot_xfh_get_tag_type(extraction, &tag_type);
+	ocelot_xfh_get_vlan_tci(extraction, &vlan_tci);
 
 	skb->dev = dsa_master_find_slave(netdev, 0, src_port);
 	if (!skb->dev)
-- 
cgit v1.2.3


From 7c4bb540e9173c914c2091fdd9b6aee3c2a3e1e5 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 14 Feb 2021 00:37:58 +0200
Subject: net: dsa: tag_ocelot: create separate tagger for Seville

The ocelot tagger is a hot mess currently, it relies on memory
initialized by the attached driver for basic frame transmission.
This is against all that DSA tagging protocols stand for, which is that
the transmission and reception of a DSA-tagged frame, the data path,
should be independent from the switch control path, because the tag
protocol is in principle hot-pluggable and reusable across switches
(even if in practice it wasn't until very recently). But if another
driver like dsa_loop wants to make use of tag_ocelot, it couldn't.

This was done to have common code between Felix and Ocelot, which have
one bit difference in the frame header format. Quoting from commit
67c2404922c2 ("net: dsa: felix: create a template for the DSA tags on
xmit"):

    Other alternatives have been analyzed, such as:
    - Create a separate tag_seville.c: too much code duplication for just 1
      bit field difference.
    - Create a separate DSA_TAG_PROTO_SEVILLE under tag_ocelot.c, just like
      tag_brcm.c, which would have a separate .xmit function. Again, too
      much code duplication for just 1 bit field difference.
    - Allocate the template from the init function of the tag_ocelot.c
      module, instead of from the driver: couldn't figure out a method of
      accessing the correct port template corresponding to the correct
      tagger in the .xmit function.

The really interesting part is that Seville should have had its own
tagging protocol defined - it is not compatible on the wire with Ocelot,
even for that single bit. In principle, a packet generated by
DSA_TAG_PROTO_OCELOT when booted on NXP LS1028A would look in a certain
way, but when booted on NXP T1040 it would look differently. The reverse
is also true: a packet generated by a Seville switch would be
interpreted incorrectly by Wireshark if it was told it was generated by
an Ocelot switch.

Actually things are a bit more nuanced. If we concentrate only on the
DSA tag, what I said above is true, but Ocelot/Seville also support an
optional DSA tag prefix, which can be short or long, and it is possible
to distinguish the two taggers based on an integer constant put in that
prefix. Nonetheless, creating a separate tagger is still justified,
since the tag prefix is optional, and without it, there is again no way
to distinguish.

Claiming backwards binary compatibility is a bit more tough, since I've
already changed the format of tag_ocelot once, in commit 5124197ce58b
("net: dsa: tag_ocelot: use a short prefix on both ingress and egress").
Therefore I am not very concerned with treating this as a bugfix and
backporting it to stable kernels (which would be another mess due to the
fact that there would be lots of conflicts with the other DSA_TAG_PROTO*
definitions). It's just simpler to say that the string values of the
taggers have ABI value starting with kernel 5.12, which will be when the
changing of tag protocol via /sys/class/net/<dsa-master>/dsa/tagging
goes live.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c           | 18 ++-------
 drivers/net/dsa/ocelot/felix.h           |  1 -
 drivers/net/dsa/ocelot/felix_vsc9959.c   | 26 ------------
 drivers/net/dsa/ocelot/seville_vsc9953.c | 28 +------------
 include/linux/dsa/ocelot.h               | 10 +++++
 include/net/dsa.h                        |  2 +
 net/dsa/tag_ocelot.c                     | 68 +++++++++++++++++++++++++-------
 7 files changed, 70 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 60e0d354adea..7e1f6350139b 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -431,6 +431,7 @@ static int felix_set_tag_protocol(struct dsa_switch *ds, int cpu,
 	int err;
 
 	switch (proto) {
+	case DSA_TAG_PROTO_SEVILLE:
 	case DSA_TAG_PROTO_OCELOT:
 		err = felix_setup_tag_npi(ds, cpu);
 		break;
@@ -448,6 +449,7 @@ static void felix_del_tag_protocol(struct dsa_switch *ds, int cpu,
 				   enum dsa_tag_protocol proto)
 {
 	switch (proto) {
+	case DSA_TAG_PROTO_SEVILLE:
 	case DSA_TAG_PROTO_OCELOT:
 		felix_teardown_tag_npi(ds, cpu);
 		break;
@@ -471,7 +473,8 @@ static int felix_change_tag_protocol(struct dsa_switch *ds, int cpu,
 	enum dsa_tag_protocol old_proto = felix->tag_proto;
 	int err;
 
-	if (proto != DSA_TAG_PROTO_OCELOT &&
+	if (proto != DSA_TAG_PROTO_SEVILLE &&
+	    proto != DSA_TAG_PROTO_OCELOT &&
 	    proto != DSA_TAG_PROTO_OCELOT_8021Q)
 		return -EPROTONOSUPPORT;
 
@@ -1003,7 +1006,6 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports)
 	for (port = 0; port < num_phys_ports; port++) {
 		struct ocelot_port *ocelot_port;
 		struct regmap *target;
-		u8 *template;
 
 		ocelot_port = devm_kzalloc(ocelot->dev,
 					   sizeof(struct ocelot_port),
@@ -1029,22 +1031,10 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports)
 			return PTR_ERR(target);
 		}
 
-		template = devm_kzalloc(ocelot->dev, OCELOT_TOTAL_TAG_LEN,
-					GFP_KERNEL);
-		if (!template) {
-			dev_err(ocelot->dev,
-				"Failed to allocate memory for DSA tag\n");
-			kfree(port_phy_modes);
-			return -ENOMEM;
-		}
-
 		ocelot_port->phy_mode = port_phy_modes[port];
 		ocelot_port->ocelot = ocelot;
 		ocelot_port->target = target;
-		ocelot_port->xmit_template = template;
 		ocelot->ports[port] = ocelot_port;
-
-		felix->info->xmit_template_populate(ocelot, port);
 	}
 
 	kfree(port_phy_modes);
diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h
index 9d4459f2fffb..b2ea425c5803 100644
--- a/drivers/net/dsa/ocelot/felix.h
+++ b/drivers/net/dsa/ocelot/felix.h
@@ -34,7 +34,6 @@ struct felix_info {
 				 enum tc_setup_type type, void *type_data);
 	void	(*port_sched_speed_set)(struct ocelot *ocelot, int port,
 					u32 speed);
-	void	(*xmit_template_populate)(struct ocelot *ocelot, int port);
 };
 
 extern const struct dsa_switch_ops felix_switch_ops;
diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index cacc6f9c0113..32b885fcaf90 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -1339,31 +1339,6 @@ static int vsc9959_port_setup_tc(struct dsa_switch *ds, int port,
 	}
 }
 
-static void vsc9959_xmit_template_populate(struct ocelot *ocelot, int port)
-{
-	struct ocelot_port *ocelot_port = ocelot->ports[port];
-	u8 *template = ocelot_port->xmit_template;
-	u64 bypass, dest, src;
-	__be32 *prefix;
-	u8 *injection;
-
-	/* Set the source port as the CPU port module and not the
-	 * NPI port
-	 */
-	src = ocelot->num_phys_ports;
-	dest = BIT(port);
-	bypass = true;
-
-	injection = template + OCELOT_SHORT_PREFIX_LEN;
-	prefix = (__be32 *)template;
-
-	packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0);
-	packing(injection, &dest,    68,  56, OCELOT_TAG_LEN, PACK, 0);
-	packing(injection, &src,     46,  43, OCELOT_TAG_LEN, PACK, 0);
-
-	*prefix = cpu_to_be32(0x8880000a);
-}
-
 static const struct felix_info felix_info_vsc9959 = {
 	.target_io_res		= vsc9959_target_io_res,
 	.port_io_res		= vsc9959_port_io_res,
@@ -1386,7 +1361,6 @@ static const struct felix_info felix_info_vsc9959 = {
 	.prevalidate_phy_mode	= vsc9959_prevalidate_phy_mode,
 	.port_setup_tc		= vsc9959_port_setup_tc,
 	.port_sched_speed_set	= vsc9959_sched_speed_set,
-	.xmit_template_populate	= vsc9959_xmit_template_populate,
 };
 
 static irqreturn_t felix_irq_handler(int irq, void *data)
diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c
index d7348ea4831e..84f93a874d50 100644
--- a/drivers/net/dsa/ocelot/seville_vsc9953.c
+++ b/drivers/net/dsa/ocelot/seville_vsc9953.c
@@ -1165,31 +1165,6 @@ static void vsc9953_mdio_bus_free(struct ocelot *ocelot)
 	mdiobus_unregister(felix->imdio);
 }
 
-static void vsc9953_xmit_template_populate(struct ocelot *ocelot, int port)
-{
-	struct ocelot_port *ocelot_port = ocelot->ports[port];
-	u8 *template = ocelot_port->xmit_template;
-	u64 bypass, dest, src;
-	__be32 *prefix;
-	u8 *injection;
-
-	/* Set the source port as the CPU port module and not the
-	 * NPI port
-	 */
-	src = ocelot->num_phys_ports;
-	dest = BIT(port);
-	bypass = true;
-
-	injection = template + OCELOT_SHORT_PREFIX_LEN;
-	prefix = (__be32 *)template;
-
-	packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0);
-	packing(injection, &dest,    67,  57, OCELOT_TAG_LEN, PACK, 0);
-	packing(injection, &src,     46,  43, OCELOT_TAG_LEN, PACK, 0);
-
-	*prefix = cpu_to_be32(0x88800005);
-}
-
 static const struct felix_info seville_info_vsc9953 = {
 	.target_io_res		= vsc9953_target_io_res,
 	.port_io_res		= vsc9953_port_io_res,
@@ -1206,7 +1181,6 @@ static const struct felix_info seville_info_vsc9953 = {
 	.mdio_bus_free		= vsc9953_mdio_bus_free,
 	.phylink_validate	= vsc9953_phylink_validate,
 	.prevalidate_phy_mode	= vsc9953_prevalidate_phy_mode,
-	.xmit_template_populate	= vsc9953_xmit_template_populate,
 };
 
 static int seville_probe(struct platform_device *pdev)
@@ -1246,7 +1220,7 @@ static int seville_probe(struct platform_device *pdev)
 	ds->ops = &felix_switch_ops;
 	ds->priv = ocelot;
 	felix->ds = ds;
-	felix->tag_proto = DSA_TAG_PROTO_OCELOT;
+	felix->tag_proto = DSA_TAG_PROTO_SEVILLE;
 
 	err = dsa_register_switch(ds);
 	if (err) {
diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
index 763dbbfaff7a..c6bc45ae5e03 100644
--- a/include/linux/dsa/ocelot.h
+++ b/include/linux/dsa/ocelot.h
@@ -195,6 +195,16 @@ static inline void ocelot_ifh_set_qos_class(void *injection, u64 qos_class)
 	packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0);
 }
 
+static inline void seville_ifh_set_dest(void *injection, u64 dest)
+{
+	packing(injection, &dest, 67, 57, OCELOT_TAG_LEN, PACK, 0);
+}
+
+static inline void ocelot_ifh_set_src(void *injection, u64 src)
+{
+	packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0);
+}
+
 static inline void ocelot_ifh_set_tag_type(void *injection, u64 tag_type)
 {
 	packing(injection, &tag_type, 16, 16, OCELOT_TAG_LEN, PACK, 0);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 74457aaffec7..b095ef114fe8 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -48,6 +48,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_HELLCREEK_VALUE		18
 #define DSA_TAG_PROTO_XRS700X_VALUE		19
 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE	20
+#define DSA_TAG_PROTO_SEVILLE_VALUE		21
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -71,6 +72,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_HELLCREEK		= DSA_TAG_PROTO_HELLCREEK_VALUE,
 	DSA_TAG_PROTO_XRS700X		= DSA_TAG_PROTO_XRS700X_VALUE,
 	DSA_TAG_PROTO_OCELOT_8021Q	= DSA_TAG_PROTO_OCELOT_8021Q_VALUE,
+	DSA_TAG_PROTO_SEVILLE		= DSA_TAG_PROTO_SEVILLE_VALUE,
 };
 
 struct packet_type;
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index fe00519229d7..a7dd61c8e005 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -24,33 +24,52 @@ static void ocelot_xmit_ptp(struct dsa_port *dp, void *injection,
 	ocelot_ifh_set_rew_op(injection, rew_op);
 }
 
-static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
-				   struct net_device *netdev)
+static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
+			       __be32 ifh_prefix, void **ifh)
 {
 	struct dsa_port *dp = dsa_slave_to_port(netdev);
 	struct sk_buff *clone = DSA_SKB_CB(skb)->clone;
 	struct dsa_switch *ds = dp->ds;
-	struct ocelot *ocelot = ds->priv;
-	struct ocelot_port *ocelot_port;
-	u8 *prefix, *injection;
-
-	ocelot_port = ocelot->ports[dp->index];
+	void *injection;
+	__be32 *prefix;
 
 	injection = skb_push(skb, OCELOT_TAG_LEN);
-
 	prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN);
 
-	memcpy(prefix, ocelot_port->xmit_template, OCELOT_TOTAL_TAG_LEN);
-
-	/* Fix up the fields which are not statically determined
-	 * in the template
-	 */
+	*prefix = ifh_prefix;
+	memset(injection, 0, OCELOT_TAG_LEN);
+	ocelot_ifh_set_bypass(injection, 1);
+	ocelot_ifh_set_src(injection, ds->num_ports);
 	ocelot_ifh_set_qos_class(injection, skb->priority);
 
 	/* TX timestamping was requested */
 	if (clone)
 		ocelot_xmit_ptp(dp, injection, clone);
 
+	*ifh = injection;
+}
+
+static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
+				   struct net_device *netdev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	void *injection;
+
+	ocelot_xmit_common(skb, netdev, cpu_to_be32(0x8880000a), &injection);
+	ocelot_ifh_set_dest(injection, BIT(dp->index));
+
+	return skb;
+}
+
+static struct sk_buff *seville_xmit(struct sk_buff *skb,
+				    struct net_device *netdev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	void *injection;
+
+	ocelot_xmit_common(skb, netdev, cpu_to_be32(0x88800005), &injection);
+	seville_ifh_set_dest(injection, BIT(dp->index));
+
 	return skb;
 }
 
@@ -147,7 +166,26 @@ static const struct dsa_device_ops ocelot_netdev_ops = {
 	.promisc_on_master	= true,
 };
 
-MODULE_LICENSE("GPL v2");
+DSA_TAG_DRIVER(ocelot_netdev_ops);
 MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT);
 
-module_dsa_tag_driver(ocelot_netdev_ops);
+static const struct dsa_device_ops seville_netdev_ops = {
+	.name			= "seville",
+	.proto			= DSA_TAG_PROTO_SEVILLE,
+	.xmit			= seville_xmit,
+	.rcv			= ocelot_rcv,
+	.overhead		= OCELOT_TOTAL_TAG_LEN,
+	.promisc_on_master	= true,
+};
+
+DSA_TAG_DRIVER(seville_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SEVILLE);
+
+static struct dsa_tag_driver *ocelot_tag_driver_array[] = {
+	&DSA_TAG_DRIVER_NAME(ocelot_netdev_ops),
+	&DSA_TAG_DRIVER_NAME(seville_netdev_ops),
+};
+
+module_dsa_tag_drivers(ocelot_tag_driver_array);
+
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From b2498cb87c4ba87580e5975e049d589b6786ff75 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 22 Jan 2021 14:38:52 +0200
Subject: gpio: aggregator: Use compound literal from the header

Instead of doing it in place, convert GPIO_LOOKUP_IDX() and GPIO_HOG()
to be compund literals that's allow to use them as rvalue in assignments.

Due to above conversion, use compound literal from the header
in the gpio-aggregator.c.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/gpio/gpio-aggregator.c | 3 +--
 include/linux/gpio/machine.h   | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c
index 40a081b095fb..13e473c97ce4 100644
--- a/drivers/gpio/gpio-aggregator.c
+++ b/drivers/gpio/gpio-aggregator.c
@@ -72,8 +72,7 @@ static int aggr_add_gpio(struct gpio_aggregator *aggr, const char *key,
 	if (!lookups)
 		return -ENOMEM;
 
-	lookups->table[*n] =
-		(struct gpiod_lookup)GPIO_LOOKUP_IDX(key, hwnum, NULL, *n, 0);
+	lookups->table[*n] = GPIO_LOOKUP_IDX(key, hwnum, NULL, *n, 0);
 
 	(*n)++;
 	memset(&lookups->table[*n], 0, sizeof(lookups->table[*n]));
diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h
index 781a053abbb9..d755e529c1e3 100644
--- a/include/linux/gpio/machine.h
+++ b/include/linux/gpio/machine.h
@@ -75,7 +75,7 @@ struct gpiod_hog {
  * gpiod_get_index()
  */
 #define GPIO_LOOKUP_IDX(_key, _chip_hwnum, _con_id, _idx, _flags)         \
-{                                                                         \
+(struct gpiod_lookup) {                                                   \
 	.key = _key,                                                      \
 	.chip_hwnum = _chip_hwnum,                                        \
 	.con_id = _con_id,                                                \
@@ -87,7 +87,7 @@ struct gpiod_hog {
  * Simple definition of a single GPIO hog in an array.
  */
 #define GPIO_HOG(_chip_label, _chip_hwnum, _line_name, _lflags, _dflags)  \
-{                                                                         \
+(struct gpiod_hog) {                                                      \
 	.chip_label = _chip_label,                                        \
 	.chip_hwnum = _chip_hwnum,                                        \
 	.line_name = _line_name,                                          \
-- 
cgit v1.2.3


From 3019270282a175defc02c8331786c73e082cd2a8 Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu@kernel.org>
Date: Mon, 15 Feb 2021 10:44:58 +0000
Subject: Revert "Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the
 ring buffer"

This reverts commit a8c3209998afb5c4941b49e35b513cea9050cb4a.

It is reported that the said commit caused regression in netvsc.

Reported-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/channel.c              |  9 ++---
 drivers/hv/hv_fcopy.c             |  1 -
 drivers/hv/hv_kvp.c               |  1 -
 drivers/hv/hyperv_vmbus.h         |  2 +-
 drivers/hv/ring_buffer.c          | 82 ++++++---------------------------------
 drivers/net/hyperv/hyperv_net.h   |  3 --
 drivers/net/hyperv/netvsc.c       |  2 -
 drivers/net/hyperv/rndis_filter.c |  2 -
 drivers/scsi/storvsc_drv.c        | 10 -----
 include/linux/hyperv.h            | 48 ++++-------------------
 net/vmw_vsock/hyperv_transport.c  |  4 +-
 11 files changed, 25 insertions(+), 139 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 9aa789e5f22b..0bd202de7960 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -597,15 +597,12 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
 	newchannel->onchannel_callback = onchannelcallback;
 	newchannel->channel_callback_context = context;
 
-	if (!newchannel->max_pkt_size)
-		newchannel->max_pkt_size = VMBUS_DEFAULT_MAX_PKT_SIZE;
-
-	err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages, 0);
+	err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages);
 	if (err)
 		goto error_clean_ring;
 
-	err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages],
-				 recv_pages, newchannel->max_pkt_size);
+	err = hv_ringbuffer_init(&newchannel->inbound,
+				 &page[send_pages], recv_pages);
 	if (err)
 		goto error_clean_ring;
 
diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c
index 660036da7449..59ce85e00a02 100644
--- a/drivers/hv/hv_fcopy.c
+++ b/drivers/hv/hv_fcopy.c
@@ -349,7 +349,6 @@ int hv_fcopy_init(struct hv_util_service *srv)
 {
 	recv_buffer = srv->recv_buffer;
 	fcopy_transaction.recv_channel = srv->channel;
-	fcopy_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2;
 
 	/*
 	 * When this driver loads, the user level daemon that
diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index c698592b83e4..b49962d312ce 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -757,7 +757,6 @@ hv_kvp_init(struct hv_util_service *srv)
 {
 	recv_buffer = srv->recv_buffer;
 	kvp_transaction.recv_channel = srv->channel;
-	kvp_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 4;
 
 	/*
 	 * When this driver loads, the user level daemon that
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 42f3d9d123a1..9416e09ebd58 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -174,7 +174,7 @@ extern int hv_synic_cleanup(unsigned int cpu);
 void hv_ringbuffer_pre_init(struct vmbus_channel *channel);
 
 int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
-		       struct page *pages, u32 pagecnt, u32 max_pkt_size);
+		       struct page *pages, u32 pagecnt);
 
 void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info);
 
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 29e90477363a..35833d4d1a1d 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -190,7 +190,7 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
 
 /* Initialize the ring buffer. */
 int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
-		       struct page *pages, u32 page_cnt, u32 max_pkt_size)
+		       struct page *pages, u32 page_cnt)
 {
 	int i;
 	struct page **pages_wraparound;
@@ -232,14 +232,6 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
 		sizeof(struct hv_ring_buffer);
 	ring_info->priv_read_index = 0;
 
-	/* Initialize buffer that holds copies of incoming packets */
-	if (max_pkt_size) {
-		ring_info->pkt_buffer = kzalloc(max_pkt_size, GFP_KERNEL);
-		if (!ring_info->pkt_buffer)
-			return -ENOMEM;
-		ring_info->pkt_buffer_size = max_pkt_size;
-	}
-
 	spin_lock_init(&ring_info->ring_lock);
 
 	return 0;
@@ -252,9 +244,6 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
 	vunmap(ring_info->ring_buffer);
 	ring_info->ring_buffer = NULL;
 	mutex_unlock(&ring_info->ring_buffer_mutex);
-
-	kfree(ring_info->pkt_buffer);
-	ring_info->pkt_buffer_size = 0;
 }
 
 /* Write to the ring buffer. */
@@ -396,7 +385,7 @@ int hv_ringbuffer_read(struct vmbus_channel *channel,
 	memcpy(buffer, (const char *)desc + offset, packetlen);
 
 	/* Advance ring index to next packet descriptor */
-	__hv_pkt_iter_next(channel, desc, true);
+	__hv_pkt_iter_next(channel, desc);
 
 	/* Notify host of update */
 	hv_pkt_iter_close(channel);
@@ -422,22 +411,6 @@ static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
 		return (rbi->ring_datasize - priv_read_loc) + write_loc;
 }
 
-/*
- * Get first vmbus packet without copying it out of the ring buffer
- */
-struct vmpacket_descriptor *hv_pkt_iter_first_raw(struct vmbus_channel *channel)
-{
-	struct hv_ring_buffer_info *rbi = &channel->inbound;
-
-	hv_debug_delay_test(channel, MESSAGE_DELAY);
-
-	if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
-		return NULL;
-
-	return (struct vmpacket_descriptor *)(hv_get_ring_buffer(rbi) + rbi->priv_read_index);
-}
-EXPORT_SYMBOL_GPL(hv_pkt_iter_first_raw);
-
 /*
  * Get first vmbus packet from ring buffer after read_index
  *
@@ -446,49 +419,17 @@ EXPORT_SYMBOL_GPL(hv_pkt_iter_first_raw);
 struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel)
 {
 	struct hv_ring_buffer_info *rbi = &channel->inbound;
-	struct vmpacket_descriptor *desc, *desc_copy;
-	u32 bytes_avail, pkt_len, pkt_offset;
+	struct vmpacket_descriptor *desc;
 
-	desc = hv_pkt_iter_first_raw(channel);
-	if (!desc)
+	hv_debug_delay_test(channel, MESSAGE_DELAY);
+	if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
 		return NULL;
 
-	bytes_avail = min(rbi->pkt_buffer_size, hv_pkt_iter_avail(rbi));
-
-	/*
-	 * Ensure the compiler does not use references to incoming Hyper-V values (which
-	 * could change at any moment) when reading local variables later in the code
-	 */
-	pkt_len = READ_ONCE(desc->len8) << 3;
-	pkt_offset = READ_ONCE(desc->offset8) << 3;
-
-	/*
-	 * If pkt_len is invalid, set it to the smaller of hv_pkt_iter_avail() and
-	 * rbi->pkt_buffer_size
-	 */
-	if (pkt_len < sizeof(struct vmpacket_descriptor) || pkt_len > bytes_avail)
-		pkt_len = bytes_avail;
-
-	/*
-	 * If pkt_offset is invalid, arbitrarily set it to
-	 * the size of vmpacket_descriptor
-	 */
-	if (pkt_offset < sizeof(struct vmpacket_descriptor) || pkt_offset > pkt_len)
-		pkt_offset = sizeof(struct vmpacket_descriptor);
-
-	/* Copy the Hyper-V packet out of the ring buffer */
-	desc_copy = (struct vmpacket_descriptor *)rbi->pkt_buffer;
-	memcpy(desc_copy, desc, pkt_len);
-
-	/*
-	 * Hyper-V could still change len8 and offset8 after the earlier read.
-	 * Ensure that desc_copy has legal values for len8 and offset8 that
-	 * are consistent with the copy we just made
-	 */
-	desc_copy->len8 = pkt_len >> 3;
-	desc_copy->offset8 = pkt_offset >> 3;
+	desc = hv_get_ring_buffer(rbi) + rbi->priv_read_index;
+	if (desc)
+		prefetch((char *)desc + (desc->len8 << 3));
 
-	return desc_copy;
+	return desc;
 }
 EXPORT_SYMBOL_GPL(hv_pkt_iter_first);
 
@@ -500,8 +441,7 @@ EXPORT_SYMBOL_GPL(hv_pkt_iter_first);
  */
 struct vmpacket_descriptor *
 __hv_pkt_iter_next(struct vmbus_channel *channel,
-		   const struct vmpacket_descriptor *desc,
-		   bool copy)
+		   const struct vmpacket_descriptor *desc)
 {
 	struct hv_ring_buffer_info *rbi = &channel->inbound;
 	u32 packetlen = desc->len8 << 3;
@@ -514,7 +454,7 @@ __hv_pkt_iter_next(struct vmbus_channel *channel,
 		rbi->priv_read_index -= dsize;
 
 	/* more data? */
-	return copy ? hv_pkt_iter_first(channel) : hv_pkt_iter_first_raw(channel);
+	return hv_pkt_iter_first(channel);
 }
 EXPORT_SYMBOL_GPL(__hv_pkt_iter_next);
 
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 7ea6936f86ef..2a87cfa27ac0 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -860,12 +860,9 @@ static inline u32 netvsc_rqstor_size(unsigned long ringbytes)
 		ringbytes / NETVSC_MIN_IN_MSG_SIZE;
 }
 
-#define NETVSC_MAX_XFER_PAGE_RANGES 375
 #define NETVSC_XFER_HEADER_SIZE(rng_cnt) \
 		(offsetof(struct vmtransfer_page_packet_header, ranges) + \
 		(rng_cnt) * sizeof(struct vmtransfer_page_range))
-#define NETVSC_MAX_PKT_SIZE (NETVSC_XFER_HEADER_SIZE(NETVSC_MAX_XFER_PAGE_RANGES) + \
-		sizeof(struct nvsp_message) + (sizeof(u32) * VRSS_SEND_TAB_SIZE))
 
 struct multi_send_data {
 	struct sk_buff *skb; /* skb containing the pkt */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 51005f2d4a82..2353623259f3 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1544,8 +1544,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
 
 	/* Open the channel */
 	device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
-	device->channel->max_pkt_size = NETVSC_MAX_PKT_SIZE;
-
 	ret = vmbus_open(device->channel, netvsc_ring_bytes,
 			 netvsc_ring_bytes,  NULL, 0,
 			 netvsc_channel_cb, net_device->chan_table);
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 7e6dee2f02a4..598713c0d5a8 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1174,8 +1174,6 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 	nvchan->channel = new_sc;
 
 	new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
-	new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE;
-
 	ret = vmbus_open(new_sc, netvsc_ring_bytes,
 			 netvsc_ring_bytes, NULL, 0,
 			 netvsc_channel_cb, nvchan);
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 7e59284dbf5b..2e4fa77445fd 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -414,14 +414,6 @@ static void storvsc_on_channel_callback(void *context);
 #define STORVSC_IDE_MAX_TARGETS				1
 #define STORVSC_IDE_MAX_CHANNELS			1
 
-/*
- * Upper bound on the size of a storvsc packet. vmscsi_size_delta is not
- * included in the calculation because it is set after STORVSC_MAX_PKT_SIZE
- * is used in storvsc_connect_to_vsp
- */
-#define STORVSC_MAX_PKT_SIZE (sizeof(struct vmpacket_descriptor) +\
-			      sizeof(struct vstor_packet))
-
 struct storvsc_cmd_request {
 	struct scsi_cmnd *cmd;
 
@@ -706,7 +698,6 @@ static void handle_sc_creation(struct vmbus_channel *new_sc)
 		return;
 
 	memset(&props, 0, sizeof(struct vmstorage_channel_properties));
-	new_sc->max_pkt_size = STORVSC_MAX_PKT_SIZE;
 
 	/*
 	 * The size of vmbus_requestor is an upper bound on the number of requests
@@ -1289,7 +1280,6 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size,
 
 	memset(&props, 0, sizeof(struct vmstorage_channel_properties));
 
-	device->channel->max_pkt_size = STORVSC_MAX_PKT_SIZE;
 	/*
 	 * The size of vmbus_requestor is an upper bound on the number of requests
 	 * that can be in-progress at any one time across all channels.
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 9dd22af1b7f6..f1d74dcf0353 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -181,10 +181,6 @@ struct hv_ring_buffer_info {
 	 * being freed while the ring buffer is being accessed.
 	 */
 	struct mutex ring_buffer_mutex;
-
-	/* Buffer that holds a copy of an incoming host packet */
-	void *pkt_buffer;
-	u32 pkt_buffer_size;
 };
 
 
@@ -792,8 +788,6 @@ struct vmbus_device {
 	bool allowed_in_isolated;
 };
 
-#define VMBUS_DEFAULT_MAX_PKT_SIZE 4096
-
 struct vmbus_channel {
 	struct list_head listentry;
 
@@ -1016,9 +1010,6 @@ struct vmbus_channel {
 	/* request/transaction ids for VMBus */
 	struct vmbus_requestor requestor;
 	u32 rqstor_size;
-
-	/* The max size of a packet on this channel */
-	u32 max_pkt_size;
 };
 
 u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr);
@@ -1660,44 +1651,15 @@ static inline u32 hv_pkt_datalen(const struct vmpacket_descriptor *desc)
 }
 
 
-struct vmpacket_descriptor *
-hv_pkt_iter_first_raw(struct vmbus_channel *channel);
-
 struct vmpacket_descriptor *
 hv_pkt_iter_first(struct vmbus_channel *channel);
 
 struct vmpacket_descriptor *
 __hv_pkt_iter_next(struct vmbus_channel *channel,
-		   const struct vmpacket_descriptor *pkt,
-		   bool copy);
+		   const struct vmpacket_descriptor *pkt);
 
 void hv_pkt_iter_close(struct vmbus_channel *channel);
 
-static inline struct vmpacket_descriptor *
-hv_pkt_iter_next_pkt(struct vmbus_channel *channel,
-		     const struct vmpacket_descriptor *pkt,
-		     bool copy)
-{
-	struct vmpacket_descriptor *nxt;
-
-	nxt = __hv_pkt_iter_next(channel, pkt, copy);
-	if (!nxt)
-		hv_pkt_iter_close(channel);
-
-	return nxt;
-}
-
-/*
- * Get next packet descriptor without copying it out of the ring buffer
- * If at end of list, return NULL and update host.
- */
-static inline struct vmpacket_descriptor *
-hv_pkt_iter_next_raw(struct vmbus_channel *channel,
-		     const struct vmpacket_descriptor *pkt)
-{
-	return hv_pkt_iter_next_pkt(channel, pkt, false);
-}
-
 /*
  * Get next packet descriptor from iterator
  * If at end of list, return NULL and update host.
@@ -1706,7 +1668,13 @@ static inline struct vmpacket_descriptor *
 hv_pkt_iter_next(struct vmbus_channel *channel,
 		 const struct vmpacket_descriptor *pkt)
 {
-	return hv_pkt_iter_next_pkt(channel, pkt, true);
+	struct vmpacket_descriptor *nxt;
+
+	nxt = __hv_pkt_iter_next(channel, pkt);
+	if (!nxt)
+		hv_pkt_iter_close(channel);
+
+	return nxt;
 }
 
 #define foreach_vmbus_pkt(pkt, channel) \
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index cd8b7c1ca9f1..630b851f8150 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -600,7 +600,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
 		return -EOPNOTSUPP;
 
 	if (need_refill) {
-		hvs->recv_desc = hv_pkt_iter_first_raw(hvs->chan);
+		hvs->recv_desc = hv_pkt_iter_first(hvs->chan);
 		ret = hvs_update_recv_data(hvs);
 		if (ret)
 			return ret;
@@ -614,7 +614,7 @@ static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
 
 	hvs->recv_data_len -= to_read;
 	if (hvs->recv_data_len == 0) {
-		hvs->recv_desc = hv_pkt_iter_next_raw(hvs->chan, hvs->recv_desc);
+		hvs->recv_desc = hv_pkt_iter_next(hvs->chan, hvs->recv_desc);
 		if (hvs->recv_desc) {
 			ret = hvs_update_recv_data(hvs);
 			if (ret)
-- 
cgit v1.2.3


From 2347961b11d4079deace3c81dceed460c08a8fc1 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <laurent@vivier.eu>
Date: Tue, 28 Jan 2020 14:25:39 +0100
Subject: binfmt_misc: pass binfmt_misc flags to the interpreter

It can be useful to the interpreter to know which flags are in use.

For instance, knowing if the preserve-argv[0] is in use would
allow to skip the pathname argument.

This patch uses an unused auxiliary vector, AT_FLAGS, to add a
flag to inform interpreter if the preserve-argv[0] is enabled.

Note by Helge Deller:
The real-world user of this patch is qemu-user, which needs to know
if it has to preserve the argv[0]. See Debian bug #970460.

Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Reviewed-by: YunQiang Su <ysu@wavecomp.com>
URL: http://bugs.debian.org/970460
Signed-off-by: Helge Deller <deller@gmx.de>
---
 fs/binfmt_elf.c              | 5 ++++-
 fs/binfmt_elf_fdpic.c        | 5 ++++-
 fs/binfmt_misc.c             | 4 +++-
 include/linux/binfmts.h      | 4 ++++
 include/uapi/linux/binfmts.h | 4 ++++
 5 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 950bc177238a..11fb68e68f65 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -186,6 +186,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
 	unsigned char k_rand_bytes[16];
 	int items;
 	elf_addr_t *elf_info;
+	elf_addr_t flags = 0;
 	int ei_index;
 	const struct cred *cred = current_cred();
 	struct vm_area_struct *vma;
@@ -260,7 +261,9 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
 	NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
 	NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
 	NEW_AUX_ENT(AT_BASE, interp_load_addr);
-	NEW_AUX_ENT(AT_FLAGS, 0);
+	if (bprm->interp_flags & BINPRM_FLAGS_PRESERVE_ARGV0)
+		flags |= AT_FLAGS_PRESERVE_ARGV0;
+	NEW_AUX_ENT(AT_FLAGS, flags);
 	NEW_AUX_ENT(AT_ENTRY, e_entry);
 	NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid));
 	NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid));
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index be4062b8ba75..7a78c91bc1c1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -506,6 +506,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	char __user *u_platform, *u_base_platform, *p;
 	int loop;
 	int nr;	/* reset for each csp adjustment */
+	unsigned long flags = 0;
 
 #ifdef CONFIG_MMU
 	/* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
@@ -648,7 +649,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	NEW_AUX_ENT(AT_PHENT,	sizeof(struct elf_phdr));
 	NEW_AUX_ENT(AT_PHNUM,	exec_params->hdr.e_phnum);
 	NEW_AUX_ENT(AT_BASE,	interp_params->elfhdr_addr);
-	NEW_AUX_ENT(AT_FLAGS,	0);
+	if (bprm->interp_flags & BINPRM_FLAGS_PRESERVE_ARGV0)
+		flags |= AT_FLAGS_PRESERVE_ARGV0;
+	NEW_AUX_ENT(AT_FLAGS,	flags);
 	NEW_AUX_ENT(AT_ENTRY,	exec_params->entry_addr);
 	NEW_AUX_ENT(AT_UID,	(elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid));
 	NEW_AUX_ENT(AT_EUID,	(elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid));
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 3880a82da1dc..c457334de43f 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -153,7 +153,9 @@ static int load_misc_binary(struct linux_binprm *bprm)
 	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
 		goto ret;
 
-	if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
+	if (fmt->flags & MISC_FMT_PRESERVE_ARGV0) {
+		bprm->interp_flags |= BINPRM_FLAGS_PRESERVE_ARGV0;
+	} else {
 		retval = remove_arg_zero(bprm);
 		if (retval)
 			goto ret;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 0571701ab1c5..0abd93efc181 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -73,6 +73,10 @@ struct linux_binprm {
 #define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
 #define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
 
+/* preserve argv0 for the interpreter  */
+#define BINPRM_FLAGS_PRESERVE_ARGV0_BIT 3
+#define BINPRM_FLAGS_PRESERVE_ARGV0 (1 << BINPRM_FLAGS_PRESERVE_ARGV0_BIT)
+
 /* Function parameter for binfmt->coredump */
 struct coredump_params {
 	const kernel_siginfo_t *siginfo;
diff --git a/include/uapi/linux/binfmts.h b/include/uapi/linux/binfmts.h
index 689025d9c185..c6f9450efc12 100644
--- a/include/uapi/linux/binfmts.h
+++ b/include/uapi/linux/binfmts.h
@@ -18,4 +18,8 @@ struct pt_regs;
 /* sizeof(linux_binprm->buf) */
 #define BINPRM_BUF_SIZE 256
 
+/* preserve argv0 for the interpreter  */
+#define AT_FLAGS_PRESERVE_ARGV0_BIT 0
+#define AT_FLAGS_PRESERVE_ARGV0 (1 << AT_FLAGS_PRESERVE_ARGV0_BIT)
+
 #endif /* _UAPI_LINUX_BINFMTS_H */
-- 
cgit v1.2.3


From 4590d98f5a4f466d17e5c81d7c9fc796da9a8cee Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 11 Feb 2021 15:40:02 +0200
Subject: sfi: Remove framework for deprecated firmware

SFI-based platforms are gone. So does this framework.

This removes mention of SFI through the drivers and other code as well.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/ABI/testing/sysfs-firmware-sfi       |  15 -
 Documentation/ABI/testing/sysfs-platform-kim       |   2 +-
 MAINTAINERS                                        |   7 -
 arch/x86/Kconfig                                   |   7 +-
 arch/x86/include/asm/intel-mid.h                   |  37 --
 arch/x86/include/asm/intel_scu_ipc_legacy.h        |  58 +--
 arch/x86/include/asm/platform_sst_audio.h          |   2 -
 arch/x86/kernel/apic/io_apic.c                     |   4 +-
 arch/x86/kernel/setup.c                            |   2 -
 arch/x86/pci/mmconfig-shared.c                     |   6 +-
 arch/x86/platform/Makefile                         |   1 -
 arch/x86/platform/intel-mid/Makefile               |   5 -
 arch/x86/platform/intel-mid/device_libs/Makefile   |  23 -
 .../intel-mid/device_libs/platform_bcm43xx.c       | 101 ----
 .../intel-mid/device_libs/platform_bma023.c        |  16 -
 .../platform/intel-mid/device_libs/platform_bt.c   | 101 ----
 .../intel-mid/device_libs/platform_emc1403.c       |  39 --
 .../intel-mid/device_libs/platform_gpio_keys.c     |  81 ----
 .../intel-mid/device_libs/platform_lis331.c        |  37 --
 .../intel-mid/device_libs/platform_max7315.c       |  77 ---
 .../intel-mid/device_libs/platform_mpu3050.c       |  32 --
 .../intel-mid/device_libs/platform_mrfld_pinctrl.c |  39 --
 .../intel-mid/device_libs/platform_mrfld_rtc.c     |  44 --
 .../intel-mid/device_libs/platform_mrfld_sd.c      |  43 --
 .../intel-mid/device_libs/platform_mrfld_spidev.c  |  50 --
 .../intel-mid/device_libs/platform_pcal9555a.c     |  95 ----
 .../intel-mid/device_libs/platform_tc35876x.c      |  42 --
 .../intel-mid/device_libs/platform_tca6416.c       |  53 ---
 arch/x86/platform/intel-mid/intel-mid.c            |   1 -
 arch/x86/platform/intel-mid/sfi.c                  | 419 -----------------
 arch/x86/platform/sfi/Makefile                     |   2 -
 arch/x86/platform/sfi/sfi.c                        | 100 ----
 drivers/Makefile                                   |   2 +-
 drivers/platform/x86/intel_scu_pcidrv.c            |  22 +-
 drivers/sfi/Kconfig                                |  18 -
 drivers/sfi/Makefile                               |   4 -
 drivers/sfi/sfi_acpi.c                             | 214 ---------
 drivers/sfi/sfi_core.c                             | 522 ---------------------
 drivers/sfi/sfi_core.h                             |  81 ----
 include/linux/sfi.h                                | 210 ---------
 include/linux/sfi_acpi.h                           |  93 ----
 init/main.c                                        |   2 -
 42 files changed, 14 insertions(+), 2695 deletions(-)
 delete mode 100644 Documentation/ABI/testing/sysfs-firmware-sfi
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/Makefile
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_bma023.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_bt.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_lis331.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_max7315.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
 delete mode 100644 arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
 delete mode 100644 arch/x86/platform/intel-mid/sfi.c
 delete mode 100644 arch/x86/platform/sfi/Makefile
 delete mode 100644 arch/x86/platform/sfi/sfi.c
 delete mode 100644 drivers/sfi/Kconfig
 delete mode 100644 drivers/sfi/Makefile
 delete mode 100644 drivers/sfi/sfi_acpi.c
 delete mode 100644 drivers/sfi/sfi_core.c
 delete mode 100644 drivers/sfi/sfi_core.h
 delete mode 100644 include/linux/sfi.h
 delete mode 100644 include/linux/sfi_acpi.h

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-firmware-sfi b/Documentation/ABI/testing/sysfs-firmware-sfi
deleted file mode 100644
index 5210e0f06ddb..000000000000
--- a/Documentation/ABI/testing/sysfs-firmware-sfi
+++ /dev/null
@@ -1,15 +0,0 @@
-What:		/sys/firmware/sfi/tables/
-Date:		May 2010
-Contact:	Len Brown <lenb@kernel.org>
-Description:
-		SFI defines a number of small static memory tables
-		so the kernel can get platform information from firmware.
-
-		The tables are defined in the latest SFI specification:
-		http://simplefirmware.org/documentation
-
-		While the tables are used by the kernel, user-space
-		can observe them this way::
-
-		  # cd /sys/firmware/sfi/tables
-		  # cat $TABLENAME > $TABLENAME.bin
diff --git a/Documentation/ABI/testing/sysfs-platform-kim b/Documentation/ABI/testing/sysfs-platform-kim
index a7f81de68046..6a52d6d2b601 100644
--- a/Documentation/ABI/testing/sysfs-platform-kim
+++ b/Documentation/ABI/testing/sysfs-platform-kim
@@ -7,7 +7,7 @@ Description:
 		is connected. example: "/dev/ttyS0".
 
 		The device name flows down to architecture specific board
-		initialization file from the SFI/ATAGS bootloader
+		initialization file from the ATAGS bootloader
 		firmware. The name exposed is read from the user-space
 		dameon and opens the device when install is requested.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 8fdb7fe27537..fd41b96d71bb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16234,13 +16234,6 @@ S:	Maintained
 F:	Documentation/fb/sm712fb.rst
 F:	drivers/video/fbdev/sm712*
 
-SIMPLE FIRMWARE INTERFACE (SFI)
-S:	Obsolete
-W:	http://simplefirmware.org/
-F:	arch/x86/platform/sfi/
-F:	drivers/sfi/
-F:	include/linux/sfi*.h
-
 SIMPLEFB FB DRIVER
 M:	Hans de Goede <hdegoede@redhat.com>
 L:	linux-fbdev@vger.kernel.org
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc73f98c3e42..7f5b77055266 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -444,7 +444,7 @@ config X86_X2APIC
 	  If you don't know what to do here, say N.
 
 config X86_MPPARSE
-	bool "Enable MPS table" if ACPI || SFI
+	bool "Enable MPS table" if ACPI
 	default y
 	depends on X86_LOCAL_APIC
 	help
@@ -603,7 +603,6 @@ config X86_INTEL_MID
 	depends on PCI
 	depends on X86_64 || (PCI_GOANY && X86_32)
 	depends on X86_IO_APIC
-	select SFI
 	select I2C
 	select DW_APB_TIMER
 	select APB_TIMER
@@ -2457,8 +2456,6 @@ source "kernel/power/Kconfig"
 
 source "drivers/acpi/Kconfig"
 
-source "drivers/sfi/Kconfig"
-
 config X86_APM_BOOT
 	def_bool y
 	depends on APM
@@ -2645,7 +2642,7 @@ config PCI_DIRECT
 config PCI_MMCONFIG
 	bool "Support mmconfig PCI config space access" if X86_64
 	default y
-	depends on PCI && (ACPI || SFI || JAILHOUSE_GUEST)
+	depends on PCI && (ACPI || JAILHOUSE_GUEST)
 	depends on X86_64 || (PCI_GOANY || PCI_GOMMCONFIG)
 
 config PCI_OLPC
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index 7bda0587cf70..6306fe3e5c4a 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -7,7 +7,6 @@
 #ifndef _ASM_X86_INTEL_MID_H
 #define _ASM_X86_INTEL_MID_H
 
-#include <linux/sfi.h>
 #include <linux/pci.h>
 #include <linux/platform_device.h>
 
@@ -22,39 +21,6 @@ extern void intel_mid_pwr_power_off(void);
 
 extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev);
 
-extern int get_gpio_by_name(const char *name);
-
-/*
- * Here defines the array of devices platform data that IAFW would export
- * through SFI "DEVS" table, we use name and type to match the device and
- * its platform data.
- */
-struct devs_id {
-	char name[SFI_NAME_LEN + 1];
-	u8 type;
-	u8 delay;
-	void *(*get_platform_data)(void *info);
-};
-
-#define sfi_device(i)								\
-	static const struct devs_id *const __intel_mid_sfi_##i##_dev __used	\
-	__section(".x86_intel_mid_dev.init") = &i
-
-/**
-* struct mid_sd_board_info - template for SD device creation
-* @name:		identifies the driver
-* @bus_num:		board-specific identifier for a given SD controller
-* @max_clk:		the maximum frequency device supports
-* @platform_data:	the particular data stored there is driver-specific
-*/
-struct mid_sd_board_info {
-	char		name[SFI_NAME_LEN];
-	int		bus_num;
-	unsigned short	addr;
-	u32		max_clk;
-	void		*platform_data;
-};
-
 /*
  * Medfield is the follow-up of Moorestown, it combines two chip solution into
  * one. Other than that it also added always-on and constant tsc and lapic
@@ -99,7 +65,4 @@ static inline void intel_scu_devices_destroy(void) { }
 /* FSB 83MHz */
 #define BSEL_SOC_FUSE_111		0x7
 
-/* The offset for the mapping of global gpio pin to irq */
-#define INTEL_MID_IRQ_OFFSET		0x100
-
 #endif /* _ASM_X86_INTEL_MID_H */
diff --git a/arch/x86/include/asm/intel_scu_ipc_legacy.h b/arch/x86/include/asm/intel_scu_ipc_legacy.h
index f8b87d8face7..fa529e5ec142 100644
--- a/arch/x86/include/asm/intel_scu_ipc_legacy.h
+++ b/arch/x86/include/asm/intel_scu_ipc_legacy.h
@@ -2,73 +2,17 @@
 #ifndef _ASM_X86_INTEL_SCU_IPC_LEGACY_H_
 #define _ASM_X86_INTEL_SCU_IPC_LEGACY_H_
 
-#include <linux/notifier.h>
-
-#define IPCMSG_INDIRECT_READ	0x02
-#define IPCMSG_INDIRECT_WRITE	0x05
+#include <linux/types.h>
 
 #define IPCMSG_COLD_OFF		0x80	/* Only for Tangier */
-
-#define IPCMSG_WARM_RESET	0xF0
 #define IPCMSG_COLD_RESET	0xF1
-#define IPCMSG_SOFT_RESET	0xF2
-#define IPCMSG_COLD_BOOT	0xF3
 
 /* Don't call these in new code - they will be removed eventually */
 
-/* Read a vector */
-static inline int intel_scu_ipc_readv(u16 *addr, u8 *data, int len)
-{
-	return intel_scu_ipc_dev_readv(NULL, addr, data, len);
-}
-
-/* Write a vector */
-static inline int intel_scu_ipc_writev(u16 *addr, u8 *data, int len)
-{
-	return intel_scu_ipc_dev_writev(NULL, addr, data, len);
-}
-
-/* Update single register based on the mask */
-static inline int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask)
-{
-	return intel_scu_ipc_dev_update(NULL, addr, data, mask);
-}
-
 /* Issue commands to the SCU with or without data */
 static inline int intel_scu_ipc_simple_command(int cmd, int sub)
 {
 	return intel_scu_ipc_dev_simple_command(NULL, cmd, sub);
 }
 
-static inline int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
-					u32 *out, int outlen)
-{
-	/* New API takes both inlen and outlen as bytes so convert here */
-	size_t inbytes = inlen * sizeof(u32);
-	size_t outbytes = outlen * sizeof(u32);
-
-	return intel_scu_ipc_dev_command_with_size(NULL, cmd, sub, in, inbytes,
-						   inlen, out, outbytes);
-}
-
-extern struct blocking_notifier_head intel_scu_notifier;
-
-static inline void intel_scu_notifier_add(struct notifier_block *nb)
-{
-	blocking_notifier_chain_register(&intel_scu_notifier, nb);
-}
-
-static inline void intel_scu_notifier_remove(struct notifier_block *nb)
-{
-	blocking_notifier_chain_unregister(&intel_scu_notifier, nb);
-}
-
-static inline int intel_scu_notifier_post(unsigned long v, void *p)
-{
-	return blocking_notifier_call_chain(&intel_scu_notifier, v, p);
-}
-
-#define		SCU_AVAILABLE		1
-#define		SCU_DOWN		2
-
 #endif
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
index 16b9f220bdeb..40f92270515b 100644
--- a/arch/x86/include/asm/platform_sst_audio.h
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -10,8 +10,6 @@
 #ifndef _PLATFORM_SST_AUDIO_H_
 #define _PLATFORM_SST_AUDIO_H_
 
-#include <linux/sfi.h>
-
 #define MAX_NUM_STREAMS_MRFLD	25
 #define MAX_NUM_STREAMS	MAX_NUM_STREAMS_MRFLD
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e4ab4804b20d..c3b60c37c728 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -198,7 +198,7 @@ static int __init parse_noapic(char *str)
 }
 early_param("noapic", parse_noapic);
 
-/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
+/* Will be called in mpparse/ACPI codes for saving IRQ info */
 void mp_save_irq(struct mpc_intsrc *m)
 {
 	int i;
@@ -2863,7 +2863,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
 
 	/*
 	 * If mp_register_ioapic() is called during early boot stage when
-	 * walking ACPI/SFI/DT tables, it's too early to create irqdomain,
+	 * walking ACPI/DT tables, it's too early to create irqdomain,
 	 * we are still using bootmem allocator. So delay it to setup_IO_APIC().
 	 */
 	if (hotplug) {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 740f3bdb3f61..d883176ef2ce 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -16,7 +16,6 @@
 #include <linux/memblock.h>
 #include <linux/pci.h>
 #include <linux/root_dev.h>
-#include <linux/sfi.h>
 #include <linux/hugetlb.h>
 #include <linux/tboot.h>
 #include <linux/usb/xhci-dbgp.h>
@@ -1185,7 +1184,6 @@ void __init setup_arch(char **cmdline_p)
 	 * Read APIC and some other early information from ACPI tables.
 	 */
 	acpi_boot_init();
-	sfi_init();
 	x86_dtb_init();
 
 	/*
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 234998f196d4..de6bf0e7e8f8 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -11,9 +11,9 @@
  * themselves.
  */
 
+#include <linux/acpi.h>
 #include <linux/pci.h>
 #include <linux/init.h>
-#include <linux/sfi_acpi.h>
 #include <linux/bitmap.h>
 #include <linux/dmi.h>
 #include <linux/slab.h>
@@ -665,7 +665,7 @@ void __init pci_mmcfg_early_init(void)
 		if (pci_mmcfg_check_hostbridge())
 			known_bridge = 1;
 		else
-			acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+			acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
 		__pci_mmcfg_init(1);
 
 		set_apei_filter();
@@ -683,7 +683,7 @@ void __init pci_mmcfg_late_init(void)
 
 	/* MMCONFIG hasn't been enabled yet, try again */
 	if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
-		acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+		acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
 		__pci_mmcfg_init(0);
 	}
 }
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index d0e835470d01..d1f5228225d9 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -11,6 +11,5 @@ obj-y	+= intel-mid/
 obj-y	+= intel-quark/
 obj-y	+= olpc/
 obj-y	+= scx200/
-obj-y	+= sfi/
 obj-y	+= ts5500/
 obj-y	+= uv/
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
index 5794e661050c..ddfc08783fb8 100644
--- a/arch/x86/platform/intel-mid/Makefile
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -1,7 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o pwr.o
-
-# SFI specific code
-ifdef CONFIG_X86_INTEL_MID
-obj-$(CONFIG_SFI) += sfi.o device_libs/
-endif
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
deleted file mode 100644
index 4d008b053ac8..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Family-Level Interface Shim (FLIS)
-obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o
-# SDHCI Devices
-obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o
-# WiFi + BT
-obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o
-obj-$(subst m,y,$(CONFIG_BT_HCIUART_BCM)) += platform_bt.o
-# SPI Devices
-obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o
-# I2C Devices
-obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o
-obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o
-obj-$(subst m,y,$(CONFIG_MPU3050_I2C)) += platform_mpu3050.o
-obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o
-obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o
-# I2C GPIO Expanders
-obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_max7315.o
-obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o
-obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o
-# MISC Devices
-obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o
-obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
deleted file mode 100644
index 564c47c53f3a..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_bcm43xx.c: bcm43xx platform data initialization file
- *
- * (C) Copyright 2016 Intel Corporation
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/gpio/machine.h>
-#include <linux/platform_device.h>
-#include <linux/regulator/machine.h>
-#include <linux/regulator/fixed.h>
-#include <linux/sfi.h>
-
-#include <asm/intel-mid.h>
-
-#define WLAN_SFI_GPIO_IRQ_NAME		"WLAN-interrupt"
-#define WLAN_SFI_GPIO_ENABLE_NAME	"WLAN-enable"
-
-#define WLAN_DEV_NAME			"0000:00:01.3"
-
-static struct regulator_consumer_supply bcm43xx_vmmc_supply = {
-	.dev_name		= WLAN_DEV_NAME,
-	.supply			= "vmmc",
-};
-
-static struct regulator_init_data bcm43xx_vmmc_data = {
-	.constraints = {
-		.valid_ops_mask		= REGULATOR_CHANGE_STATUS,
-	},
-	.num_consumer_supplies	= 1,
-	.consumer_supplies	= &bcm43xx_vmmc_supply,
-};
-
-static struct fixed_voltage_config bcm43xx_vmmc = {
-	.supply_name		= "bcm43xx-vmmc-regulator",
-	/*
-	 * Announce 2.0V here to be compatible with SDIO specification. The
-	 * real voltage and signaling are still 1.8V.
-	 */
-	.microvolts		= 2000000,		/* 1.8V */
-	.startup_delay		= 250 * 1000,		/* 250ms */
-	.enabled_at_boot	= 0,			/* disabled at boot */
-	.init_data		= &bcm43xx_vmmc_data,
-};
-
-static struct platform_device bcm43xx_vmmc_regulator = {
-	.name		= "reg-fixed-voltage",
-	.id		= PLATFORM_DEVID_AUTO,
-	.dev = {
-		.platform_data	= &bcm43xx_vmmc,
-	},
-};
-
-static struct gpiod_lookup_table bcm43xx_vmmc_gpio_table = {
-	.dev_id	= "reg-fixed-voltage.0",
-	.table	= {
-		GPIO_LOOKUP("0000:00:0c.0", -1, NULL, GPIO_ACTIVE_LOW),
-		{}
-	},
-};
-
-static int __init bcm43xx_regulator_register(void)
-{
-	struct gpiod_lookup_table *table = &bcm43xx_vmmc_gpio_table;
-	struct gpiod_lookup *lookup = table->table;
-	int ret;
-
-	lookup[0].chip_hwnum = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME);
-	gpiod_add_lookup_table(table);
-
-	ret = platform_device_register(&bcm43xx_vmmc_regulator);
-	if (ret) {
-		pr_err("%s: vmmc regulator register failed\n", __func__);
-		return ret;
-	}
-
-	return 0;
-}
-
-static void __init *bcm43xx_platform_data(void *info)
-{
-	int ret;
-
-	ret = bcm43xx_regulator_register();
-	if (ret)
-		return NULL;
-
-	pr_info("Using generic wifi platform data\n");
-
-	/* For now it's empty */
-	return NULL;
-}
-
-static const struct devs_id bcm43xx_clk_vmmc_dev_id __initconst = {
-	.name			= "bcm43xx_clk_vmmc",
-	.type			= SFI_DEV_TYPE_SD,
-	.get_platform_data	= &bcm43xx_platform_data,
-};
-
-sfi_device(bcm43xx_clk_vmmc_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
deleted file mode 100644
index 32912a17f68e..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_bma023.c: bma023 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- */
-
-#include <asm/intel-mid.h>
-
-static const struct devs_id bma023_dev_id __initconst = {
-	.name = "bma023",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-};
-
-sfi_device(bma023_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
deleted file mode 100644
index 31dda18bb370..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Bluetooth platform data initialization file
- *
- * (C) Copyright 2017 Intel Corporation
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/gpio/machine.h>
-#include <linux/pci.h>
-#include <linux/platform_device.h>
-
-#include <asm/cpu_device_id.h>
-#include <asm/intel-family.h>
-#include <asm/intel-mid.h>
-
-struct bt_sfi_data {
-	struct device *dev;
-	const char *name;
-	int (*setup)(struct bt_sfi_data *ddata);
-};
-
-static struct gpiod_lookup_table tng_bt_sfi_gpio_table = {
-	.dev_id	= "hci_bcm",
-	.table	= {
-		GPIO_LOOKUP("0000:00:0c.0", -1, "device-wakeup", GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP("0000:00:0c.0", -1, "shutdown",      GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP("0000:00:0c.0", -1, "host-wakeup",   GPIO_ACTIVE_HIGH),
-		{ },
-	},
-};
-
-#define TNG_BT_SFI_GPIO_DEVICE_WAKEUP	"bt_wakeup"
-#define TNG_BT_SFI_GPIO_SHUTDOWN	"BT-reset"
-#define TNG_BT_SFI_GPIO_HOST_WAKEUP	"bt_uart_enable"
-
-static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata)
-{
-	struct gpiod_lookup_table *table = &tng_bt_sfi_gpio_table;
-	struct gpiod_lookup *lookup = table->table;
-	struct pci_dev *pdev;
-
-	/* Connected to /dev/ttyS0 */
-	pdev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(4, 1));
-	if (!pdev)
-		return -ENODEV;
-
-	ddata->dev = &pdev->dev;
-	ddata->name = table->dev_id;
-
-	lookup[0].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_DEVICE_WAKEUP);
-	lookup[1].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_SHUTDOWN);
-	lookup[2].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_HOST_WAKEUP);
-
-	gpiod_add_lookup_table(table);
-	return 0;
-}
-
-static struct bt_sfi_data tng_bt_sfi_data __initdata = {
-	.setup	= tng_bt_sfi_setup,
-};
-
-static const struct x86_cpu_id bt_sfi_cpu_ids[] = {
-	X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID,	&tng_bt_sfi_data),
-	{}
-};
-
-static int __init bt_sfi_init(void)
-{
-	struct platform_device_info info;
-	struct platform_device *pdev;
-	const struct x86_cpu_id *id;
-	struct bt_sfi_data *ddata;
-	int ret;
-
-	id = x86_match_cpu(bt_sfi_cpu_ids);
-	if (!id)
-		return -ENODEV;
-
-	ddata = (struct bt_sfi_data *)id->driver_data;
-	if (!ddata)
-		return -ENODEV;
-
-	ret = ddata->setup(ddata);
-	if (ret)
-		return ret;
-
-	memset(&info, 0, sizeof(info));
-	info.fwnode	= ddata->dev->fwnode;
-	info.parent	= ddata->dev;
-	info.name	= ddata->name,
-	info.id		= PLATFORM_DEVID_NONE,
-
-	pdev = platform_device_register_full(&info);
-	if (IS_ERR(pdev))
-		return PTR_ERR(pdev);
-
-	dev_info(ddata->dev, "Registered Bluetooth device: %s\n", ddata->name);
-	return 0;
-}
-device_initcall(bt_sfi_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
deleted file mode 100644
index a2508582a0b1..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_emc1403.c: emc1403 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/gpio.h>
-#include <linux/i2c.h>
-#include <asm/intel-mid.h>
-
-static void __init *emc1403_platform_data(void *info)
-{
-	static short intr2nd_pdata;
-	struct i2c_board_info *i2c_info = info;
-	int intr = get_gpio_by_name("thermal_int");
-	int intr2nd = get_gpio_by_name("thermal_alert");
-
-	if (intr < 0)
-		return NULL;
-	if (intr2nd < 0)
-		return NULL;
-
-	i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-	intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET;
-
-	return &intr2nd_pdata;
-}
-
-static const struct devs_id emc1403_dev_id __initconst = {
-	.name = "emc1403",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-	.get_platform_data = &emc1403_platform_data,
-};
-
-sfi_device(emc1403_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
deleted file mode 100644
index d9435d2196a4..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_gpio_keys.c: gpio_keys platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/input.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/gpio.h>
-#include <linux/gpio_keys.h>
-#include <linux/platform_device.h>
-#include <asm/intel-mid.h>
-
-#define DEVICE_NAME "gpio-keys"
-
-/*
- * we will search these buttons in SFI GPIO table (by name)
- * and register them dynamically. Please add all possible
- * buttons here, we will shrink them if no GPIO found.
- */
-static struct gpio_keys_button gpio_button[] = {
-	{KEY_POWER,		-1, 1, "power_btn",	EV_KEY, 0, 3000},
-	{KEY_PROG1,		-1, 1, "prog_btn1",	EV_KEY, 0, 20},
-	{KEY_PROG2,		-1, 1, "prog_btn2",	EV_KEY, 0, 20},
-	{SW_LID,		-1, 1, "lid_switch",	EV_SW,  0, 20},
-	{KEY_VOLUMEUP,		-1, 1, "vol_up",	EV_KEY, 0, 20},
-	{KEY_VOLUMEDOWN,	-1, 1, "vol_down",	EV_KEY, 0, 20},
-	{KEY_MUTE,		-1, 1, "mute_enable",	EV_KEY, 0, 20},
-	{KEY_VOLUMEUP,		-1, 1, "volume_up",	EV_KEY, 0, 20},
-	{KEY_VOLUMEDOWN,	-1, 1, "volume_down",	EV_KEY, 0, 20},
-	{KEY_CAMERA,		-1, 1, "camera_full",	EV_KEY, 0, 20},
-	{KEY_CAMERA_FOCUS,	-1, 1, "camera_half",	EV_KEY, 0, 20},
-	{SW_KEYPAD_SLIDE,	-1, 1, "MagSw1",	EV_SW,  0, 20},
-	{SW_KEYPAD_SLIDE,	-1, 1, "MagSw2",	EV_SW,  0, 20},
-};
-
-static struct gpio_keys_platform_data gpio_keys = {
-	.buttons	= gpio_button,
-	.rep		= 1,
-	.nbuttons	= -1, /* will fill it after search */
-};
-
-static struct platform_device pb_device = {
-	.name		= DEVICE_NAME,
-	.id		= -1,
-	.dev		= {
-		.platform_data	= &gpio_keys,
-	},
-};
-
-/*
- * Shrink the non-existent buttons, register the gpio button
- * device if there is some
- */
-static int __init pb_keys_init(void)
-{
-	struct gpio_keys_button *gb = gpio_button;
-	int i, good = 0;
-
-	for (i = 0; i < ARRAY_SIZE(gpio_button); i++) {
-		gb[i].gpio = get_gpio_by_name(gb[i].desc);
-		pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc,
-					gb[i].gpio);
-		if (gb[i].gpio < 0)
-			continue;
-
-		if (i != good)
-			gb[good] = gb[i];
-		good++;
-	}
-
-	if (good) {
-		gpio_keys.nbuttons = good;
-		return platform_device_register(&pb_device);
-	}
-	return 0;
-}
-late_initcall(pb_keys_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
deleted file mode 100644
index a4485cd638c6..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_lis331.c:  lis331 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/i2c.h>
-#include <linux/gpio.h>
-#include <asm/intel-mid.h>
-
-static void __init *lis331dl_platform_data(void *info)
-{
-	static short intr2nd_pdata;
-	struct i2c_board_info *i2c_info = info;
-	int intr = get_gpio_by_name("accel_int");
-	int intr2nd = get_gpio_by_name("accel_2");
-
-	if (intr < 0)
-		return NULL;
-	if (intr2nd < 0)
-		return NULL;
-
-	i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-	intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET;
-
-	return &intr2nd_pdata;
-}
-
-static const struct devs_id lis331dl_dev_id __initconst = {
-	.name = "i2c_accel",
-	.type = SFI_DEV_TYPE_I2C,
-	.get_platform_data = &lis331dl_platform_data,
-};
-
-sfi_device(lis331dl_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
deleted file mode 100644
index e9287c3184da..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_max7315.c: max7315 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/gpio.h>
-#include <linux/i2c.h>
-#include <linux/platform_data/pca953x.h>
-#include <asm/intel-mid.h>
-
-#define MAX7315_NUM 2
-
-static void __init *max7315_platform_data(void *info)
-{
-	static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
-	static int nr;
-	struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
-	struct i2c_board_info *i2c_info = info;
-	int gpio_base, intr;
-	char base_pin_name[SFI_NAME_LEN + 1];
-	char intr_pin_name[SFI_NAME_LEN + 1];
-
-	if (nr == MAX7315_NUM) {
-		pr_err("too many max7315s, we only support %d\n",
-				MAX7315_NUM);
-		return NULL;
-	}
-	/* we have several max7315 on the board, we only need load several
-	 * instances of the same pca953x driver to cover them
-	 */
-	strcpy(i2c_info->type, "max7315");
-	if (nr++) {
-		snprintf(base_pin_name, sizeof(base_pin_name),
-			 "max7315_%d_base", nr);
-		snprintf(intr_pin_name, sizeof(intr_pin_name),
-			 "max7315_%d_int", nr);
-	} else {
-		strcpy(base_pin_name, "max7315_base");
-		strcpy(intr_pin_name, "max7315_int");
-	}
-
-	gpio_base = get_gpio_by_name(base_pin_name);
-	intr = get_gpio_by_name(intr_pin_name);
-
-	if (gpio_base < 0)
-		return NULL;
-	max7315->gpio_base = gpio_base;
-	if (intr != -1) {
-		i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-		max7315->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
-	} else {
-		i2c_info->irq = -1;
-		max7315->irq_base = -1;
-	}
-	return max7315;
-}
-
-static const struct devs_id max7315_dev_id __initconst = {
-	.name = "i2c_max7315",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-	.get_platform_data = &max7315_platform_data,
-};
-
-static const struct devs_id max7315_2_dev_id __initconst = {
-	.name = "i2c_max7315_2",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-	.get_platform_data = &max7315_platform_data,
-};
-
-sfi_device(max7315_dev_id);
-sfi_device(max7315_2_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
deleted file mode 100644
index 28a182713934..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_mpu3050.c: mpu3050 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/gpio.h>
-#include <linux/i2c.h>
-#include <asm/intel-mid.h>
-
-static void *mpu3050_platform_data(void *info)
-{
-	struct i2c_board_info *i2c_info = info;
-	int intr = get_gpio_by_name("mpu3050_int");
-
-	if (intr < 0)
-		return NULL;
-
-	i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-	return NULL;
-}
-
-static const struct devs_id mpu3050_dev_id __initconst = {
-	.name = "mpu3050",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-	.get_platform_data = &mpu3050_platform_data,
-};
-
-sfi_device(mpu3050_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c
deleted file mode 100644
index 605e1f94ad89..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Merrifield FLIS platform device initialization file
- *
- * Copyright (C) 2016, Intel Corporation
- *
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/platform_device.h>
-
-#include <asm/intel-mid.h>
-
-#define FLIS_BASE_ADDR			0xff0c0000
-#define FLIS_LENGTH			0x8000
-
-static struct resource mrfld_pinctrl_mmio_resource = {
-	.start		= FLIS_BASE_ADDR,
-	.end		= FLIS_BASE_ADDR + FLIS_LENGTH - 1,
-	.flags		= IORESOURCE_MEM,
-};
-
-static struct platform_device mrfld_pinctrl_device = {
-	.name		= "pinctrl-merrifield",
-	.id		= PLATFORM_DEVID_NONE,
-	.resource	= &mrfld_pinctrl_mmio_resource,
-	.num_resources	= 1,
-};
-
-static int __init mrfld_pinctrl_init(void)
-{
-	if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
-		return platform_device_register(&mrfld_pinctrl_device);
-
-	return -ENODEV;
-}
-arch_initcall(mrfld_pinctrl_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c
deleted file mode 100644
index 40e9808a9634..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Merrifield legacy RTC initialization file
- *
- * (C) Copyright 2017 Intel Corporation
- *
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/init.h>
-
-#include <asm/hw_irq.h>
-#include <asm/intel-mid.h>
-#include <asm/io_apic.h>
-#include <asm/time.h>
-#include <asm/x86_init.h>
-
-static int __init mrfld_legacy_rtc_alloc_irq(void)
-{
-	struct irq_alloc_info info;
-	int ret;
-
-	if (!x86_platform.legacy.rtc)
-		return -ENODEV;
-
-	ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 0);
-	ret = mp_map_gsi_to_irq(RTC_IRQ, IOAPIC_MAP_ALLOC, &info);
-	if (ret < 0) {
-		pr_info("Failed to allocate RTC interrupt. Disabling RTC\n");
-		x86_platform.legacy.rtc = 0;
-		return ret;
-	}
-
-	return 0;
-}
-
-static int __init mrfld_legacy_rtc_init(void)
-{
-	if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
-		return -ENODEV;
-
-	return mrfld_legacy_rtc_alloc_irq();
-}
-arch_initcall(mrfld_legacy_rtc_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
deleted file mode 100644
index fe3b7ff975f3..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SDHCI platform data initilisation file
- *
- * (C) Copyright 2016 Intel Corporation
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/pci.h>
-
-#include <linux/mmc/sdhci-pci-data.h>
-
-#include <asm/intel-mid.h>
-
-#define INTEL_MRFLD_SD			2
-#define INTEL_MRFLD_SD_CD_GPIO		77
-
-static struct sdhci_pci_data mrfld_sdhci_pci_data = {
-	.rst_n_gpio	= -EINVAL,
-	.cd_gpio	= INTEL_MRFLD_SD_CD_GPIO,
-};
-
-static struct sdhci_pci_data *
-mrfld_sdhci_pci_get_data(struct pci_dev *pdev, int slotno)
-{
-	unsigned int func = PCI_FUNC(pdev->devfn);
-
-	if (func == INTEL_MRFLD_SD)
-		return &mrfld_sdhci_pci_data;
-
-	return NULL;
-}
-
-static int __init mrfld_sd_init(void)
-{
-	if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
-		return -ENODEV;
-
-	sdhci_pci_get_data = mrfld_sdhci_pci_get_data;
-	return 0;
-}
-arch_initcall(mrfld_sd_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c
deleted file mode 100644
index b828f4fd40be..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * spidev platform data initialization file
- *
- * (C) Copyright 2014, 2016 Intel Corporation
- * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- *	    Dan O'Donovan <dan@emutex.com>
- */
-
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/spi/pxa2xx_spi.h>
-#include <linux/spi/spi.h>
-
-#include <asm/intel-mid.h>
-
-#define MRFLD_SPI_DEFAULT_DMA_BURST	8
-#define MRFLD_SPI_DEFAULT_TIMEOUT	500
-
-/* GPIO pin for spidev chipselect */
-#define MRFLD_SPIDEV_GPIO_CS		111
-
-static struct pxa2xx_spi_chip spidev_spi_chip = {
-	.dma_burst_size		= MRFLD_SPI_DEFAULT_DMA_BURST,
-	.timeout		= MRFLD_SPI_DEFAULT_TIMEOUT,
-	.gpio_cs		= MRFLD_SPIDEV_GPIO_CS,
-};
-
-static void __init *spidev_platform_data(void *info)
-{
-	struct spi_board_info *spi_info = info;
-
-	if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
-		return ERR_PTR(-ENODEV);
-
-	spi_info->mode = SPI_MODE_0;
-	spi_info->controller_data = &spidev_spi_chip;
-
-	return NULL;
-}
-
-static const struct devs_id spidev_dev_id __initconst = {
-	.name			= "spidev",
-	.type			= SFI_DEV_TYPE_SPI,
-	.delay			= 0,
-	.get_platform_data	= &spidev_platform_data,
-};
-
-sfi_device(spidev_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c b/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c
deleted file mode 100644
index 5609d8da3978..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c
+++ /dev/null
@@ -1,95 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * PCAL9555a platform data initialization file
- *
- * Copyright (C) 2016, Intel Corporation
- *
- * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- *	    Dan O'Donovan <dan@emutex.com>
- */
-
-#include <linux/gpio.h>
-#include <linux/init.h>
-#include <linux/i2c.h>
-#include <linux/platform_data/pca953x.h>
-#include <linux/sfi.h>
-
-#include <asm/intel-mid.h>
-
-#define PCAL9555A_NUM	4
-
-static struct pca953x_platform_data pcal9555a_pdata[PCAL9555A_NUM];
-static int nr;
-
-static void __init *pcal9555a_platform_data(void *info)
-{
-	struct i2c_board_info *i2c_info = info;
-	char *type = i2c_info->type;
-	struct pca953x_platform_data *pcal9555a;
-	char base_pin_name[SFI_NAME_LEN + 1];
-	char intr_pin_name[SFI_NAME_LEN + 1];
-	int gpio_base, intr;
-
-	snprintf(base_pin_name, sizeof(base_pin_name), "%s_base", type);
-	snprintf(intr_pin_name, sizeof(intr_pin_name), "%s_int", type);
-
-	gpio_base = get_gpio_by_name(base_pin_name);
-	intr = get_gpio_by_name(intr_pin_name);
-
-	/* Check if the SFI record valid */
-	if (gpio_base == -1)
-		return NULL;
-
-	if (nr >= PCAL9555A_NUM) {
-		pr_err("%s: Too many instances, only %d supported\n", __func__,
-		       PCAL9555A_NUM);
-		return NULL;
-	}
-
-	pcal9555a = &pcal9555a_pdata[nr++];
-	pcal9555a->gpio_base = gpio_base;
-
-	if (intr >= 0) {
-		i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-		pcal9555a->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
-	} else {
-		i2c_info->irq = -1;
-		pcal9555a->irq_base = -1;
-	}
-
-	strcpy(type, "pcal9555a");
-	return pcal9555a;
-}
-
-static const struct devs_id pcal9555a_1_dev_id __initconst = {
-	.name			= "pcal9555a-1",
-	.type			= SFI_DEV_TYPE_I2C,
-	.delay			= 1,
-	.get_platform_data	= &pcal9555a_platform_data,
-};
-
-static const struct devs_id pcal9555a_2_dev_id __initconst = {
-	.name			= "pcal9555a-2",
-	.type			= SFI_DEV_TYPE_I2C,
-	.delay			= 1,
-	.get_platform_data	= &pcal9555a_platform_data,
-};
-
-static const struct devs_id pcal9555a_3_dev_id __initconst = {
-	.name			= "pcal9555a-3",
-	.type			= SFI_DEV_TYPE_I2C,
-	.delay			= 1,
-	.get_platform_data	= &pcal9555a_platform_data,
-};
-
-static const struct devs_id pcal9555a_4_dev_id __initconst = {
-	.name			= "pcal9555a-4",
-	.type			= SFI_DEV_TYPE_I2C,
-	.delay			= 1,
-	.get_platform_data	= &pcal9555a_platform_data,
-};
-
-sfi_device(pcal9555a_1_dev_id);
-sfi_device(pcal9555a_2_dev_id);
-sfi_device(pcal9555a_3_dev_id);
-sfi_device(pcal9555a_4_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
deleted file mode 100644
index 139738bbdd36..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_tc35876x.c: tc35876x platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/gpio/machine.h>
-#include <asm/intel-mid.h>
-
-static struct gpiod_lookup_table tc35876x_gpio_table = {
-	.dev_id	= "i2c_disp_brig",
-	.table	= {
-		GPIO_LOOKUP("0000:00:0c.0", -1, "bridge-reset", GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP("0000:00:0c.0", -1, "bl-en", GPIO_ACTIVE_HIGH),
-		GPIO_LOOKUP("0000:00:0c.0", -1, "vadd", GPIO_ACTIVE_HIGH),
-		{ },
-	},
-};
-
-/*tc35876x DSI_LVDS bridge chip and panel platform data*/
-static void *tc35876x_platform_data(void *data)
-{
-	struct gpiod_lookup_table *table = &tc35876x_gpio_table;
-	struct gpiod_lookup *lookup = table->table;
-
-	lookup[0].chip_hwnum = get_gpio_by_name("LCMB_RXEN");
-	lookup[1].chip_hwnum = get_gpio_by_name("6S6P_BL_EN");
-	lookup[2].chip_hwnum = get_gpio_by_name("EN_VREG_LCD_V3P3");
-	gpiod_add_lookup_table(table);
-
-	return NULL;
-}
-
-static const struct devs_id tc35876x_dev_id __initconst = {
-	.name = "i2c_disp_brig",
-	.type = SFI_DEV_TYPE_I2C,
-	.get_platform_data = &tc35876x_platform_data,
-};
-
-sfi_device(tc35876x_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
deleted file mode 100644
index e689d8f61059..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_tca6416.c: tca6416 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/platform_data/pca953x.h>
-#include <linux/i2c.h>
-#include <linux/gpio.h>
-#include <asm/intel-mid.h>
-
-#define TCA6416_NAME	"tca6416"
-#define TCA6416_BASE	"tca6416_base"
-#define TCA6416_INTR	"tca6416_int"
-
-static void *tca6416_platform_data(void *info)
-{
-	static struct pca953x_platform_data tca6416;
-	struct i2c_board_info *i2c_info = info;
-	int gpio_base, intr;
-	char base_pin_name[SFI_NAME_LEN + 1];
-	char intr_pin_name[SFI_NAME_LEN + 1];
-
-	strcpy(i2c_info->type, TCA6416_NAME);
-	strcpy(base_pin_name, TCA6416_BASE);
-	strcpy(intr_pin_name, TCA6416_INTR);
-
-	gpio_base = get_gpio_by_name(base_pin_name);
-	intr = get_gpio_by_name(intr_pin_name);
-
-	if (gpio_base < 0)
-		return NULL;
-	tca6416.gpio_base = gpio_base;
-	if (intr >= 0) {
-		i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
-		tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
-	} else {
-		i2c_info->irq = -1;
-		tca6416.irq_base = -1;
-	}
-	return &tca6416;
-}
-
-static const struct devs_id tca6416_dev_id __initconst = {
-	.name = "tca6416",
-	.type = SFI_DEV_TYPE_I2C,
-	.delay = 1,
-	.get_platform_data = &tca6416_platform_data,
-};
-
-sfi_device(tca6416_dev_id);
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 864b0c158b2f..2c044e260b4b 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -14,7 +14,6 @@
 #include <linux/interrupt.h>
 #include <linux/regulator/machine.h>
 #include <linux/scatterlist.h>
-#include <linux/sfi.h>
 #include <linux/irq.h>
 #include <linux/export.h>
 #include <linux/notifier.h>
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
deleted file mode 100644
index 63ae342ffb12..000000000000
--- a/arch/x86/platform/intel-mid/sfi.c
+++ /dev/null
@@ -1,419 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * intel_mid_sfi.c: Intel MID SFI initialization code
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/scatterlist.h>
-#include <linux/sfi.h>
-#include <linux/spi/spi.h>
-#include <linux/i2c.h>
-#include <linux/skbuff.h>
-#include <linux/gpio.h>
-#include <linux/gpio_keys.h>
-#include <linux/input.h>
-#include <linux/platform_device.h>
-#include <linux/irq.h>
-#include <linux/export.h>
-#include <linux/notifier.h>
-#include <linux/mmc/core.h>
-#include <linux/mmc/card.h>
-#include <linux/blkdev.h>
-
-#include <asm/setup.h>
-#include <asm/mpspec_def.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/io_apic.h>
-#include <asm/intel-mid.h>
-#include <asm/io.h>
-#include <asm/i8259.h>
-#include <asm/intel_scu_ipc.h>
-#include <asm/reboot.h>
-
-#define	SFI_SIG_OEM0	"OEM0"
-#define MAX_IPCDEVS	24
-#define MAX_SCU_SPI	24
-#define MAX_SCU_I2C	24
-
-static struct platform_device *ipc_devs[MAX_IPCDEVS];
-static struct spi_board_info *spi_devs[MAX_SCU_SPI];
-static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
-static struct sfi_gpio_table_entry *gpio_table;
-static int ipc_next_dev;
-static int spi_next_dev;
-static int i2c_next_dev;
-static int i2c_bus[MAX_SCU_I2C];
-static int gpio_num_entry;
-
-struct blocking_notifier_head intel_scu_notifier =
-			BLOCKING_NOTIFIER_INIT(intel_scu_notifier);
-EXPORT_SYMBOL_GPL(intel_scu_notifier);
-
-#define intel_mid_sfi_get_pdata(dev, priv)	\
-	((dev)->get_platform_data ? (dev)->get_platform_data(priv) : NULL)
-
-/*
- * Parsing GPIO table first, since the DEVS table will need this table
- * to map the pin name to the actual pin.
- */
-static int __init sfi_parse_gpio(struct sfi_table_header *table)
-{
-	struct sfi_table_simple *sb;
-	struct sfi_gpio_table_entry *pentry;
-	int num, i;
-
-	if (gpio_table)
-		return 0;
-	sb = (struct sfi_table_simple *)table;
-	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
-	pentry = (struct sfi_gpio_table_entry *)sb->pentry;
-
-	gpio_table = kmemdup(pentry, num * sizeof(*pentry), GFP_KERNEL);
-	if (!gpio_table)
-		return -1;
-	gpio_num_entry = num;
-
-	pr_debug("GPIO pin info:\n");
-	for (i = 0; i < num; i++, pentry++)
-		pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
-		" pin = %d\n", i,
-			pentry->controller_name,
-			pentry->pin_name,
-			pentry->pin_no);
-	return 0;
-}
-
-int get_gpio_by_name(const char *name)
-{
-	struct sfi_gpio_table_entry *pentry = gpio_table;
-	int i;
-
-	if (!pentry)
-		return -1;
-	for (i = 0; i < gpio_num_entry; i++, pentry++) {
-		if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
-			return pentry->pin_no;
-	}
-	return -EINVAL;
-}
-
-static void __init intel_scu_ipc_device_register(struct platform_device *pdev)
-{
-	if (ipc_next_dev == MAX_IPCDEVS)
-		pr_err("too many SCU IPC devices");
-	else
-		ipc_devs[ipc_next_dev++] = pdev;
-}
-
-static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
-{
-	struct spi_board_info *new_dev;
-
-	if (spi_next_dev == MAX_SCU_SPI) {
-		pr_err("too many SCU SPI devices");
-		return;
-	}
-
-	new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
-	if (!new_dev) {
-		pr_err("failed to alloc mem for delayed spi dev %s\n",
-			sdev->modalias);
-		return;
-	}
-	*new_dev = *sdev;
-
-	spi_devs[spi_next_dev++] = new_dev;
-}
-
-static void __init intel_scu_i2c_device_register(int bus,
-						struct i2c_board_info *idev)
-{
-	struct i2c_board_info *new_dev;
-
-	if (i2c_next_dev == MAX_SCU_I2C) {
-		pr_err("too many SCU I2C devices");
-		return;
-	}
-
-	new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
-	if (!new_dev) {
-		pr_err("failed to alloc mem for delayed i2c dev %s\n",
-			idev->type);
-		return;
-	}
-	*new_dev = *idev;
-
-	i2c_bus[i2c_next_dev] = bus;
-	i2c_devs[i2c_next_dev++] = new_dev;
-}
-
-/* Called by IPC driver */
-void intel_scu_devices_create(void)
-{
-	int i;
-
-	for (i = 0; i < ipc_next_dev; i++)
-		platform_device_add(ipc_devs[i]);
-
-	for (i = 0; i < spi_next_dev; i++)
-		spi_register_board_info(spi_devs[i], 1);
-
-	for (i = 0; i < i2c_next_dev; i++) {
-		struct i2c_adapter *adapter;
-		struct i2c_client *client;
-
-		adapter = i2c_get_adapter(i2c_bus[i]);
-		if (adapter) {
-			client = i2c_new_client_device(adapter, i2c_devs[i]);
-			if (IS_ERR(client))
-				pr_err("can't create i2c device %s\n",
-					i2c_devs[i]->type);
-		} else
-			i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
-	}
-	intel_scu_notifier_post(SCU_AVAILABLE, NULL);
-}
-EXPORT_SYMBOL_GPL(intel_scu_devices_create);
-
-/* Called by IPC driver */
-void intel_scu_devices_destroy(void)
-{
-	int i;
-
-	intel_scu_notifier_post(SCU_DOWN, NULL);
-
-	for (i = 0; i < ipc_next_dev; i++)
-		platform_device_del(ipc_devs[i]);
-}
-EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
-
-static void __init install_irq_resource(struct platform_device *pdev, int irq)
-{
-	/* Single threaded */
-	static struct resource res __initdata = {
-		.name = "IRQ",
-		.flags = IORESOURCE_IRQ,
-	};
-	res.start = irq;
-	platform_device_add_resources(pdev, &res, 1);
-}
-
-static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry,
-					struct devs_id *dev)
-{
-	struct platform_device *pdev;
-	void *pdata = NULL;
-
-	pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n",
-		pentry->name, pentry->irq);
-
-	/*
-	 * We need to call platform init of IPC devices to fill misc_pdata
-	 * structure. It will be used in msic_init for initialization.
-	 */
-	pdata = intel_mid_sfi_get_pdata(dev, pentry);
-	if (IS_ERR(pdata))
-		return;
-
-	pdev = platform_device_alloc(pentry->name, 0);
-	if (pdev == NULL) {
-		pr_err("out of memory for SFI platform device '%s'.\n",
-			pentry->name);
-		return;
-	}
-	install_irq_resource(pdev, pentry->irq);
-
-	pdev->dev.platform_data = pdata;
-	if (dev->delay)
-		intel_scu_ipc_device_register(pdev);
-	else
-		platform_device_add(pdev);
-}
-
-static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry,
-					struct devs_id *dev)
-{
-	struct spi_board_info spi_info;
-	void *pdata = NULL;
-
-	memset(&spi_info, 0, sizeof(spi_info));
-	strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
-	spi_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq);
-	spi_info.bus_num = pentry->host_num;
-	spi_info.chip_select = pentry->addr;
-	spi_info.max_speed_hz = pentry->max_freq;
-	pr_debug("SPI bus=%d, name=%16.16s, irq=0x%2x, max_freq=%d, cs=%d\n",
-		spi_info.bus_num,
-		spi_info.modalias,
-		spi_info.irq,
-		spi_info.max_speed_hz,
-		spi_info.chip_select);
-
-	pdata = intel_mid_sfi_get_pdata(dev, &spi_info);
-	if (IS_ERR(pdata))
-		return;
-
-	spi_info.platform_data = pdata;
-	if (dev->delay)
-		intel_scu_spi_device_register(&spi_info);
-	else
-		spi_register_board_info(&spi_info, 1);
-}
-
-static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry,
-					struct devs_id *dev)
-{
-	struct i2c_board_info i2c_info;
-	void *pdata = NULL;
-
-	memset(&i2c_info, 0, sizeof(i2c_info));
-	strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
-	i2c_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq);
-	i2c_info.addr = pentry->addr;
-	pr_debug("I2C bus = %d, name = %16.16s, irq = 0x%2x, addr = 0x%x\n",
-		pentry->host_num,
-		i2c_info.type,
-		i2c_info.irq,
-		i2c_info.addr);
-	pdata = intel_mid_sfi_get_pdata(dev, &i2c_info);
-	i2c_info.platform_data = pdata;
-	if (IS_ERR(pdata))
-		return;
-
-	if (dev->delay)
-		intel_scu_i2c_device_register(pentry->host_num, &i2c_info);
-	else
-		i2c_register_board_info(pentry->host_num, &i2c_info, 1);
-}
-
-static void __init sfi_handle_sd_dev(struct sfi_device_table_entry *pentry,
-					struct devs_id *dev)
-{
-	struct mid_sd_board_info sd_info;
-	void *pdata;
-
-	memset(&sd_info, 0, sizeof(sd_info));
-	strncpy(sd_info.name, pentry->name, SFI_NAME_LEN);
-	sd_info.bus_num = pentry->host_num;
-	sd_info.max_clk = pentry->max_freq;
-	sd_info.addr = pentry->addr;
-	pr_debug("SD bus = %d, name = %16.16s, max_clk = %d, addr = 0x%x\n",
-		 sd_info.bus_num,
-		 sd_info.name,
-		 sd_info.max_clk,
-		 sd_info.addr);
-	pdata = intel_mid_sfi_get_pdata(dev, &sd_info);
-	if (IS_ERR(pdata))
-		return;
-
-	/* Nothing we can do with this for now */
-	sd_info.platform_data = pdata;
-
-	pr_debug("Successfully registered %16.16s", sd_info.name);
-}
-
-extern struct devs_id *const __x86_intel_mid_dev_start[],
-		      *const __x86_intel_mid_dev_end[];
-
-static struct devs_id __init *get_device_id(u8 type, char *name)
-{
-	struct devs_id *const *dev_table;
-
-	for (dev_table = __x86_intel_mid_dev_start;
-			dev_table < __x86_intel_mid_dev_end; dev_table++) {
-		struct devs_id *dev = *dev_table;
-		if (dev->type == type &&
-			!strncmp(dev->name, name, SFI_NAME_LEN)) {
-			return dev;
-		}
-	}
-
-	return NULL;
-}
-
-static int __init sfi_parse_devs(struct sfi_table_header *table)
-{
-	struct sfi_table_simple *sb;
-	struct sfi_device_table_entry *pentry;
-	struct devs_id *dev = NULL;
-	int num, i, ret;
-	int polarity;
-	struct irq_alloc_info info;
-
-	sb = (struct sfi_table_simple *)table;
-	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
-	pentry = (struct sfi_device_table_entry *)sb->pentry;
-
-	for (i = 0; i < num; i++, pentry++) {
-		int irq = pentry->irq;
-
-		if (irq != (u8)0xff) { /* native RTE case */
-			/* these SPI2 devices are not exposed to system as PCI
-			 * devices, but they have separate RTE entry in IOAPIC
-			 * so we have to enable them one by one here
-			 */
-			if (intel_mid_identify_cpu() ==
-					INTEL_MID_CPU_CHIP_TANGIER) {
-				if (!strncmp(pentry->name, "r69001-ts-i2c", 13))
-					/* active low */
-					polarity = 1;
-				else if (!strncmp(pentry->name,
-						"synaptics_3202", 14))
-					/* active low */
-					polarity = 1;
-				else if (irq == 41)
-					/* fast_int_1 */
-					polarity = 1;
-				else
-					/* active high */
-					polarity = 0;
-			} else {
-				/* PNW and CLV go with active low */
-				polarity = 1;
-			}
-
-			ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity);
-			ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info);
-			WARN_ON(ret < 0);
-		}
-
-		dev = get_device_id(pentry->type, pentry->name);
-
-		if (!dev)
-			continue;
-
-		switch (pentry->type) {
-		case SFI_DEV_TYPE_IPC:
-			sfi_handle_ipc_dev(pentry, dev);
-			break;
-		case SFI_DEV_TYPE_SPI:
-			sfi_handle_spi_dev(pentry, dev);
-			break;
-		case SFI_DEV_TYPE_I2C:
-			sfi_handle_i2c_dev(pentry, dev);
-			break;
-		case SFI_DEV_TYPE_SD:
-			sfi_handle_sd_dev(pentry, dev);
-			break;
-		case SFI_DEV_TYPE_UART:
-		case SFI_DEV_TYPE_HSI:
-		default:
-			break;
-		}
-	}
-	return 0;
-}
-
-static int __init intel_mid_platform_init(void)
-{
-	sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
-	sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
-	return 0;
-}
-arch_initcall(intel_mid_platform_init);
diff --git a/arch/x86/platform/sfi/Makefile b/arch/x86/platform/sfi/Makefile
deleted file mode 100644
index 4eba24c2af67..000000000000
--- a/arch/x86/platform/sfi/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_SFI)		+= sfi.o
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
deleted file mode 100644
index 6259563760f9..000000000000
--- a/arch/x86/platform/sfi/sfi.c
+++ /dev/null
@@ -1,100 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sfi.c - x86 architecture SFI support.
- *
- * Copyright (c) 2009, Intel Corporation.
- */
-
-#define KMSG_COMPONENT "SFI"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/io.h>
-
-#include <asm/irqdomain.h>
-#include <asm/io_apic.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
-
-/* All CPUs enumerated by SFI must be present and enabled */
-static void __init mp_sfi_register_lapic(u8 id)
-{
-	if (MAX_LOCAL_APIC - id <= 0) {
-		pr_warn("Processor #%d invalid (max %d)\n", id, MAX_LOCAL_APIC);
-		return;
-	}
-
-	pr_info("registering lapic[%d]\n", id);
-
-	generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
-}
-
-static int __init sfi_parse_cpus(struct sfi_table_header *table)
-{
-	struct sfi_table_simple *sb;
-	struct sfi_cpu_table_entry *pentry;
-	int i;
-	int cpu_num;
-
-	sb = (struct sfi_table_simple *)table;
-	cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
-	pentry = (struct sfi_cpu_table_entry *)sb->pentry;
-
-	for (i = 0; i < cpu_num; i++) {
-		mp_sfi_register_lapic(pentry->apic_id);
-		pentry++;
-	}
-
-	smp_found_config = 1;
-	return 0;
-}
-#endif /* CONFIG_X86_LOCAL_APIC */
-
-#ifdef CONFIG_X86_IO_APIC
-
-static int __init sfi_parse_ioapic(struct sfi_table_header *table)
-{
-	struct sfi_table_simple *sb;
-	struct sfi_apic_table_entry *pentry;
-	int i, num;
-	struct ioapic_domain_cfg cfg = {
-		.type = IOAPIC_DOMAIN_STRICT,
-		.ops = &mp_ioapic_irqdomain_ops,
-	};
-
-	sb = (struct sfi_table_simple *)table;
-	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
-	pentry = (struct sfi_apic_table_entry *)sb->pentry;
-
-	for (i = 0; i < num; i++) {
-		mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg);
-		pentry++;
-	}
-
-	WARN(pic_mode, KERN_WARNING
-		"SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
-	pic_mode = 0;
-	return 0;
-}
-#endif /* CONFIG_X86_IO_APIC */
-
-/*
- * sfi_platform_init(): register lapics & io-apics
- */
-int __init sfi_platform_init(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	register_lapic_address(sfi_lapic_addr);
-	sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
-#endif
-#ifdef CONFIG_X86_IO_APIC
-	sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
-#endif
-	return 0;
-}
diff --git a/drivers/Makefile b/drivers/Makefile
index fd11b9ac4cc3..b2af1514a6b3 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -27,7 +27,7 @@ obj-y				+= idle/
 obj-y				+= char/ipmi/
 
 obj-$(CONFIG_ACPI)		+= acpi/
-obj-$(CONFIG_SFI)		+= sfi/
+
 # PnP must come after ACPI since it will eventually need to check if acpi
 # was used and do nothing if so
 obj-$(CONFIG_PNP)		+= pnp/
diff --git a/drivers/platform/x86/intel_scu_pcidrv.c b/drivers/platform/x86/intel_scu_pcidrv.c
index 8c5fd8240da9..80abc708e4f2 100644
--- a/drivers/platform/x86/intel_scu_pcidrv.c
+++ b/drivers/platform/x86/intel_scu_pcidrv.c
@@ -17,7 +17,6 @@
 static int intel_scu_pci_probe(struct pci_dev *pdev,
 			       const struct pci_device_id *id)
 {
-	void (*setup_fn)(void) = (void (*)(void))id->driver_data;
 	struct intel_scu_ipc_data scu_data = {};
 	struct intel_scu_ipc_dev *scu;
 	int ret;
@@ -30,27 +29,14 @@ static int intel_scu_pci_probe(struct pci_dev *pdev,
 	scu_data.irq = pdev->irq;
 
 	scu = intel_scu_ipc_register(&pdev->dev, &scu_data);
-	if (IS_ERR(scu))
-		return PTR_ERR(scu);
-
-	if (setup_fn)
-		setup_fn();
-	return 0;
-}
-
-static void intel_mid_scu_setup(void)
-{
-	intel_scu_devices_create();
+	return PTR_ERR_OR_ZERO(scu);
 }
 
 static const struct pci_device_id pci_ids[] = {
-	{ PCI_VDEVICE(INTEL, 0x080e),
-	  .driver_data = (kernel_ulong_t)intel_mid_scu_setup },
-	{ PCI_VDEVICE(INTEL, 0x08ea),
-	  .driver_data = (kernel_ulong_t)intel_mid_scu_setup },
+	{ PCI_VDEVICE(INTEL, 0x080e) },
+	{ PCI_VDEVICE(INTEL, 0x08ea) },
 	{ PCI_VDEVICE(INTEL, 0x0a94) },
-	{ PCI_VDEVICE(INTEL, 0x11a0),
-	  .driver_data = (kernel_ulong_t)intel_mid_scu_setup },
+	{ PCI_VDEVICE(INTEL, 0x11a0) },
 	{ PCI_VDEVICE(INTEL, 0x1a94) },
 	{ PCI_VDEVICE(INTEL, 0x5a94) },
 	{}
diff --git a/drivers/sfi/Kconfig b/drivers/sfi/Kconfig
deleted file mode 100644
index 3d0b64db214e..000000000000
--- a/drivers/sfi/Kconfig
+++ /dev/null
@@ -1,18 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# SFI Configuration
-#
-
-menuconfig SFI
-	bool "SFI (Simple Firmware Interface) Support"
-	help
-	The Simple Firmware Interface (SFI) provides a lightweight method
-	for platform firmware to pass information to the operating system
-	via static tables in memory.  Kernel SFI support is required to
-	boot on SFI-only platforms.  Currently, all SFI-only platforms are
-	based on the 2nd generation Intel Atom processor platform,
-	code-named Moorestown.
-
-	For more information, see http://simplefirmware.org
-
-	Say 'Y' here to enable the kernel to boot on SFI-only platforms.
diff --git a/drivers/sfi/Makefile b/drivers/sfi/Makefile
deleted file mode 100644
index ca9436b12386..000000000000
--- a/drivers/sfi/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-y	+= sfi_acpi.o
-obj-y	+= sfi_core.o
-
diff --git a/drivers/sfi/sfi_acpi.c b/drivers/sfi/sfi_acpi.c
deleted file mode 100644
index d277b36eb389..000000000000
--- a/drivers/sfi/sfi_acpi.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/* sfi_acpi.c Simple Firmware Interface - ACPI extensions */
-
-/*
-
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  BSD LICENSE
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#define KMSG_COMPONENT "SFI"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/sfi_acpi.h>
-#include "sfi_core.h"
-
-/*
- * SFI can access ACPI-defined tables via an optional ACPI XSDT.
- *
- * This allows re-use, and avoids re-definition, of standard tables.
- * For example, the "MCFG" table is defined by PCI, reserved by ACPI,
- * and is expected to be present many SFI-only systems.
- */
-
-static struct acpi_table_xsdt *xsdt_va __read_mostly;
-
-#define XSDT_GET_NUM_ENTRIES(ptable, entry_type) \
-	((ptable->header.length - sizeof(struct acpi_table_header)) / \
-	(sizeof(entry_type)))
-
-static inline struct sfi_table_header *acpi_to_sfi_th(
-				struct acpi_table_header *th)
-{
-	return (struct sfi_table_header *)th;
-}
-
-static inline struct acpi_table_header *sfi_to_acpi_th(
-				struct sfi_table_header *th)
-{
-	return (struct acpi_table_header *)th;
-}
-
-/*
- * sfi_acpi_parse_xsdt()
- *
- * Parse the ACPI XSDT for later access by sfi_acpi_table_parse().
- */
-static int __init sfi_acpi_parse_xsdt(struct sfi_table_header *th)
-{
-	struct sfi_table_key key = SFI_ANY_KEY;
-	int tbl_cnt, i;
-	void *ret;
-
-	xsdt_va = (struct acpi_table_xsdt *)th;
-	tbl_cnt = XSDT_GET_NUM_ENTRIES(xsdt_va, u64);
-	for (i = 0; i < tbl_cnt; i++) {
-		ret = sfi_check_table(xsdt_va->table_offset_entry[i], &key);
-		if (IS_ERR(ret)) {
-			disable_sfi();
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-int __init sfi_acpi_init(void)
-{
-	struct sfi_table_key xsdt_key = { .sig = SFI_SIG_XSDT };
-
-	sfi_table_parse(SFI_SIG_XSDT, NULL, NULL, sfi_acpi_parse_xsdt);
-
-	/* Only call the get_table to keep the table mapped */
-	xsdt_va = (struct acpi_table_xsdt *)sfi_get_table(&xsdt_key);
-	return 0;
-}
-
-static struct acpi_table_header *sfi_acpi_get_table(struct sfi_table_key *key)
-{
-	u32 tbl_cnt, i;
-	void *ret;
-
-	tbl_cnt = XSDT_GET_NUM_ENTRIES(xsdt_va, u64);
-	for (i = 0; i < tbl_cnt; i++) {
-		ret = sfi_check_table(xsdt_va->table_offset_entry[i], key);
-		if (!IS_ERR(ret) && ret)
-			return sfi_to_acpi_th(ret);
-	}
-
-	return NULL;
-}
-
-static void sfi_acpi_put_table(struct acpi_table_header *table)
-{
-	sfi_put_table(acpi_to_sfi_th(table));
-}
-
-/*
- * sfi_acpi_table_parse()
- *
- * Find specified table in XSDT, run handler on it and return its return value
- */
-int sfi_acpi_table_parse(char *signature, char *oem_id, char *oem_table_id,
-			int(*handler)(struct acpi_table_header *))
-{
-	struct acpi_table_header *table = NULL;
-	struct sfi_table_key key;
-	int ret = 0;
-
-	if (sfi_disabled)
-		return -1;
-
-	key.sig = signature;
-	key.oem_id = oem_id;
-	key.oem_table_id = oem_table_id;
-
-	table = sfi_acpi_get_table(&key);
-	if (!table)
-		return -EINVAL;
-
-	ret = handler(table);
-	sfi_acpi_put_table(table);
-	return ret;
-}
-
-static ssize_t sfi_acpi_table_show(struct file *filp, struct kobject *kobj,
-			       struct bin_attribute *bin_attr, char *buf,
-			       loff_t offset, size_t count)
-{
-	struct sfi_table_attr *tbl_attr =
-	    container_of(bin_attr, struct sfi_table_attr, attr);
-	struct acpi_table_header *th = NULL;
-	struct sfi_table_key key;
-	ssize_t cnt;
-
-	key.sig = tbl_attr->name;
-	key.oem_id = NULL;
-	key.oem_table_id = NULL;
-
-	th = sfi_acpi_get_table(&key);
-	if (!th)
-		return 0;
-
-	cnt =  memory_read_from_buffer(buf, count, &offset,
-					th, th->length);
-	sfi_acpi_put_table(th);
-
-	return cnt;
-}
-
-
-void __init sfi_acpi_sysfs_init(void)
-{
-	u32 tbl_cnt, i;
-	struct sfi_table_attr *tbl_attr;
-
-	tbl_cnt = XSDT_GET_NUM_ENTRIES(xsdt_va, u64);
-	for (i = 0; i < tbl_cnt; i++) {
-		tbl_attr =
-			sfi_sysfs_install_table(xsdt_va->table_offset_entry[i]);
-		tbl_attr->attr.read = sfi_acpi_table_show;
-	}
-
-	return;
-}
diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c
deleted file mode 100644
index a5136901dd8a..000000000000
--- a/drivers/sfi/sfi_core.c
+++ /dev/null
@@ -1,522 +0,0 @@
-/* sfi_core.c Simple Firmware Interface - core internals */
-
-/*
-
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  BSD LICENSE
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#define KMSG_COMPONENT "SFI"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/memblock.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/slab.h>
-#include <linux/io.h>
-
-#include "sfi_core.h"
-
-#define ON_SAME_PAGE(addr1, addr2) \
-	(((unsigned long)(addr1) & PAGE_MASK) == \
-	((unsigned long)(addr2) & PAGE_MASK))
-#define TABLE_ON_PAGE(page, table, size) (ON_SAME_PAGE(page, table) && \
-				ON_SAME_PAGE(page, table + size))
-
-int sfi_disabled __read_mostly;
-EXPORT_SYMBOL(sfi_disabled);
-
-static u64 syst_pa __read_mostly;
-static struct sfi_table_simple *syst_va __read_mostly;
-
-/*
- * FW creates and saves the SFI tables in memory. When these tables get
- * used, they may need to be mapped to virtual address space, and the mapping
- * can happen before or after the memremap() is ready, so a flag is needed
- * to indicating this
- */
-static u32 sfi_use_memremap __read_mostly;
-
-/*
- * sfi_un/map_memory calls early_memremap/memunmap which is a __init function
- * and introduces section mismatch. So use __ref to make it calm.
- */
-static void __iomem * __ref sfi_map_memory(u64 phys, u32 size)
-{
-	if (!phys || !size)
-		return NULL;
-
-	if (sfi_use_memremap)
-		return memremap(phys, size, MEMREMAP_WB);
-	else
-		return early_memremap(phys, size);
-}
-
-static void __ref sfi_unmap_memory(void __iomem *virt, u32 size)
-{
-	if (!virt || !size)
-		return;
-
-	if (sfi_use_memremap)
-		memunmap(virt);
-	else
-		early_memunmap(virt, size);
-}
-
-static void sfi_print_table_header(unsigned long long pa,
-				struct sfi_table_header *header)
-{
-	pr_info("%4.4s %llX, %04X (v%d %6.6s %8.8s)\n",
-		header->sig, pa,
-		header->len, header->rev, header->oem_id,
-		header->oem_table_id);
-}
-
-/*
- * sfi_verify_table()
- * Sanity check table lengh, calculate checksum
- */
-static int sfi_verify_table(struct sfi_table_header *table)
-{
-
-	u8 checksum = 0;
-	u8 *puchar = (u8 *)table;
-	u32 length = table->len;
-
-	/* Sanity check table length against arbitrary 1MB limit */
-	if (length > 0x100000) {
-		pr_err("Invalid table length 0x%x\n", length);
-		return -1;
-	}
-
-	while (length--)
-		checksum += *puchar++;
-
-	if (checksum) {
-		pr_err("Checksum %2.2X should be %2.2X\n",
-			table->csum, table->csum - checksum);
-		return -1;
-	}
-	return 0;
-}
-
-/*
- * sfi_map_table()
- *
- * Return address of mapped table
- * Check for common case that we can re-use mapping to SYST,
- * which requires syst_pa, syst_va to be initialized.
- */
-static struct sfi_table_header *sfi_map_table(u64 pa)
-{
-	struct sfi_table_header *th;
-	u32 length;
-
-	if (!TABLE_ON_PAGE(syst_pa, pa, sizeof(struct sfi_table_header)))
-		th = sfi_map_memory(pa, sizeof(struct sfi_table_header));
-	else
-		th = (void *)syst_va + (pa - syst_pa);
-
-	 /* If table fits on same page as its header, we are done */
-	if (TABLE_ON_PAGE(th, th, th->len))
-		return th;
-
-	/* Entire table does not fit on same page as SYST */
-	length = th->len;
-	if (!TABLE_ON_PAGE(syst_pa, pa, sizeof(struct sfi_table_header)))
-		sfi_unmap_memory(th, sizeof(struct sfi_table_header));
-
-	return sfi_map_memory(pa, length);
-}
-
-/*
- * sfi_unmap_table()
- *
- * Undoes effect of sfi_map_table() by unmapping table
- * if it did not completely fit on same page as SYST.
- */
-static void sfi_unmap_table(struct sfi_table_header *th)
-{
-	if (!TABLE_ON_PAGE(syst_va, th, th->len))
-		sfi_unmap_memory(th, TABLE_ON_PAGE(th, th, th->len) ?
-					sizeof(*th) : th->len);
-}
-
-static int sfi_table_check_key(struct sfi_table_header *th,
-				struct sfi_table_key *key)
-{
-
-	if (strncmp(th->sig, key->sig, SFI_SIGNATURE_SIZE)
-		|| (key->oem_id && strncmp(th->oem_id,
-				key->oem_id, SFI_OEM_ID_SIZE))
-		|| (key->oem_table_id && strncmp(th->oem_table_id,
-				key->oem_table_id, SFI_OEM_TABLE_ID_SIZE)))
-		return -1;
-
-	return 0;
-}
-
-/*
- * This function will be used in 2 cases:
- * 1. used to enumerate and verify the tables addressed by SYST/XSDT,
- *    thus no signature will be given (in kernel boot phase)
- * 2. used to parse one specific table, signature must exist, and
- *    the mapped virt address will be returned, and the virt space
- *    will be released by call sfi_put_table() later
- *
- * This two cases are from two different functions with two different
- * sections and causes section mismatch warning. So use __ref to tell
- * modpost not to make any noise.
- *
- * Return value:
- *	NULL:			when can't find a table matching the key
- *	ERR_PTR(error):		error value
- *	virt table address:	when a matched table is found
- */
-struct sfi_table_header *
- __ref sfi_check_table(u64 pa, struct sfi_table_key *key)
-{
-	struct sfi_table_header *th;
-	void *ret = NULL;
-
-	th = sfi_map_table(pa);
-	if (!th)
-		return ERR_PTR(-ENOMEM);
-
-	if (!key->sig) {
-		sfi_print_table_header(pa, th);
-		if (sfi_verify_table(th))
-			ret = ERR_PTR(-EINVAL);
-	} else {
-		if (!sfi_table_check_key(th, key))
-			return th;	/* Success */
-	}
-
-	sfi_unmap_table(th);
-	return ret;
-}
-
-/*
- * sfi_get_table()
- *
- * Search SYST for the specified table with the signature in
- * the key, and return the mapped table
- */
-struct sfi_table_header *sfi_get_table(struct sfi_table_key *key)
-{
-	struct sfi_table_header *th;
-	u32 tbl_cnt, i;
-
-	tbl_cnt = SFI_GET_NUM_ENTRIES(syst_va, u64);
-	for (i = 0; i < tbl_cnt; i++) {
-		th = sfi_check_table(syst_va->pentry[i], key);
-		if (!IS_ERR(th) && th)
-			return th;
-	}
-
-	return NULL;
-}
-
-void sfi_put_table(struct sfi_table_header *th)
-{
-	sfi_unmap_table(th);
-}
-
-/* Find table with signature, run handler on it */
-int sfi_table_parse(char *signature, char *oem_id, char *oem_table_id,
-			sfi_table_handler handler)
-{
-	struct sfi_table_header *table = NULL;
-	struct sfi_table_key key;
-	int ret = -EINVAL;
-
-	if (sfi_disabled || !handler || !signature)
-		goto exit;
-
-	key.sig = signature;
-	key.oem_id = oem_id;
-	key.oem_table_id = oem_table_id;
-
-	table = sfi_get_table(&key);
-	if (!table)
-		goto exit;
-
-	ret = handler(table);
-	sfi_put_table(table);
-exit:
-	return ret;
-}
-EXPORT_SYMBOL_GPL(sfi_table_parse);
-
-/*
- * sfi_parse_syst()
- * Checksum all the tables in SYST and print their headers
- *
- * success: set syst_va, return 0
- */
-static int __init sfi_parse_syst(void)
-{
-	struct sfi_table_key key = SFI_ANY_KEY;
-	int tbl_cnt, i;
-	void *ret;
-
-	syst_va = sfi_map_memory(syst_pa, sizeof(struct sfi_table_simple));
-	if (!syst_va)
-		return -ENOMEM;
-
-	tbl_cnt = SFI_GET_NUM_ENTRIES(syst_va, u64);
-	for (i = 0; i < tbl_cnt; i++) {
-		ret = sfi_check_table(syst_va->pentry[i], &key);
-		if (IS_ERR(ret))
-			return PTR_ERR(ret);
-	}
-
-	return 0;
-}
-
-/*
- * The OS finds the System Table by searching 16-byte boundaries between
- * physical address 0x000E0000 and 0x000FFFFF. The OS shall search this region
- * starting at the low address and shall stop searching when the 1st valid SFI
- * System Table is found.
- *
- * success: set syst_pa, return 0
- * fail: return -1
- */
-static __init int sfi_find_syst(void)
-{
-	unsigned long offset, len;
-	void *start;
-
-	len = SFI_SYST_SEARCH_END - SFI_SYST_SEARCH_BEGIN;
-	start = sfi_map_memory(SFI_SYST_SEARCH_BEGIN, len);
-	if (!start)
-		return -1;
-
-	for (offset = 0; offset < len; offset += 16) {
-		struct sfi_table_header *syst_hdr;
-
-		syst_hdr = start + offset;
-		if (strncmp(syst_hdr->sig, SFI_SIG_SYST,
-				SFI_SIGNATURE_SIZE))
-			continue;
-
-		if (syst_hdr->len > PAGE_SIZE)
-			continue;
-
-		sfi_print_table_header(SFI_SYST_SEARCH_BEGIN + offset,
-					syst_hdr);
-
-		if (sfi_verify_table(syst_hdr))
-			continue;
-
-		/*
-		 * Enforce SFI spec mandate that SYST reside within a page.
-		 */
-		if (!ON_SAME_PAGE(syst_pa, syst_pa + syst_hdr->len)) {
-			pr_info("SYST 0x%llx + 0x%x crosses page\n",
-					syst_pa, syst_hdr->len);
-			continue;
-		}
-
-		/* Success */
-		syst_pa = SFI_SYST_SEARCH_BEGIN + offset;
-		sfi_unmap_memory(start, len);
-		return 0;
-	}
-
-	sfi_unmap_memory(start, len);
-	return -1;
-}
-
-static struct kobject *sfi_kobj;
-static struct kobject *tables_kobj;
-
-static ssize_t sfi_table_show(struct file *filp, struct kobject *kobj,
-			       struct bin_attribute *bin_attr, char *buf,
-			       loff_t offset, size_t count)
-{
-	struct sfi_table_attr *tbl_attr =
-	    container_of(bin_attr, struct sfi_table_attr, attr);
-	struct sfi_table_header *th = NULL;
-	struct sfi_table_key key;
-	ssize_t cnt;
-
-	key.sig = tbl_attr->name;
-	key.oem_id = NULL;
-	key.oem_table_id = NULL;
-
-	if (strncmp(SFI_SIG_SYST, tbl_attr->name, SFI_SIGNATURE_SIZE)) {
-		th = sfi_get_table(&key);
-		if (!th)
-			return 0;
-
-		cnt =  memory_read_from_buffer(buf, count, &offset,
-						th, th->len);
-		sfi_put_table(th);
-	} else
-		cnt =  memory_read_from_buffer(buf, count, &offset,
-					syst_va, syst_va->header.len);
-
-	return cnt;
-}
-
-struct sfi_table_attr __init *sfi_sysfs_install_table(u64 pa)
-{
-	struct sfi_table_attr *tbl_attr;
-	struct sfi_table_header *th;
-	int ret;
-
-	tbl_attr = kzalloc(sizeof(struct sfi_table_attr), GFP_KERNEL);
-	if (!tbl_attr)
-		return NULL;
-
-	th = sfi_map_table(pa);
-	if (!th || !th->sig[0]) {
-		kfree(tbl_attr);
-		return NULL;
-	}
-
-	sysfs_attr_init(&tbl_attr->attr.attr);
-	memcpy(tbl_attr->name, th->sig, SFI_SIGNATURE_SIZE);
-
-	tbl_attr->attr.size = 0;
-	tbl_attr->attr.read = sfi_table_show;
-	tbl_attr->attr.attr.name = tbl_attr->name;
-	tbl_attr->attr.attr.mode = 0400;
-
-	ret = sysfs_create_bin_file(tables_kobj,
-				  &tbl_attr->attr);
-	if (ret) {
-		kfree(tbl_attr);
-		tbl_attr = NULL;
-	}
-
-	sfi_unmap_table(th);
-	return tbl_attr;
-}
-
-static int __init sfi_sysfs_init(void)
-{
-	int tbl_cnt, i;
-
-	if (sfi_disabled)
-		return 0;
-
-	sfi_kobj = kobject_create_and_add("sfi", firmware_kobj);
-	if (!sfi_kobj)
-		return 0;
-
-	tables_kobj = kobject_create_and_add("tables", sfi_kobj);
-	if (!tables_kobj) {
-		kobject_put(sfi_kobj);
-		return 0;
-	}
-
-	sfi_sysfs_install_table(syst_pa);
-
-	tbl_cnt = SFI_GET_NUM_ENTRIES(syst_va, u64);
-
-	for (i = 0; i < tbl_cnt; i++)
-		sfi_sysfs_install_table(syst_va->pentry[i]);
-
-	sfi_acpi_sysfs_init();
-	kobject_uevent(sfi_kobj, KOBJ_ADD);
-	kobject_uevent(tables_kobj, KOBJ_ADD);
-	pr_info("SFI sysfs interfaces init success\n");
-	return 0;
-}
-
-void __init sfi_init(void)
-{
-	if (!acpi_disabled)
-		disable_sfi();
-
-	if (sfi_disabled)
-		return;
-
-	pr_info("Simple Firmware Interface v0.81 http://simplefirmware.org\n");
-
-	if (sfi_find_syst() || sfi_parse_syst() || sfi_platform_init())
-		disable_sfi();
-
-	return;
-}
-
-void __init sfi_init_late(void)
-{
-	int length;
-
-	if (sfi_disabled)
-		return;
-
-	length = syst_va->header.len;
-	sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple));
-
-	/* Use memremap now after it is ready */
-	sfi_use_memremap = 1;
-	syst_va = sfi_map_memory(syst_pa, length);
-
-	sfi_acpi_init();
-}
-
-/*
- * The reason we put it here because we need wait till the /sys/firmware
- * is setup, then our interface can be registered in /sys/firmware/sfi
- */
-core_initcall(sfi_sysfs_init);
diff --git a/drivers/sfi/sfi_core.h b/drivers/sfi/sfi_core.h
deleted file mode 100644
index 1d5cfe854cf7..000000000000
--- a/drivers/sfi/sfi_core.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* sfi_core.h Simple Firmware Interface, internal header */
-
-/*
-
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  BSD LICENSE
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <linux/sysfs.h>
-
-struct sfi_table_key{
-	char	*sig;
-	char	*oem_id;
-	char	*oem_table_id;
-};
-
-/* sysfs interface */
-struct sfi_table_attr {
-	struct bin_attribute attr;
-	char name[8];
-};
-
-#define SFI_ANY_KEY { .sig = NULL, .oem_id = NULL, .oem_table_id = NULL }
-
-extern int __init sfi_acpi_init(void);
-extern  struct sfi_table_header *sfi_check_table(u64 paddr,
-					struct sfi_table_key *key);
-struct sfi_table_header *sfi_get_table(struct sfi_table_key *key);
-extern void sfi_put_table(struct sfi_table_header *table);
-extern struct sfi_table_attr __init *sfi_sysfs_install_table(u64 pa);
-extern void __init sfi_acpi_sysfs_init(void);
diff --git a/include/linux/sfi.h b/include/linux/sfi.h
deleted file mode 100644
index e0e1597ef9e6..000000000000
--- a/include/linux/sfi.h
+++ /dev/null
@@ -1,210 +0,0 @@
-/* sfi.h Simple Firmware Interface */
-
-/*
-
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  BSD LICENSE
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef _LINUX_SFI_H
-#define _LINUX_SFI_H
-
-#include <linux/init.h>
-#include <linux/types.h>
-
-/* Table signatures reserved by the SFI specification */
-#define SFI_SIG_SYST		"SYST"
-#define SFI_SIG_FREQ		"FREQ"
-#define SFI_SIG_IDLE		"IDLE"
-#define SFI_SIG_CPUS		"CPUS"
-#define SFI_SIG_MTMR		"MTMR"
-#define SFI_SIG_MRTC		"MRTC"
-#define SFI_SIG_MMAP		"MMAP"
-#define SFI_SIG_APIC		"APIC"
-#define SFI_SIG_XSDT		"XSDT"
-#define SFI_SIG_WAKE		"WAKE"
-#define SFI_SIG_DEVS		"DEVS"
-#define SFI_SIG_GPIO		"GPIO"
-
-#define SFI_SIGNATURE_SIZE	4
-#define SFI_OEM_ID_SIZE		6
-#define SFI_OEM_TABLE_ID_SIZE	8
-
-#define SFI_NAME_LEN		16
-
-#define SFI_SYST_SEARCH_BEGIN		0x000E0000
-#define SFI_SYST_SEARCH_END		0x000FFFFF
-
-#define SFI_GET_NUM_ENTRIES(ptable, entry_type) \
-	((ptable->header.len - sizeof(struct sfi_table_header)) / \
-	(sizeof(entry_type)))
-/*
- * Table structures must be byte-packed to match the SFI specification,
- * as they are provided by the BIOS.
- */
-struct sfi_table_header {
-	char	sig[SFI_SIGNATURE_SIZE];
-	u32	len;
-	u8	rev;
-	u8	csum;
-	char	oem_id[SFI_OEM_ID_SIZE];
-	char	oem_table_id[SFI_OEM_TABLE_ID_SIZE];
-} __packed;
-
-struct sfi_table_simple {
-	struct sfi_table_header		header;
-	u64				pentry[1];
-} __packed;
-
-/* Comply with UEFI spec 2.1 */
-struct sfi_mem_entry {
-	u32	type;
-	u64	phys_start;
-	u64	virt_start;
-	u64	pages;
-	u64	attrib;
-} __packed;
-
-struct sfi_cpu_table_entry {
-	u32	apic_id;
-} __packed;
-
-struct sfi_cstate_table_entry {
-	u32	hint;		/* MWAIT hint */
-	u32	latency;	/* latency in ms */
-} __packed;
-
-struct sfi_apic_table_entry {
-	u64	phys_addr;	/* phy base addr for APIC reg */
-} __packed;
-
-struct sfi_freq_table_entry {
-	u32	freq_mhz;	/* in MHZ */
-	u32	latency;	/* transition latency in ms */
-	u32	ctrl_val;	/* value to write to PERF_CTL */
-} __packed;
-
-struct sfi_wake_table_entry {
-	u64	phys_addr;	/* pointer to where the wake vector locates */
-} __packed;
-
-struct sfi_timer_table_entry {
-	u64	phys_addr;	/* phy base addr for the timer */
-	u32	freq_hz;	/* in HZ */
-	u32	irq;
-} __packed;
-
-struct sfi_rtc_table_entry {
-	u64	phys_addr;	/* phy base addr for the RTC */
-	u32	irq;
-} __packed;
-
-struct sfi_device_table_entry {
-	u8	type;		/* bus type, I2C, SPI or ...*/
-#define SFI_DEV_TYPE_SPI	0
-#define SFI_DEV_TYPE_I2C	1
-#define SFI_DEV_TYPE_UART	2
-#define SFI_DEV_TYPE_HSI	3
-#define SFI_DEV_TYPE_IPC	4
-#define SFI_DEV_TYPE_SD		5
-
-	u8	host_num;	/* attached to host 0, 1...*/
-	u16	addr;
-	u8	irq;
-	u32	max_freq;
-	char	name[SFI_NAME_LEN];
-} __packed;
-
-struct sfi_gpio_table_entry {
-	char	controller_name[SFI_NAME_LEN];
-	u16	pin_no;
-	char	pin_name[SFI_NAME_LEN];
-} __packed;
-
-typedef int (*sfi_table_handler) (struct sfi_table_header *table);
-
-#ifdef CONFIG_SFI
-extern void __init sfi_init(void);
-extern int __init sfi_platform_init(void);
-extern void __init sfi_init_late(void);
-extern int sfi_table_parse(char *signature, char *oem_id, char *oem_table_id,
-				sfi_table_handler handler);
-
-extern int sfi_disabled;
-static inline void disable_sfi(void)
-{
-	sfi_disabled = 1;
-}
-
-#else /* !CONFIG_SFI */
-
-static inline void sfi_init(void)
-{
-}
-
-static inline void sfi_init_late(void)
-{
-}
-
-#define sfi_disabled	0
-
-static inline int sfi_table_parse(char *signature, char *oem_id,
-					char *oem_table_id,
-					sfi_table_handler handler)
-{
-	return -1;
-}
-
-#endif /* !CONFIG_SFI */
-
-#endif /*_LINUX_SFI_H*/
diff --git a/include/linux/sfi_acpi.h b/include/linux/sfi_acpi.h
deleted file mode 100644
index a6e555cbe05c..000000000000
--- a/include/linux/sfi_acpi.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* sfi.h Simple Firmware Interface */
-
-/*
-
-  This file is provided under a dual BSD/GPLv2 license.  When using or
-  redistributing this file, you may do so under either license.
-
-  GPL LICENSE SUMMARY
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of version 2 of the GNU General Public License as
-  published by the Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-  The full GNU General Public License is included in this distribution
-  in the file called LICENSE.GPL.
-
-  BSD LICENSE
-
-  Copyright(c) 2009 Intel Corporation. All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in
-      the documentation and/or other materials provided with the
-      distribution.
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef _LINUX_SFI_ACPI_H
-#define _LINUX_SFI_ACPI_H
-
-#include <linux/acpi.h>
-#include <linux/sfi.h>
-
-#ifdef CONFIG_SFI
-extern int sfi_acpi_table_parse(char *signature, char *oem_id,
-				char *oem_table_id,
-				int (*handler)(struct acpi_table_header *));
-
-static inline int __init acpi_sfi_table_parse(char *signature,
-				int (*handler)(struct acpi_table_header *))
-{
-	if (!acpi_table_parse(signature, handler))
-		return 0;
-
-	return sfi_acpi_table_parse(signature, NULL, NULL, handler);
-}
-#else /* !CONFIG_SFI */
-static inline int sfi_acpi_table_parse(char *signature, char *oem_id,
-				char *oem_table_id,
-				int (*handler)(struct acpi_table_header *))
-{
-	return -1;
-}
-
-static inline int __init acpi_sfi_table_parse(char *signature,
-				int (*handler)(struct acpi_table_header *))
-{
-	return acpi_table_parse(signature, handler);
-}
-#endif /* !CONFIG_SFI */
-
-#endif /*_LINUX_SFI_ACPI_H*/
diff --git a/init/main.c b/init/main.c
index a626e78dbf06..e9933cbf60d4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -74,7 +74,6 @@
 #include <linux/kgdb.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
-#include <linux/sfi.h>
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/perf_event.h>
@@ -1054,7 +1053,6 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
 
 	acpi_subsystem_init();
 	arch_post_acpi_subsys_init();
-	sfi_init_late();
 	kcsan_init();
 
 	/* Do the rest non-__init'ed, we're now alive */
-- 
cgit v1.2.3


From ca66dca5eda6bd16b7b27fed2a034f2396df5627 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Fri, 5 Feb 2021 03:01:18 +0300
Subject: thermal: qcom: add support for adc-tm5 PMIC thermal monitor

Add support for Thermal Monitoring part of PMIC5. This part is closely
coupled with ADC, using it's channels directly. ADC-TM support
generating interrupts on ADC value crossing low or high voltage bounds,
which is used to support thermal trip points.

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20210205000118.493610-3-dmitry.baryshkov@linaro.org
---
 drivers/iio/adc/qcom-vadc-common.c       |  50 +++
 drivers/thermal/qcom/Kconfig             |  11 +
 drivers/thermal/qcom/Makefile            |   1 +
 drivers/thermal/qcom/qcom-spmi-adc-tm5.c | 623 +++++++++++++++++++++++++++++++
 include/linux/iio/adc/qcom-vadc-common.h |   3 +
 5 files changed, 688 insertions(+)
 create mode 100644 drivers/thermal/qcom/qcom-spmi-adc-tm5.c

(limited to 'include/linux')

diff --git a/drivers/iio/adc/qcom-vadc-common.c b/drivers/iio/adc/qcom-vadc-common.c
index 8682cf1e213f..14723896aab2 100644
--- a/drivers/iio/adc/qcom-vadc-common.c
+++ b/drivers/iio/adc/qcom-vadc-common.c
@@ -368,6 +368,28 @@ static int qcom_vadc_map_voltage_temp(const struct vadc_map_pt *pts,
 	return 0;
 }
 
+static s32 qcom_vadc_map_temp_voltage(const struct vadc_map_pt *pts,
+				      u32 tablesize, int input)
+{
+	u32 i = 0;
+
+	/*
+	 * Table must be sorted, find the interval of 'y' which contains value
+	 * 'input' and map it to proper 'x' value
+	 */
+	while (i < tablesize && pts[i].y < input)
+		i++;
+
+	if (i == 0)
+		return pts[0].x;
+	if (i == tablesize)
+		return pts[tablesize - 1].x;
+
+	/* interpolate linearly */
+	return fixp_linear_interpolate(pts[i - 1].y, pts[i - 1].x,
+			pts[i].y, pts[i].x, input);
+}
+
 static void qcom_vadc_scale_calib(const struct vadc_linear_graph *calib_graph,
 				  u16 adc_code,
 				  bool absolute,
@@ -463,6 +485,21 @@ static int qcom_vadc_scale_chg_temp(const struct vadc_linear_graph *calib_graph,
 	return 0;
 }
 
+/* convert voltage to ADC code, using 1.875V reference */
+static u16 qcom_vadc_scale_voltage_code(s32 voltage,
+					const struct vadc_prescale_ratio *prescale,
+					const u32 full_scale_code_volt,
+					unsigned int factor)
+{
+	s64 volt = voltage;
+	s64 adc_vdd_ref_mv = 1875; /* reference voltage */
+
+	volt *= prescale->num * factor * full_scale_code_volt;
+	volt = div64_s64(volt, (s64)prescale->den * adc_vdd_ref_mv * 1000);
+
+	return volt;
+}
+
 static int qcom_vadc_scale_code_voltage_factor(u16 adc_code,
 				const struct vadc_prescale_ratio *prescale,
 				const struct adc5_data *data,
@@ -627,6 +664,19 @@ int qcom_vadc_scale(enum vadc_scale_fn_type scaletype,
 }
 EXPORT_SYMBOL(qcom_vadc_scale);
 
+u16 qcom_adc_tm5_temp_volt_scale(unsigned int prescale_ratio,
+				 u32 full_scale_code_volt, int temp)
+{
+	const struct vadc_prescale_ratio *prescale = &adc5_prescale_ratios[prescale_ratio];
+	s32 voltage;
+
+	voltage = qcom_vadc_map_temp_voltage(adcmap_100k_104ef_104fb_1875_vref,
+					     ARRAY_SIZE(adcmap_100k_104ef_104fb_1875_vref),
+					     temp);
+	return qcom_vadc_scale_voltage_code(voltage, prescale, full_scale_code_volt, 1000);
+}
+EXPORT_SYMBOL(qcom_adc_tm5_temp_volt_scale);
+
 int qcom_adc5_hw_scale(enum vadc_scale_fn_type scaletype,
 		    unsigned int prescale_ratio,
 		    const struct adc5_data *data,
diff --git a/drivers/thermal/qcom/Kconfig b/drivers/thermal/qcom/Kconfig
index aa9c1d80fae4..8d5ac2df26dc 100644
--- a/drivers/thermal/qcom/Kconfig
+++ b/drivers/thermal/qcom/Kconfig
@@ -10,6 +10,17 @@ config QCOM_TSENS
 	  Also able to set threshold temperature for both hot and cold and update
 	  when a threshold is reached.
 
+config QCOM_SPMI_ADC_TM5
+	tristate "Qualcomm SPMI PMIC Thermal Monitor ADC5"
+	depends on OF && SPMI && IIO
+	select REGMAP_SPMI
+	select QCOM_VADC_COMMON
+	help
+	  This enables the thermal driver for the ADC thermal monitoring
+	  device. It shows up as a thermal zone with multiple trip points.
+	  Thermal client sets threshold temperature for both warm and cool and
+	  gets updated when a threshold is reached.
+
 config QCOM_SPMI_TEMP_ALARM
 	tristate "Qualcomm SPMI PMIC Temperature Alarm"
 	depends on OF && SPMI && IIO
diff --git a/drivers/thermal/qcom/Makefile b/drivers/thermal/qcom/Makefile
index ec86eef7f6a6..252ea7d9da0b 100644
--- a/drivers/thermal/qcom/Makefile
+++ b/drivers/thermal/qcom/Makefile
@@ -3,4 +3,5 @@ obj-$(CONFIG_QCOM_TSENS)	+= qcom_tsens.o
 
 qcom_tsens-y			+= tsens.o tsens-v2.o tsens-v1.o tsens-v0_1.o \
 				   tsens-8960.o
+obj-$(CONFIG_QCOM_SPMI_ADC_TM5)	+= qcom-spmi-adc-tm5.o
 obj-$(CONFIG_QCOM_SPMI_TEMP_ALARM)	+= qcom-spmi-temp-alarm.o
diff --git a/drivers/thermal/qcom/qcom-spmi-adc-tm5.c b/drivers/thermal/qcom/qcom-spmi-adc-tm5.c
new file mode 100644
index 000000000000..a2014375587d
--- /dev/null
+++ b/drivers/thermal/qcom/qcom-spmi-adc-tm5.c
@@ -0,0 +1,623 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 Linaro Limited
+ *
+ * Based on original driver:
+ * Copyright (c) 2012-2020, The Linux Foundation. All rights reserved.
+ */
+#include <linux/bitfield.h>
+#include <linux/iio/adc/qcom-vadc-common.h>
+#include <linux/iio/consumer.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/thermal.h>
+
+/*
+ * Thermal monitoring block consists of 8 (ADC_TM5_NUM_CHANNELS) channels. Each
+ * channel is programmed to use one of ADC channels for voltage comparison.
+ * Voltages are programmed using ADC codes, so we have to convert temp to
+ * voltage and then to ADC code value.
+ *
+ * Configuration of TM channels must match configuration of corresponding ADC
+ * channels.
+ */
+
+#define ADC5_MAX_CHANNEL                        0xc0
+#define ADC_TM5_NUM_CHANNELS		8
+
+#define ADC_TM5_STATUS_LOW			0x0a
+
+#define ADC_TM5_STATUS_HIGH			0x0b
+
+#define ADC_TM5_NUM_BTM				0x0f
+
+#define ADC_TM5_ADC_DIG_PARAM			0x42
+
+#define ADC_TM5_FAST_AVG_CTL			(ADC_TM5_ADC_DIG_PARAM + 1)
+#define ADC_TM5_FAST_AVG_EN				BIT(7)
+
+#define ADC_TM5_MEAS_INTERVAL_CTL		(ADC_TM5_ADC_DIG_PARAM + 2)
+#define ADC_TM5_TIMER1					3 /* 3.9ms */
+
+#define ADC_TM5_MEAS_INTERVAL_CTL2		(ADC_TM5_ADC_DIG_PARAM + 3)
+#define ADC_TM5_MEAS_INTERVAL_CTL2_MASK			0xf0
+#define ADC_TM5_TIMER2					10 /* 1 second */
+#define ADC_TM5_MEAS_INTERVAL_CTL3_MASK			0xf
+#define ADC_TM5_TIMER3					4 /* 4 second */
+
+#define ADC_TM_EN_CTL1				0x46
+#define ADC_TM_EN					BIT(7)
+#define ADC_TM_CONV_REQ				0x47
+#define ADC_TM_CONV_REQ_EN				BIT(7)
+
+#define ADC_TM5_M_CHAN_BASE			0x60
+
+#define ADC_TM5_M_ADC_CH_SEL_CTL(n)		(ADC_TM5_M_CHAN_BASE + ((n) * 8) + 0)
+#define ADC_TM5_M_LOW_THR0(n)			(ADC_TM5_M_CHAN_BASE + ((n) * 8) + 1)
+#define ADC_TM5_M_LOW_THR1(n)			(ADC_TM5_M_CHAN_BASE + ((n) * 8) + 2)
+#define ADC_TM5_M_HIGH_THR0(n)			(ADC_TM5_M_CHAN_BASE + ((n) * 8) + 3)
+#define ADC_TM5_M_HIGH_THR1(n)			(ADC_TM5_M_CHAN_BASE + ((n) * 8) + 4)
+#define ADC_TM5_M_MEAS_INTERVAL_CTL(n)		(ADC_TM5_M_CHAN_BASE + ((n) * 8) + 5)
+#define ADC_TM5_M_CTL(n)			(ADC_TM5_M_CHAN_BASE + ((n) * 8) + 6)
+#define ADC_TM5_M_CTL_HW_SETTLE_DELAY_MASK		0xf
+#define ADC_TM5_M_CTL_CAL_SEL_MASK			0x30
+#define ADC_TM5_M_CTL_CAL_VAL				0x40
+#define ADC_TM5_M_EN(n)				(ADC_TM5_M_CHAN_BASE + ((n) * 8) + 7)
+#define ADC_TM5_M_MEAS_EN				BIT(7)
+#define ADC_TM5_M_HIGH_THR_INT_EN			BIT(1)
+#define ADC_TM5_M_LOW_THR_INT_EN			BIT(0)
+
+enum adc5_timer_select {
+	ADC5_TIMER_SEL_1 = 0,
+	ADC5_TIMER_SEL_2,
+	ADC5_TIMER_SEL_3,
+	ADC5_TIMER_SEL_NONE,
+};
+
+struct adc_tm5_data {
+	const u32	full_scale_code_volt;
+	unsigned int	*decimation;
+	unsigned int	*hw_settle;
+};
+
+enum adc_tm5_cal_method {
+	ADC_TM5_NO_CAL = 0,
+	ADC_TM5_RATIOMETRIC_CAL,
+	ADC_TM5_ABSOLUTE_CAL
+};
+
+struct adc_tm5_chip;
+
+/**
+ * struct adc_tm5_channel - ADC Thermal Monitoring channel data.
+ * @channel: channel number.
+ * @adc_channel: corresponding ADC channel number.
+ * @cal_method: calibration method.
+ * @prescale: channel scaling performed on the input signal.
+ * @hw_settle_time: the time between AMUX being configured and the
+ *	start of conversion.
+ * @iio: IIO channel instance used by this channel.
+ * @chip: ADC TM chip instance.
+ * @tzd: thermal zone device used by this channel.
+ */
+struct adc_tm5_channel {
+	unsigned int		channel;
+	unsigned int		adc_channel;
+	enum adc_tm5_cal_method	cal_method;
+	unsigned int		prescale;
+	unsigned int		hw_settle_time;
+	struct iio_channel	*iio;
+	struct adc_tm5_chip	*chip;
+	struct thermal_zone_device *tzd;
+};
+
+/**
+ * struct adc_tm5_chip - ADC Thermal Monitoring properties
+ * @regmap: SPMI ADC5 Thermal Monitoring  peripheral register map field.
+ * @dev: SPMI ADC5 device.
+ * @data: software configuration data.
+ * @channels: array of ADC TM channel data.
+ * @nchannels: amount of channels defined/allocated
+ * @decimation: sampling rate supported for the channel.
+ * @avg_samples: ability to provide single result from the ADC
+ *	that is an average of multiple measurements.
+ * @base: base address of TM registers.
+ */
+struct adc_tm5_chip {
+	struct regmap		*regmap;
+	struct device		*dev;
+	const struct adc_tm5_data	*data;
+	struct adc_tm5_channel	*channels;
+	unsigned int		nchannels;
+	unsigned int		decimation;
+	unsigned int		avg_samples;
+	u16			base;
+};
+
+static const struct adc_tm5_data adc_tm5_data_pmic = {
+	.full_scale_code_volt = 0x70e4,
+	.decimation = (unsigned int []) { 250, 420, 840 },
+	.hw_settle = (unsigned int []) { 15, 100, 200, 300, 400, 500, 600, 700,
+					 1000, 2000, 4000, 8000, 16000, 32000,
+					 64000, 128000 },
+};
+
+static int adc_tm5_read(struct adc_tm5_chip *adc_tm, u16 offset, u8 *data, int len)
+{
+	return regmap_bulk_read(adc_tm->regmap, adc_tm->base + offset, data, len);
+}
+
+static int adc_tm5_write(struct adc_tm5_chip *adc_tm, u16 offset, u8 *data, int len)
+{
+	return regmap_bulk_write(adc_tm->regmap, adc_tm->base + offset, data, len);
+}
+
+static int adc_tm5_reg_update(struct adc_tm5_chip *adc_tm, u16 offset, u8 mask, u8 val)
+{
+	return regmap_write_bits(adc_tm->regmap, adc_tm->base + offset, mask, val);
+}
+
+static irqreturn_t adc_tm5_isr(int irq, void *data)
+{
+	struct adc_tm5_chip *chip = data;
+	u8 status_low, status_high, ctl;
+	int ret, i;
+
+	ret = adc_tm5_read(chip, ADC_TM5_STATUS_LOW, &status_low, sizeof(status_low));
+	if (unlikely(ret)) {
+		dev_err(chip->dev, "read status low failed: %d\n", ret);
+		return IRQ_HANDLED;
+	}
+
+	ret = adc_tm5_read(chip, ADC_TM5_STATUS_HIGH, &status_high, sizeof(status_high));
+	if (unlikely(ret)) {
+		dev_err(chip->dev, "read status high failed: %d\n", ret);
+		return IRQ_HANDLED;
+	}
+
+	for (i = 0; i < chip->nchannels; i++) {
+		bool upper_set = false, lower_set = false;
+		unsigned int ch = chip->channels[i].channel;
+
+		/* No TZD, we warned at the boot time */
+		if (!chip->channels[i].tzd)
+			continue;
+
+		ret = adc_tm5_read(chip, ADC_TM5_M_EN(ch), &ctl, sizeof(ctl));
+		if (unlikely(ret)) {
+			dev_err(chip->dev, "ctl read failed: %d, channel %d\n", ret, i);
+			continue;
+		}
+
+		if (!(ctl & ADC_TM5_M_MEAS_EN))
+			continue;
+
+		lower_set = (status_low & BIT(ch)) &&
+			(ctl & ADC_TM5_M_LOW_THR_INT_EN);
+
+		upper_set = (status_high & BIT(ch)) &&
+			(ctl & ADC_TM5_M_HIGH_THR_INT_EN);
+
+		if (upper_set || lower_set)
+			thermal_zone_device_update(chip->channels[i].tzd,
+						   THERMAL_EVENT_UNSPECIFIED);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int adc_tm5_get_temp(void *data, int *temp)
+{
+	struct adc_tm5_channel *channel = data;
+	int ret;
+
+	if (!channel || !channel->iio)
+		return -EINVAL;
+
+	ret = iio_read_channel_processed(channel->iio, temp);
+	if (ret < 0)
+		return ret;
+
+	if (ret != IIO_VAL_INT)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int adc_tm5_disable_channel(struct adc_tm5_channel *channel)
+{
+	struct adc_tm5_chip *chip = channel->chip;
+	unsigned int reg = ADC_TM5_M_EN(channel->channel);
+
+	return adc_tm5_reg_update(chip, reg,
+				  ADC_TM5_M_MEAS_EN |
+				  ADC_TM5_M_HIGH_THR_INT_EN |
+				  ADC_TM5_M_LOW_THR_INT_EN,
+				  0);
+}
+
+static int adc_tm5_enable(struct adc_tm5_chip *chip)
+{
+	int ret;
+	u8 data;
+
+	data = ADC_TM_EN;
+	ret = adc_tm5_write(chip, ADC_TM_EN_CTL1, &data, sizeof(data));
+	if (ret < 0) {
+		dev_err(chip->dev, "adc-tm enable failed\n");
+		return ret;
+	}
+
+	data = ADC_TM_CONV_REQ_EN;
+	ret = adc_tm5_write(chip, ADC_TM_CONV_REQ, &data, sizeof(data));
+	if (ret < 0) {
+		dev_err(chip->dev, "adc-tm request conversion failed\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int adc_tm5_configure(struct adc_tm5_channel *channel, int low, int high)
+{
+	struct adc_tm5_chip *chip = channel->chip;
+	u8 buf[8];
+	u16 reg = ADC_TM5_M_ADC_CH_SEL_CTL(channel->channel);
+	int ret;
+
+	ret = adc_tm5_read(chip, reg, buf, sizeof(buf));
+	if (ret) {
+		dev_err(chip->dev, "channel %d params read failed: %d\n", channel->channel, ret);
+		return ret;
+	}
+
+	buf[0] = channel->adc_channel;
+
+	/* High temperature corresponds to low voltage threshold */
+	if (high != INT_MAX) {
+		u16 adc_code = qcom_adc_tm5_temp_volt_scale(channel->prescale,
+				chip->data->full_scale_code_volt, high);
+
+		buf[1] = adc_code & 0xff;
+		buf[2] = adc_code >> 8;
+		buf[7] |= ADC_TM5_M_LOW_THR_INT_EN;
+	} else {
+		buf[7] &= ~ADC_TM5_M_LOW_THR_INT_EN;
+	}
+
+	/* Low temperature corresponds to high voltage threshold */
+	if (low != -INT_MAX) {
+		u16 adc_code = qcom_adc_tm5_temp_volt_scale(channel->prescale,
+				chip->data->full_scale_code_volt, low);
+
+		buf[3] = adc_code & 0xff;
+		buf[4] = adc_code >> 8;
+		buf[7] |= ADC_TM5_M_HIGH_THR_INT_EN;
+	} else {
+		buf[7] &= ~ADC_TM5_M_HIGH_THR_INT_EN;
+	}
+
+	buf[5] = ADC5_TIMER_SEL_2;
+
+	/* Set calibration select, hw_settle delay */
+	buf[6] &= ~ADC_TM5_M_CTL_HW_SETTLE_DELAY_MASK;
+	buf[6] |= FIELD_PREP(ADC_TM5_M_CTL_HW_SETTLE_DELAY_MASK, channel->hw_settle_time);
+	buf[6] &= ~ADC_TM5_M_CTL_CAL_SEL_MASK;
+	buf[6] |= FIELD_PREP(ADC_TM5_M_CTL_CAL_SEL_MASK, channel->cal_method);
+
+	buf[7] |= ADC_TM5_M_MEAS_EN;
+
+	ret = adc_tm5_write(chip, reg, buf, sizeof(buf));
+	if (ret) {
+		dev_err(chip->dev, "channel %d params write failed: %d\n", channel->channel, ret);
+		return ret;
+	}
+
+	return adc_tm5_enable(chip);
+}
+
+static int adc_tm5_set_trips(void *data, int low, int high)
+{
+	struct adc_tm5_channel *channel = data;
+	struct adc_tm5_chip *chip;
+	int ret;
+
+	if (!channel)
+		return -EINVAL;
+
+	chip = channel->chip;
+	dev_dbg(chip->dev, "%d:low(mdegC):%d, high(mdegC):%d\n",
+		channel->channel, low, high);
+
+	if (high == INT_MAX && low <= -INT_MAX)
+		ret = adc_tm5_disable_channel(channel);
+	else
+		ret = adc_tm5_configure(channel, low, high);
+
+	return ret;
+}
+
+static struct thermal_zone_of_device_ops adc_tm5_ops = {
+	.get_temp = adc_tm5_get_temp,
+	.set_trips = adc_tm5_set_trips,
+};
+
+static int adc_tm5_register_tzd(struct adc_tm5_chip *adc_tm)
+{
+	unsigned int i;
+	struct thermal_zone_device *tzd;
+
+	for (i = 0; i < adc_tm->nchannels; i++) {
+		adc_tm->channels[i].chip = adc_tm;
+
+		tzd = devm_thermal_zone_of_sensor_register(adc_tm->dev,
+							   adc_tm->channels[i].channel,
+							   &adc_tm->channels[i],
+							   &adc_tm5_ops);
+		if (IS_ERR(tzd)) {
+			dev_err(adc_tm->dev, "Error registering TZ zone for channel %d: %ld\n",
+				adc_tm->channels[i].channel, PTR_ERR(tzd));
+			return PTR_ERR(tzd);
+		}
+		adc_tm->channels[i].tzd = tzd;
+	}
+
+	return 0;
+}
+
+static int adc_tm5_init(struct adc_tm5_chip *chip)
+{
+	u8 buf[4], channels_available;
+	int ret;
+	unsigned int i;
+
+	for (i = 0; i < chip->nchannels; i++) {
+		if (chip->channels[i].channel >= channels_available) {
+			dev_err(chip->dev, "Invalid channel %d\n", chip->channels[i].channel);
+			return -EINVAL;
+		}
+	}
+
+	ret = adc_tm5_read(chip, ADC_TM5_NUM_BTM,
+			   &channels_available, sizeof(channels_available));
+	if (ret) {
+		dev_err(chip->dev, "read failed for BTM channels\n");
+		return ret;
+	}
+
+	buf[0] = chip->decimation;
+	buf[1] = chip->avg_samples | ADC_TM5_FAST_AVG_EN;
+	buf[2] = ADC_TM5_TIMER1;
+	buf[3] = FIELD_PREP(ADC_TM5_MEAS_INTERVAL_CTL2_MASK, ADC_TM5_TIMER2) |
+		 FIELD_PREP(ADC_TM5_MEAS_INTERVAL_CTL3_MASK, ADC_TM5_TIMER3);
+
+	ret = adc_tm5_write(chip, ADC_TM5_ADC_DIG_PARAM, buf, sizeof(buf));
+	if (ret) {
+		dev_err(chip->dev, "block write failed: %d\n", ret);
+		return ret;
+	}
+
+	return ret;
+}
+
+static int adc_tm5_get_dt_channel_data(struct adc_tm5_chip *adc_tm,
+				       struct adc_tm5_channel *channel,
+				       struct device_node *node)
+{
+	const char *name = node->name;
+	u32 chan, value, varr[2];
+	int ret;
+	struct device *dev = adc_tm->dev;
+	struct of_phandle_args args;
+
+	ret = of_property_read_u32(node, "reg", &chan);
+	if (ret) {
+		dev_err(dev, "%s: invalid channel number %d\n", name, ret);
+		return ret;
+	}
+
+	if (chan >= ADC_TM5_NUM_CHANNELS) {
+		dev_err(dev, "%s: channel number too big: %d\n", name, chan);
+		return -EINVAL;
+	}
+
+	channel->channel = chan;
+
+	/*
+	 * We are tied to PMIC's ADC controller, which always use single
+	 * argument for channel number.  So don't bother parsing
+	 * #io-channel-cells, just enforce cell_count = 1.
+	 */
+	ret = of_parse_phandle_with_fixed_args(node, "io-channels", 1, 0, &args);
+	if (ret < 0) {
+		dev_err(dev, "%s: error parsing ADC channel number %d: %d\n", name, chan, ret);
+		return ret;
+	}
+	of_node_put(args.np);
+
+	if (args.args_count != 1 || args.args[0] >= ADC5_MAX_CHANNEL) {
+		dev_err(dev, "%s: invalid ADC channel number %d\n", name, chan);
+		return ret;
+	}
+	channel->adc_channel = args.args[0];
+
+	channel->iio = devm_of_iio_channel_get_by_name(adc_tm->dev, node, NULL);
+	if (IS_ERR(channel->iio)) {
+		ret = PTR_ERR(channel->iio);
+		if (ret != -EPROBE_DEFER)
+			dev_err(dev, "%s: error getting channel: %d\n", name, ret);
+		return ret;
+	}
+
+	ret = of_property_read_u32_array(node, "qcom,pre-scaling", varr, 2);
+	if (!ret) {
+		ret = qcom_adc5_prescaling_from_dt(varr[0], varr[1]);
+		if (ret < 0) {
+			dev_err(dev, "%s: invalid pre-scaling <%d %d>\n",
+				name, varr[0], varr[1]);
+			return ret;
+		}
+		channel->prescale = ret;
+	} else {
+		/* 1:1 prescale is index 0 */
+		channel->prescale = 0;
+	}
+
+	ret = of_property_read_u32(node, "qcom,hw-settle-time-us", &value);
+	if (!ret) {
+		ret = qcom_adc5_hw_settle_time_from_dt(value, adc_tm->data->hw_settle);
+		if (ret < 0) {
+			dev_err(dev, "%s invalid hw-settle-time-us %d us\n",
+				name, value);
+			return ret;
+		}
+		channel->hw_settle_time = ret;
+	} else {
+		channel->hw_settle_time = VADC_DEF_HW_SETTLE_TIME;
+	}
+
+	if (of_property_read_bool(node, "qcom,ratiometric"))
+		channel->cal_method = ADC_TM5_RATIOMETRIC_CAL;
+	else
+		channel->cal_method = ADC_TM5_ABSOLUTE_CAL;
+
+	return 0;
+}
+
+static int adc_tm5_get_dt_data(struct adc_tm5_chip *adc_tm, struct device_node *node)
+{
+	struct adc_tm5_channel *channels;
+	struct device_node *child;
+	u32 value;
+	int ret;
+	struct device *dev = adc_tm->dev;
+
+	adc_tm->nchannels = of_get_available_child_count(node);
+	if (!adc_tm->nchannels)
+		return -EINVAL;
+
+	adc_tm->channels = devm_kcalloc(dev, adc_tm->nchannels,
+					sizeof(*adc_tm->channels), GFP_KERNEL);
+	if (!adc_tm->channels)
+		return -ENOMEM;
+
+	channels = adc_tm->channels;
+
+	adc_tm->data = of_device_get_match_data(dev);
+	if (!adc_tm->data)
+		adc_tm->data = &adc_tm5_data_pmic;
+
+	ret = of_property_read_u32(node, "qcom,decimation", &value);
+	if (!ret) {
+		ret = qcom_adc5_decimation_from_dt(value, adc_tm->data->decimation);
+		if (ret < 0) {
+			dev_err(dev, "invalid decimation %d\n", value);
+			return ret;
+		}
+		adc_tm->decimation = ret;
+	} else {
+		adc_tm->decimation = ADC5_DECIMATION_DEFAULT;
+	}
+
+	ret = of_property_read_u32(node, "qcom,avg-samples", &value);
+	if (!ret) {
+		ret = qcom_adc5_avg_samples_from_dt(value);
+		if (ret < 0) {
+			dev_err(dev, "invalid avg-samples %d\n", value);
+			return ret;
+		}
+		adc_tm->avg_samples = ret;
+	} else {
+		adc_tm->avg_samples = VADC_DEF_AVG_SAMPLES;
+	}
+
+	for_each_available_child_of_node(node, child) {
+		ret = adc_tm5_get_dt_channel_data(adc_tm, channels, child);
+		if (ret) {
+			of_node_put(child);
+			return ret;
+		}
+
+		channels++;
+	}
+
+	return 0;
+}
+
+static int adc_tm5_probe(struct platform_device *pdev)
+{
+	struct device_node *node = pdev->dev.of_node;
+	struct device *dev = &pdev->dev;
+	struct adc_tm5_chip *adc_tm;
+	struct regmap *regmap;
+	int ret, irq;
+	u32 reg;
+
+	regmap = dev_get_regmap(dev->parent, NULL);
+	if (!regmap)
+		return -ENODEV;
+
+	ret = of_property_read_u32(node, "reg", &reg);
+	if (ret)
+		return ret;
+
+	adc_tm = devm_kzalloc(&pdev->dev, sizeof(*adc_tm), GFP_KERNEL);
+	if (!adc_tm)
+		return -ENOMEM;
+
+	adc_tm->regmap = regmap;
+	adc_tm->dev = dev;
+	adc_tm->base = reg;
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(dev, "get_irq failed: %d\n", irq);
+		return irq;
+	}
+
+	ret = adc_tm5_get_dt_data(adc_tm, node);
+	if (ret) {
+		dev_err(dev, "get dt data failed: %d\n", ret);
+		return ret;
+	}
+
+	ret = adc_tm5_init(adc_tm);
+	if (ret) {
+		dev_err(dev, "adc-tm init failed\n");
+		return ret;
+	}
+
+	ret = adc_tm5_register_tzd(adc_tm);
+	if (ret) {
+		dev_err(dev, "tzd register failed\n");
+		return ret;
+	}
+
+	return devm_request_threaded_irq(dev, irq, NULL, adc_tm5_isr,
+					 IRQF_ONESHOT, "pm-adc-tm5", adc_tm);
+}
+
+static const struct of_device_id adc_tm5_match_table[] = {
+	{
+		.compatible = "qcom,spmi-adc-tm5",
+		.data = &adc_tm5_data_pmic,
+	},
+	{ }
+};
+MODULE_DEVICE_TABLE(of, adc_tm5_match_table);
+
+static struct platform_driver adc_tm5_driver = {
+	.driver = {
+		.name = "qcom-spmi-adc-tm5",
+		.of_match_table = adc_tm5_match_table,
+	},
+	.probe = adc_tm5_probe,
+};
+module_platform_driver(adc_tm5_driver);
+
+MODULE_DESCRIPTION("SPMI PMIC Thermal Monitor ADC driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/iio/adc/qcom-vadc-common.h b/include/linux/iio/adc/qcom-vadc-common.h
index 58216124d89d..33f60f43e1aa 100644
--- a/include/linux/iio/adc/qcom-vadc-common.h
+++ b/include/linux/iio/adc/qcom-vadc-common.h
@@ -158,6 +158,9 @@ int qcom_adc5_hw_scale(enum vadc_scale_fn_type scaletype,
 		    const struct adc5_data *data,
 		    u16 adc_code, int *result_mdec);
 
+u16 qcom_adc_tm5_temp_volt_scale(unsigned int prescale_ratio,
+				 u32 full_scale_code_volt, int temp);
+
 int qcom_adc5_prescaling_from_dt(u32 num, u32 den);
 
 int qcom_adc5_hw_settle_time_from_dt(u32 value, const unsigned int *hw_settle);
-- 
cgit v1.2.3


From d0a0bbe7b0a181c58bd22d6942146cfa3ab9e49a Mon Sep 17 00:00:00 2001
From: Tong Zhang <ztong0001@gmail.com>
Date: Sun, 14 Feb 2021 18:43:08 -0500
Subject: atm: idt77252: fix build broken on amd64

idt77252 is broken and wont load on amd64 systems
  modprobe idt77252 shows the following

    idt77252_init: skb->cb is too small (48 < 56)

  Add packed attribute to struct idt77252_skb_prv and struct atm_skb_data
  so that the total size can be <= sizeof(skb->cb)
  Also convert runtime size check to buildtime size check in
  idt77252_init()

Signed-off-by: Tong Zhang <ztong0001@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/atm/idt77252.c | 11 +----------
 drivers/atm/idt77252.h |  2 +-
 include/linux/atmdev.h |  2 +-
 3 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c
index 5f0472c18bcb..0c13cac903de 100644
--- a/drivers/atm/idt77252.c
+++ b/drivers/atm/idt77252.c
@@ -3743,16 +3743,7 @@ static int __init idt77252_init(void)
 	struct sk_buff *skb;
 
 	printk("%s: at %p\n", __func__, idt77252_init);
-
-	if (sizeof(skb->cb) < sizeof(struct atm_skb_data) +
-			      sizeof(struct idt77252_skb_prv)) {
-		printk(KERN_ERR "%s: skb->cb is too small (%lu < %lu)\n",
-		       __func__, (unsigned long) sizeof(skb->cb),
-		       (unsigned long) sizeof(struct atm_skb_data) +
-				       sizeof(struct idt77252_skb_prv));
-		return -EIO;
-	}
-
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct idt77252_skb_prv) + sizeof(struct atm_skb_data));
 	return pci_register_driver(&idt77252_driver);
 }
 
diff --git a/drivers/atm/idt77252.h b/drivers/atm/idt77252.h
index 9339197d701c..b059d31364dd 100644
--- a/drivers/atm/idt77252.h
+++ b/drivers/atm/idt77252.h
@@ -789,7 +789,7 @@ struct idt77252_skb_prv {
 	struct scqe	tbd;	/* Transmit Buffer Descriptor */
 	dma_addr_t	paddr;	/* DMA handle */
 	u32		pool;	/* sb_pool handle */
-};
+} __packed;
 
 #define IDT77252_PRV_TBD(skb)	\
 	(((struct idt77252_skb_prv *)(ATM_SKB(skb)+1))->tbd)
diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index d7493016cd46..60cd25c0461b 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -207,7 +207,7 @@ struct atm_skb_data {
 	struct atm_vcc	*vcc;		/* ATM VCC */
 	unsigned long	atm_options;	/* ATM layer options */
 	unsigned int	acct_truesize;  /* truesize accounted to vcc */
-};
+} __packed;
 
 #define VCC_HTABLE_SIZE 32
 
-- 
cgit v1.2.3


From 17d3a83afbbff34209d6c3636718fc1abe305ef8 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 12 Feb 2021 19:46:31 -0800
Subject: net: phy: broadcom: Remove unused flags

We have a number of unused flags defined today and since we are scarce
on space and may need to introduce new flags in the future remove and
shift every existing flag down into a contiguous assignment.
PHY_BCM_FLAGS_MODE_1000BX was only used internally for the BCM54616S
PHY, so we allocate a driver private structure instead to store that
flag instead of canibalizing one from phydev->dev_flags for that
purpose.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 19 ++++++++++++++++---
 include/linux/brcmphy.h    | 21 ++++++++-------------
 2 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 4142f69c1530..3ce266ab521b 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -381,10 +381,21 @@ static int bcm5481_config_aneg(struct phy_device *phydev)
 	return ret;
 }
 
+struct bcm54616s_phy_priv {
+	bool mode_1000bx_en;
+};
+
 static int bcm54616s_probe(struct phy_device *phydev)
 {
+	struct bcm54616s_phy_priv *priv;
 	int val, intf_sel;
 
+	priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	phydev->priv = priv;
+
 	val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE);
 	if (val < 0)
 		return val;
@@ -407,7 +418,7 @@ static int bcm54616s_probe(struct phy_device *phydev)
 		 * 1000BASE-X configuration.
 		 */
 		if (!(val & BCM54616S_100FX_MODE))
-			phydev->dev_flags |= PHY_BCM_FLAGS_MODE_1000BX;
+			priv->mode_1000bx_en = true;
 
 		phydev->port = PORT_FIBRE;
 	}
@@ -417,10 +428,11 @@ static int bcm54616s_probe(struct phy_device *phydev)
 
 static int bcm54616s_config_aneg(struct phy_device *phydev)
 {
+	struct bcm54616s_phy_priv *priv = phydev->priv;
 	int ret;
 
 	/* Aneg firstly. */
-	if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX)
+	if (priv->mode_1000bx_en)
 		ret = genphy_c37_config_aneg(phydev);
 	else
 		ret = genphy_config_aneg(phydev);
@@ -433,9 +445,10 @@ static int bcm54616s_config_aneg(struct phy_device *phydev)
 
 static int bcm54616s_read_status(struct phy_device *phydev)
 {
+	struct bcm54616s_phy_priv *priv = phydev->priv;
 	int err;
 
-	if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX)
+	if (priv->mode_1000bx_en)
 		err = genphy_c37_read_status(phydev);
 	else
 		err = genphy_read_status(phydev);
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index de9430d55c90..844dcfe789a2 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -61,19 +61,14 @@
 #define PHY_BCM_OUI_5			0x03625e00
 #define PHY_BCM_OUI_6			0xae025000
 
-#define PHY_BCM_FLAGS_MODE_COPPER	0x00000001
-#define PHY_BCM_FLAGS_MODE_1000BX	0x00000002
-#define PHY_BCM_FLAGS_INTF_SGMII	0x00000010
-#define PHY_BCM_FLAGS_INTF_XAUI		0x00000020
-#define PHY_BRCM_WIRESPEED_ENABLE	0x00000100
-#define PHY_BRCM_AUTO_PWRDWN_ENABLE	0x00000200
-#define PHY_BRCM_RX_REFCLK_UNUSED	0x00000400
-#define PHY_BRCM_STD_IBND_DISABLE	0x00000800
-#define PHY_BRCM_EXT_IBND_RX_ENABLE	0x00001000
-#define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00002000
-#define PHY_BRCM_CLEAR_RGMII_MODE	0x00004000
-#define PHY_BRCM_DIS_TXCRXC_NOENRGY	0x00008000
-#define PHY_BRCM_EN_MASTER_MODE		0x00010000
+#define PHY_BRCM_AUTO_PWRDWN_ENABLE	0x00000001
+#define PHY_BRCM_RX_REFCLK_UNUSED	0x00000002
+#define PHY_BRCM_STD_IBND_DISABLE	0x00000004
+#define PHY_BRCM_EXT_IBND_RX_ENABLE	0x00000008
+#define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00000010
+#define PHY_BRCM_CLEAR_RGMII_MODE	0x00000020
+#define PHY_BRCM_DIS_TXCRXC_NOENRGY	0x00000040
+#define PHY_BRCM_EN_MASTER_MODE		0x00000080
 
 /* Broadcom BCM7xxx specific workarounds */
 #define PHY_BRCM_7XXX_REV(x)		(((x) >> 8) & 0xff)
-- 
cgit v1.2.3


From 5d4358ede8ebe2e4ae03a633082f3ce21ec2df3e Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 12 Feb 2021 19:46:32 -0800
Subject: net: phy: broadcom: Allow BCM54210E to configure APD

BCM54210E/BCM50212E has been verified to work correctly with the
auto-power down configuration done by bcm54xx_adjust_rxrefclk(), add it
to the list of PHYs working.

While we are at it, provide an appropriate name for the bit we are
changing which disables the RXC and TXC during auto-power down when
there is no energy on the cable.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 8 +++++---
 include/linux/brcmphy.h    | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 3ce266ab521b..91fbd26c809e 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -193,6 +193,7 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev)
 	if (BRCM_PHY_MODEL(phydev) != PHY_ID_BCM57780 &&
 	    BRCM_PHY_MODEL(phydev) != PHY_ID_BCM50610 &&
 	    BRCM_PHY_MODEL(phydev) != PHY_ID_BCM50610M &&
+	    BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54210E &&
 	    BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54810 &&
 	    BRCM_PHY_MODEL(phydev) != PHY_ID_BCM54811)
 		return;
@@ -227,9 +228,10 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev)
 		val |= BCM54XX_SHD_SCR3_DLLAPD_DIS;
 
 	if (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY) {
-		if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810 ||
-		    BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54811)
-			val |= BCM54810_SHD_SCR3_TRDDAPD;
+		if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54210E ||
+		    BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810 ||
+		    BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54210E)
+			val |= BCM54XX_SHD_SCR3_RXCTXC_DIS;
 		else
 			val |= BCM54XX_SHD_SCR3_TRDDAPD;
 	}
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 844dcfe789a2..16597d3fa011 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -193,6 +193,7 @@
 #define  BCM54XX_SHD_SCR3_DEF_CLK125	0x0001
 #define  BCM54XX_SHD_SCR3_DLLAPD_DIS	0x0002
 #define  BCM54XX_SHD_SCR3_TRDDAPD	0x0004
+#define  BCM54XX_SHD_SCR3_RXCTXC_DIS	0x0100
 
 /* 01010: Auto Power-Down */
 #define BCM54XX_SHD_APD			0x0a
@@ -253,7 +254,6 @@
 #define BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN	(1 << 0)
 #define BCM54810_SHD_CLK_CTL			0x3
 #define BCM54810_SHD_CLK_CTL_GTXCLK_EN		(1 << 9)
-#define BCM54810_SHD_SCR3_TRDDAPD		0x0100
 
 /* BCM54612E Registers */
 #define BCM54612E_EXP_SPARE0		(MII_BCM54XX_EXP_SEL_ETC + 0x34)
-- 
cgit v1.2.3


From 93e8990c24bee30696c02e8f6aed043333491a25 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sun, 14 Feb 2021 15:16:23 +0100
Subject: net: phy: rename PHY_IGNORE_INTERRUPT to PHY_MAC_INTERRUPT

Some internal PHY's have their events like link change reported by the
MAC interrupt. We have PHY_IGNORE_INTERRUPT to deal with this scenario.
I'm not too happy with this name. We don't ignore interrupts, typically
there is no interrupt exposed at a PHY level. So let's rename it to
PHY_MAC_INTERRUPT. This is in line with phy_mac_interrupt(), which is
called from the MAC interrupt handler to handle PHY events.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/phy.rst                |  2 +-
 drivers/net/ethernet/broadcom/genet/bcmmii.c    |  2 +-
 drivers/net/ethernet/realtek/r8169_main.c       |  2 +-
 drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c |  4 ++--
 drivers/net/mdio/mdio-moxart.c                  |  4 ++--
 drivers/net/phy/icplus.c                        |  2 +-
 drivers/net/phy/phy.c                           |  2 +-
 drivers/net/phy/phy_device.c                    |  4 ++--
 include/linux/phy.h                             | 10 +++++-----
 9 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst
index 399f17976a6c..70136cc9e25e 100644
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -216,7 +216,7 @@ put into an unsupported state.
 Lastly, once the controller is ready to handle network traffic, you call
 phy_start(phydev).  This tells the PAL that you are ready, and configures the
 PHY to connect to the network. If the MAC interrupt of your network driver
-also handles PHY status changes, just set phydev->irq to PHY_IGNORE_INTERRUPT
+also handles PHY status changes, just set phydev->irq to PHY_MAC_INTERRUPT
 before you call phy_start and use phy_mac_interrupt() from the network
 driver. If you don't want to use interrupts, set phydev->irq to PHY_POLL.
 phy_start() enables the PHY interrupts (if applicable) and starts the
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 17f997ef950f..5335244e4577 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -359,7 +359,7 @@ int bcmgenet_mii_probe(struct net_device *dev)
 	 * those versions of GENET.
 	 */
 	if (priv->internal_phy && !GENET_IS_V5(priv))
-		dev->phydev->irq = PHY_IGNORE_INTERRUPT;
+		dev->phydev->irq = PHY_MAC_INTERRUPT;
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 44a438bad153..cbc30df4e08a 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -5056,7 +5056,7 @@ static int r8169_mdio_register(struct rtl8169_private *tp)
 	new_bus->name = "r8169";
 	new_bus->priv = tp;
 	new_bus->parent = &pdev->dev;
-	new_bus->irq[0] = PHY_IGNORE_INTERRUPT;
+	new_bus->irq[0] = PHY_MAC_INTERRUPT;
 	snprintf(new_bus->id, MII_BUS_ID_SIZE, "r8169-%x", pci_dev_id(pdev));
 
 	new_bus->read = r8169_mdio_read_reg;
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c
index b1e7f7ab281c..fceb6d637235 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_mdio.c
@@ -203,8 +203,8 @@ int sxgbe_mdio_register(struct net_device *ndev)
 			case PHY_POLL:
 				irq_str = "POLL";
 				break;
-			case PHY_IGNORE_INTERRUPT:
-				irq_str = "IGNORE";
+			case PHY_MAC_INTERRUPT:
+				irq_str = "MAC";
 				break;
 			default:
 				sprintf(irq_num, "%d", phy->irq);
diff --git a/drivers/net/mdio/mdio-moxart.c b/drivers/net/mdio/mdio-moxart.c
index b72c6d185175..f0cff584e176 100644
--- a/drivers/net/mdio/mdio-moxart.c
+++ b/drivers/net/mdio/mdio-moxart.c
@@ -125,7 +125,7 @@ static int moxart_mdio_probe(struct platform_device *pdev)
 	snprintf(bus->id, MII_BUS_ID_SIZE, "%s-%d-mii", pdev->name, pdev->id);
 	bus->parent = &pdev->dev;
 
-	/* Setting PHY_IGNORE_INTERRUPT here even if it has no effect,
+	/* Setting PHY_MAC_INTERRUPT here even if it has no effect,
 	 * of_mdiobus_register() sets these PHY_POLL.
 	 * Ideally, the interrupt from MAC controller could be used to
 	 * detect link state changes, not polling, i.e. if there was
@@ -133,7 +133,7 @@ static int moxart_mdio_probe(struct platform_device *pdev)
 	 * interrupt handled in ethernet drivercode.
 	 */
 	for (i = 0; i < PHY_MAX_ADDR; i++)
-		bus->irq[i] = PHY_IGNORE_INTERRUPT;
+		bus->irq[i] = PHY_MAC_INTERRUPT;
 
 	data = bus->priv;
 	data->base = devm_platform_ioremap_resource(pdev, 0);
diff --git a/drivers/net/phy/icplus.c b/drivers/net/phy/icplus.c
index 4e15d4d02488..3e431737c1ba 100644
--- a/drivers/net/phy/icplus.c
+++ b/drivers/net/phy/icplus.c
@@ -188,7 +188,7 @@ static int ip175c_read_status(struct phy_device *phydev)
 		genphy_read_status(phydev);
 	else
 		/* Don't need to read status for switch ports */
-		phydev->irq = PHY_IGNORE_INTERRUPT;
+		phydev->irq = PHY_MAC_INTERRUPT;
 
 	return 0;
 }
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index fdb914b5b857..1be07e45d314 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1145,7 +1145,7 @@ void phy_state_machine(struct work_struct *work)
 	}
 
 	/* Only re-schedule a PHY state machine change if we are polling the
-	 * PHY, if PHY_IGNORE_INTERRUPT is set, then we will be moving
+	 * PHY, if PHY_MAC_INTERRUPT is set, then we will be moving
 	 * between states from phy_mac_interrupt().
 	 *
 	 * In state PHY_HALTED the PHY gets suspended, so rescheduling the
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 30a20a29ae05..05261698bf74 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1167,8 +1167,8 @@ char *phy_attached_info_irq(struct phy_device *phydev)
 	case PHY_POLL:
 		irq_str = "POLL";
 		break;
-	case PHY_IGNORE_INTERRUPT:
-		irq_str = "IGNORE";
+	case PHY_MAC_INTERRUPT:
+		irq_str = "MAC";
 		break;
 	default:
 		snprintf(irq_num, sizeof(irq_num), "%d", phydev->irq);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index c130788306c8..5d7c4084ade9 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -71,11 +71,11 @@ extern const int phy_10gbit_features_array[1];
 
 /*
  * Set phydev->irq to PHY_POLL if interrupts are not supported,
- * or not desired for this PHY.  Set to PHY_IGNORE_INTERRUPT if
- * the attached driver handles the interrupt
+ * or not desired for this PHY.  Set to PHY_MAC_INTERRUPT if
+ * the attached MAC driver handles the interrupt
  */
 #define PHY_POLL		-1
-#define PHY_IGNORE_INTERRUPT	-2
+#define PHY_MAC_INTERRUPT	-2
 
 #define PHY_IS_INTERNAL		0x00000001
 #define PHY_RST_AFTER_CLK_EN	0x00000002
@@ -1202,11 +1202,11 @@ static inline int phy_clear_bits_mmd(struct phy_device *phydev, int devad,
  * @phydev: the phy_device struct
  *
  * NOTE: must be kept in sync with addition/removal of PHY_POLL and
- * PHY_IGNORE_INTERRUPT
+ * PHY_MAC_INTERRUPT
  */
 static inline bool phy_interrupt_is_valid(struct phy_device *phydev)
 {
-	return phydev->irq != PHY_POLL && phydev->irq != PHY_IGNORE_INTERRUPT;
+	return phydev->irq != PHY_POLL && phydev->irq != PHY_MAC_INTERRUPT;
 }
 
 /**
-- 
cgit v1.2.3


From aec6c60a01d3a3170242d6a99372a388e1136dc6 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sat, 16 Jan 2021 08:35:42 +0900
Subject: kbuild: check the minimum compiler version in Kconfig

Paul Gortmaker reported a regression in the GCC version check. [1]
If you use GCC 4.8, the build breaks before showing the error message
"error Sorry, your version of GCC is too old - please use 4.9 or newer."

I do not want to apply his fix-up since it implies we would not be able
to remove any cc-option test. Anyway, I admit checking the GCC version
in <linux/compiler-gcc.h> is too late.

Almost at the same time, Linus also suggested to move the compiler
version error to Kconfig time. [2]

I unified the two similar scripts, gcc-version.sh and clang-version.sh
into cc-version.sh. The old scripts invoked the compiler multiple times
(3 times for gcc-version.sh, 4 times for clang-version.sh). I refactored
the code so the new one invokes the compiler just once, and also tried
my best to use shell-builtin commands where possible.

The new script runs faster.

  $ time ./scripts/clang-version.sh clang
  120000

  real    0m0.029s
  user    0m0.012s
  sys     0m0.021s

  $ time ./scripts/cc-version.sh clang
  Clang 120000

  real    0m0.009s
  user    0m0.006s
  sys     0m0.004s

cc-version.sh also shows an error message if the compiler is too old:

  $ make defconfig CC=clang-9
  *** Default configuration is based on 'x86_64_defconfig'
  ***
  *** Compiler is too old.
  ***   Your Clang version:    9.0.1
  ***   Minimum Clang version: 10.0.1
  ***
  scripts/Kconfig.include:46: Sorry, this compiler is not supported.
  make[1]: *** [scripts/kconfig/Makefile:81: defconfig] Error 1
  make: *** [Makefile:602: defconfig] Error 2

The new script takes care of ICC because we have <linux/compiler-intel.h>
although I am not sure if building the kernel with ICC is well-supported.

[1]: https://lore.kernel.org/r/20210110190807.134996-1-paul.gortmaker@windriver.com
[2]: https://lore.kernel.org/r/CAHk-=wh-+TMHPTFo1qs-MYyK7tZh-OQovA=pP3=e06aCVp6_kA@mail.gmail.com

Fixes: 87de84c9140e ("kbuild: remove cc-option test of -Werror=date-time")
Reported-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Nathan Chancellor <natechancellor@gmail.com>
Tested-by: Nathan Chancellor <natechancellor@gmail.com>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Tested-by: Miguel Ojeda <ojeda@kernel.org>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 MAINTAINERS                    |  1 -
 include/linux/compiler-clang.h | 10 ------
 include/linux/compiler-gcc.h   | 11 ------
 init/Kconfig                   |  9 ++---
 scripts/Kconfig.include        |  6 ++++
 scripts/cc-version.sh          | 82 ++++++++++++++++++++++++++++++++++++++++++
 scripts/clang-version.sh       | 19 ----------
 scripts/gcc-version.sh         | 20 -----------
 8 files changed, 93 insertions(+), 65 deletions(-)
 create mode 100755 scripts/cc-version.sh
 delete mode 100755 scripts/clang-version.sh
 delete mode 100755 scripts/gcc-version.sh

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 667d03852191..df820969be1f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4314,7 +4314,6 @@ C:	irc://chat.freenode.net/clangbuiltlinux
 F:	Documentation/kbuild/llvm.rst
 F:	include/linux/compiler-clang.h
 F:	scripts/clang-tools/
-F:	scripts/clang-version.sh
 F:	scripts/lld-version.sh
 K:	\b(?i:clang|llvm)\b
 
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 98cff1b4b088..04c0a5a717f7 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -3,16 +3,6 @@
 #error "Please don't include <linux/compiler-clang.h> directly, include <linux/compiler.h> instead."
 #endif
 
-#define CLANG_VERSION (__clang_major__ * 10000	\
-		     + __clang_minor__ * 100	\
-		     + __clang_patchlevel__)
-
-#if CLANG_VERSION < 100001
-#ifndef __BPF_TRACING__
-# error Sorry, your version of Clang is too old - please use 10.0.1 or newer.
-#endif
-#endif
-
 /* Compiler specific definitions for Clang compiler */
 
 /* same as gcc, this was present in clang-2.6 so we can assume it works
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 555ab0fddbef..48750243db4c 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -10,17 +10,6 @@
 		     + __GNUC_MINOR__ * 100	\
 		     + __GNUC_PATCHLEVEL__)
 
-/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */
-#if GCC_VERSION < 40900
-# error Sorry, your version of GCC is too old - please use 4.9 or newer.
-#elif defined(CONFIG_ARM64) && GCC_VERSION < 50100
-/*
- * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63293
- * https://lore.kernel.org/r/20210107111841.GN1551@shell.armlinux.org.uk
- */
-# error Sorry, your version of GCC is too old - please use 5.1 or newer.
-#endif
-
 /*
  * This macro obfuscates arithmetic on a variable address so that gcc
  * shouldn't recognize the original var, and make assumptions about it.
diff --git a/init/Kconfig b/init/Kconfig
index 29ad68325028..7bcfa24524c2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -26,11 +26,11 @@ config CC_VERSION_TEXT
 	    and then every file will be rebuilt.
 
 config CC_IS_GCC
-	def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q gcc)
+	def_bool $(success,test "$(cc-name)" = GCC)
 
 config GCC_VERSION
 	int
-	default $(shell,$(srctree)/scripts/gcc-version.sh $(CC)) if CC_IS_GCC
+	default $(cc-version) if CC_IS_GCC
 	default 0
 
 config LD_VERSION
@@ -38,14 +38,15 @@ config LD_VERSION
 	default $(shell,$(LD) --version | $(srctree)/scripts/ld-version.sh)
 
 config CC_IS_CLANG
-	def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q clang)
+	def_bool $(success,test "$(cc-name)" = Clang)
 
 config LD_IS_LLD
 	def_bool $(success,$(LD) -v | head -n 1 | grep -q LLD)
 
 config CLANG_VERSION
 	int
-	default $(shell,$(srctree)/scripts/clang-version.sh $(CC))
+	default $(cc-version) if CC_IS_CLANG
+	default 0
 
 config LLD_VERSION
 	int
diff --git a/scripts/Kconfig.include b/scripts/Kconfig.include
index a5fe72c504ff..0228cb9c74aa 100644
--- a/scripts/Kconfig.include
+++ b/scripts/Kconfig.include
@@ -39,6 +39,12 @@ as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -c -x assembler
 $(error-if,$(failure,command -v $(CC)),compiler '$(CC)' not found)
 $(error-if,$(failure,command -v $(LD)),linker '$(LD)' not found)
 
+# Get the compiler name, version, and error out if it is not supported.
+cc-info := $(shell,$(srctree)/scripts/cc-version.sh $(CC))
+$(error-if,$(success,test -z "$(cc-info)"),Sorry$(comma) this compiler is not supported.)
+cc-name := $(shell,set -- $(cc-info) && echo $1)
+cc-version := $(shell,set -- $(cc-info) && echo $2)
+
 # Fail if the linker is gold as it's not capable of linking the kernel proper
 $(error-if,$(success, $(LD) -v | grep -q gold), gold linker '$(LD)' not supported)
 
diff --git a/scripts/cc-version.sh b/scripts/cc-version.sh
new file mode 100755
index 000000000000..3f2ee885b116
--- /dev/null
+++ b/scripts/cc-version.sh
@@ -0,0 +1,82 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Print the compiler name and its version in a 5 or 6-digit form.
+# Also, perform the minimum version check.
+
+set -e
+
+# When you raise the minimum compiler version, please update
+# Documentation/process/changes.rst as well.
+gcc_min_version=4.9.0
+clang_min_version=10.0.1
+icc_min_version=16.0.3 # temporary
+
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63293
+# https://lore.kernel.org/r/20210107111841.GN1551@shell.armlinux.org.uk
+if [ "$SRCARCH" = arm64 ]; then
+	gcc_min_version=5.1.0
+fi
+
+# Print the compiler name and some version components.
+get_compiler_info()
+{
+	cat <<- EOF | "$@" -E -P -x c - 2>/dev/null
+	#if defined(__clang__)
+	Clang	__clang_major__  __clang_minor__  __clang_patchlevel__
+	#elif defined(__INTEL_COMPILER)
+	ICC	__INTEL_COMPILER  __INTEL_COMPILER_UPDATE
+	#elif defined(__GNUC__)
+	GCC	__GNUC__  __GNUC_MINOR__  __GNUC_PATCHLEVEL__
+	#else
+	unknown
+	#endif
+	EOF
+}
+
+# Convert the version string x.y.z to a canonical 5 or 6-digit form.
+get_canonical_version()
+{
+	IFS=.
+	set -- $1
+	echo $((10000 * $1 + 100 * $2 + $3))
+}
+
+# $@ instead of $1 because multiple words might be given, e.g. CC="ccache gcc".
+orig_args="$@"
+set -- $(get_compiler_info "$@")
+
+name=$1
+
+case "$name" in
+GCC)
+	version=$2.$3.$4
+	min_version=$gcc_min_version
+	;;
+Clang)
+	version=$2.$3.$4
+	min_version=$clang_min_version
+	;;
+ICC)
+	version=$(($2 / 100)).$(($2 % 100)).$3
+	min_version=$icc_min_version
+	;;
+*)
+	echo "$orig_args: unknown compiler" >&2
+	exit 1
+	;;
+esac
+
+cversion=$(get_canonical_version $version)
+min_cversion=$(get_canonical_version $min_version)
+
+if [ "$cversion" -lt "$min_cversion" ]; then
+	echo >&2 "***"
+	echo >&2 "*** Compiler is too old."
+	echo >&2 "***   Your $name version:    $version"
+	echo >&2 "***   Minimum $name version: $min_version"
+	echo >&2 "***"
+	exit 1
+fi
+
+echo $name $cversion
diff --git a/scripts/clang-version.sh b/scripts/clang-version.sh
deleted file mode 100755
index 6fabf0695761..000000000000
--- a/scripts/clang-version.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# clang-version clang-command
-#
-# Print the compiler version of `clang-command' in a 5 or 6-digit form
-# such as `50001' for clang-5.0.1 etc.
-
-compiler="$*"
-
-if ! ( $compiler --version | grep -q clang) ; then
-	echo 0
-	exit 1
-fi
-
-MAJOR=$(echo __clang_major__ | $compiler -E -x c - | tail -n 1)
-MINOR=$(echo __clang_minor__ | $compiler -E -x c - | tail -n 1)
-PATCHLEVEL=$(echo __clang_patchlevel__ | $compiler -E -x c - | tail -n 1)
-printf "%d%02d%02d\\n" $MAJOR $MINOR $PATCHLEVEL
diff --git a/scripts/gcc-version.sh b/scripts/gcc-version.sh
deleted file mode 100755
index ae353432539b..000000000000
--- a/scripts/gcc-version.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# gcc-version gcc-command
-#
-# Print the gcc version of `gcc-command' in a 5 or 6-digit form
-# such as `29503' for gcc-2.95.3, `30301' for gcc-3.3.1, etc.
-
-compiler="$*"
-
-if [ ${#compiler} -eq 0 ]; then
-	echo "Error: No compiler specified." >&2
-	printf "Usage:\n\t$0 <gcc-command>\n" >&2
-	exit 1
-fi
-
-MAJOR=$(echo __GNUC__ | $compiler -E -x c - | tail -n 1)
-MINOR=$(echo __GNUC_MINOR__ | $compiler -E -x c - | tail -n 1)
-PATCHLEVEL=$(echo __GNUC_PATCHLEVEL__ | $compiler -E -x c - | tail -n 1)
-printf "%d%02d%02d\\n" $MAJOR $MINOR $PATCHLEVEL
-- 
cgit v1.2.3


From 88a686728b3739d3598851e729c0e81f194e5c53 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sashal@kernel.org>
Date: Fri, 12 Feb 2021 11:29:24 -0500
Subject: kbuild: simplify access to the kernel's version

Instead of storing the version in a single integer and having various
kernel (and userspace) code how it's constructed, export individual
(major, patchlevel, sublevel) components and simplify kernel code that
uses it.

This should also make it easier on userspace.

Signed-off-by: Sasha Levin <sashal@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 Makefile                                       | 5 ++++-
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 ++--
 drivers/usb/core/hcd.c                         | 4 ++--
 drivers/usb/gadget/udc/aspeed-vhub/hub.c       | 4 ++--
 include/linux/usb/composite.h                  | 4 ++--
 kernel/sys.c                                   | 2 +-
 6 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/Makefile b/Makefile
index 12607d389148..1fdd44fe1659 100644
--- a/Makefile
+++ b/Makefile
@@ -1255,7 +1255,10 @@ define filechk_version.h
 		expr $(VERSION) \* 65536 + 0$(PATCHLEVEL) \* 256 + $(SUBLEVEL)); \
 	fi;                                                              \
 	echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) +  \
-	((c) > 255 ? 255 : (c)))'
+	((c) > 255 ? 255 : (c)))';                                       \
+	echo \#define LINUX_VERSION_MAJOR $(VERSION);                    \
+	echo \#define LINUX_VERSION_PATCHLEVEL $(PATCHLEVEL);            \
+	echo \#define LINUX_VERSION_SUBLEVEL $(SUBLEVEL)
 endef
 
 $(version_h): FORCE
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index ca6f2fc39ea0..29f886263dc5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -235,8 +235,8 @@ static void mlx5_set_driver_version(struct mlx5_core_dev *dev)
 	remaining_size = max_t(int, 0, driver_ver_sz - strlen(string));
 
 	snprintf(string + strlen(string), remaining_size, "%u.%u.%u",
-		 (u8)((LINUX_VERSION_CODE >> 16) & 0xff), (u8)((LINUX_VERSION_CODE >> 8) & 0xff),
-		 (u16)(LINUX_VERSION_CODE & 0xffff));
+		LINUX_VERSION_MAJOR, LINUX_VERSION_PATCHLEVEL,
+		LINUX_VERSION_SUBLEVEL);
 
 	/*Send the command*/
 	MLX5_SET(set_driver_version_in, in, opcode,
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index ad5a0f405a75..3f0381344221 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -111,8 +111,8 @@ DECLARE_WAIT_QUEUE_HEAD(usb_kill_urb_queue);
  */
 
 /*-------------------------------------------------------------------------*/
-#define KERNEL_REL	bin2bcd(((LINUX_VERSION_CODE >> 16) & 0x0ff))
-#define KERNEL_VER	bin2bcd(((LINUX_VERSION_CODE >> 8) & 0x0ff))
+#define KERNEL_REL	bin2bcd(LINUX_VERSION_MAJOR)
+#define KERNEL_VER	bin2bcd(LINUX_VERSION_PATCHLEVEL)
 
 /* usb 3.1 root hub device descriptor */
 static const u8 usb31_rh_dev_descriptor[18] = {
diff --git a/drivers/usb/gadget/udc/aspeed-vhub/hub.c b/drivers/usb/gadget/udc/aspeed-vhub/hub.c
index bfd8e77788e2..5c7dea5e0ff1 100644
--- a/drivers/usb/gadget/udc/aspeed-vhub/hub.c
+++ b/drivers/usb/gadget/udc/aspeed-vhub/hub.c
@@ -46,8 +46,8 @@
  *    - Make vid/did overridable
  *    - make it look like usb1 if usb1 mode forced
  */
-#define KERNEL_REL	bin2bcd(((LINUX_VERSION_CODE >> 16) & 0x0ff))
-#define KERNEL_VER	bin2bcd(((LINUX_VERSION_CODE >> 8) & 0x0ff))
+#define KERNEL_REL	bin2bcd(LINUX_VERSION_MAJOR)
+#define KERNEL_VER	bin2bcd(LINUX_VERSION_PATCHLEVEL)
 
 enum {
 	AST_VHUB_STR_INDEX_MAX = 4,
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index a2d229ab63ba..7531ce723374 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -573,8 +573,8 @@ static inline u16 get_default_bcdDevice(void)
 {
 	u16 bcdDevice;
 
-	bcdDevice = bin2bcd((LINUX_VERSION_CODE >> 16 & 0xff)) << 8;
-	bcdDevice |= bin2bcd((LINUX_VERSION_CODE >> 8 & 0xff));
+	bcdDevice = bin2bcd(LINUX_VERSION_MAJOR) << 8;
+	bcdDevice |= bin2bcd(LINUX_VERSION_PATCHLEVEL);
 	return bcdDevice;
 }
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 51f00fe20e4d..c2225bd405d5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1243,7 +1243,7 @@ static int override_release(char __user *release, size_t len)
 				break;
 			rest++;
 		}
-		v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;
+		v = LINUX_VERSION_PATCHLEVEL + 60;
 		copy = clamp_t(size_t, len, 1, sizeof(buf));
 		copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
 		ret = copy_to_user(release, buf, copy + 1);
-- 
cgit v1.2.3


From aab73d9524026caa14aab17fa9b750a6539fd49f Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Wed, 13 Jan 2021 15:26:33 -0800
Subject: tpm: add sysfs exports for all banks of PCR registers

Create sysfs per hash groups with 24 PCR files in them one group,
named pcr-<hash>, for each agile hash of the TPM.  The files are
plugged in to a PCR read function which is TPM version agnostic, so
this works also for TPM 1.2 but the hash is only sha1 in that case.

Note: the macros used to create the hashes emit spurious checkpatch
warnings.  Do not try to "fix" them as checkpatch recommends, otherwise
they'll break.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Tested-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm-sysfs.c | 179 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/tpm.h          |   9 ++-
 2 files changed, 187 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/tpm-sysfs.c b/drivers/char/tpm/tpm-sysfs.c
index e2ff0b273a0f..63f03cfb8e6a 100644
--- a/drivers/char/tpm/tpm-sysfs.c
+++ b/drivers/char/tpm/tpm-sysfs.c
@@ -337,11 +337,190 @@ static const struct attribute_group tpm2_dev_group = {
 	.attrs = tpm2_dev_attrs,
 };
 
+struct tpm_pcr_attr {
+	int alg_id;
+	int pcr;
+	struct device_attribute attr;
+};
+
+#define to_tpm_pcr_attr(a) container_of(a, struct tpm_pcr_attr, attr)
+
+static ssize_t pcr_value_show(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct tpm_pcr_attr *ha = to_tpm_pcr_attr(attr);
+	struct tpm_chip *chip = to_tpm_chip(dev);
+	struct tpm_digest digest;
+	int i;
+	int digest_size = 0;
+	int rc;
+	char *str = buf;
+
+	for (i = 0; i < chip->nr_allocated_banks; i++)
+		if (ha->alg_id == chip->allocated_banks[i].alg_id)
+			digest_size = chip->allocated_banks[i].digest_size;
+	/* should never happen */
+	if (!digest_size)
+		return -EINVAL;
+
+	digest.alg_id = ha->alg_id;
+	rc = tpm_pcr_read(chip, ha->pcr, &digest);
+	if (rc)
+		return rc;
+	for (i = 0; i < digest_size; i++)
+		str += sprintf(str, "%02X", digest.digest[i]);
+	str += sprintf(str, "\n");
+
+	return str - buf;
+}
+
+/*
+ * The following set of defines represents all the magic to build
+ * the per hash attribute groups for displaying each bank of PCRs.
+ * The only slight problem with this approach is that every PCR is
+ * hard coded to be present, so you don't know if an PCR is missing
+ * until a cat of the file returns -EINVAL
+ *
+ * Also note you must ignore checkpatch warnings in this macro
+ * code. This is deep macro magic that checkpatch.pl doesn't
+ * understand.
+ */
+
+/* Note, this must match TPM2_PLATFORM_PCR which is fixed at 24. */
+#define _TPM_HELPER(_alg, _hash, F) \
+	F(_alg, _hash, 0)	    \
+	F(_alg, _hash, 1)	    \
+	F(_alg, _hash, 2)	    \
+	F(_alg, _hash, 3)	    \
+	F(_alg, _hash, 4)	    \
+	F(_alg, _hash, 5)	    \
+	F(_alg, _hash, 6)	    \
+	F(_alg, _hash, 7)	    \
+	F(_alg, _hash, 8)	    \
+	F(_alg, _hash, 9)	    \
+	F(_alg, _hash, 10)	    \
+	F(_alg, _hash, 11)	    \
+	F(_alg, _hash, 12)	    \
+	F(_alg, _hash, 13)	    \
+	F(_alg, _hash, 14)	    \
+	F(_alg, _hash, 15)	    \
+	F(_alg, _hash, 16)	    \
+	F(_alg, _hash, 17)	    \
+	F(_alg, _hash, 18)	    \
+	F(_alg, _hash, 19)	    \
+	F(_alg, _hash, 20)	    \
+	F(_alg, _hash, 21)	    \
+	F(_alg, _hash, 22)	    \
+	F(_alg, _hash, 23)
+
+/* ignore checkpatch warning about trailing ; in macro. */
+#define PCR_ATTR(_alg, _hash, _pcr)				   \
+	static struct tpm_pcr_attr dev_attr_pcr_##_hash##_##_pcr = {	\
+		.alg_id = _alg,					   \
+		.pcr = _pcr,					   \
+		.attr = {					   \
+			.attr = {				   \
+				.name = __stringify(_pcr),	   \
+				.mode = 0444			   \
+			},					   \
+			.show = pcr_value_show			   \
+		}						   \
+	};
+
+#define PCR_ATTRS(_alg, _hash)			\
+	_TPM_HELPER(_alg, _hash, PCR_ATTR)
+
+/* ignore checkpatch warning about trailing , in macro. */
+#define PCR_ATTR_VAL(_alg, _hash, _pcr)		\
+	&dev_attr_pcr_##_hash##_##_pcr.attr.attr,
+
+#define PCR_ATTR_GROUP_ARRAY(_alg, _hash)		       \
+	static struct attribute *pcr_group_attrs_##_hash[] = { \
+		_TPM_HELPER(_alg, _hash, PCR_ATTR_VAL)	       \
+		NULL					       \
+	}
+
+#define PCR_ATTR_GROUP(_alg, _hash)			    \
+	static struct attribute_group pcr_group_##_hash = { \
+		.name = "pcr-" __stringify(_hash),	    \
+		.attrs = pcr_group_attrs_##_hash	    \
+	}
+
+#define PCR_ATTR_BUILD(_alg, _hash)	   \
+	PCR_ATTRS(_alg, _hash)		   \
+	PCR_ATTR_GROUP_ARRAY(_alg, _hash); \
+	PCR_ATTR_GROUP(_alg, _hash)
+/*
+ * End of macro structure to build an attribute group containing 24
+ * PCR value files for each supported hash algorithm
+ */
+
+/*
+ * The next set of macros implements the cleverness for each hash to
+ * build a static attribute group called pcr_group_<hash> which can be
+ * added to chip->groups[].
+ *
+ * The first argument is the TPM algorithm id and the second is the
+ * hash used as both the suffix and the group name.  Note: the group
+ * name is a directory in the top level tpm class with the name
+ * pcr-<hash>, so it must not clash with any other names already
+ * in the sysfs directory.
+ */
+PCR_ATTR_BUILD(TPM_ALG_SHA1, sha1);
+PCR_ATTR_BUILD(TPM_ALG_SHA256, sha256);
+PCR_ATTR_BUILD(TPM_ALG_SHA384, sha384);
+PCR_ATTR_BUILD(TPM_ALG_SHA512, sha512);
+PCR_ATTR_BUILD(TPM_ALG_SM3_256, sm3);
+
+
 void tpm_sysfs_add_device(struct tpm_chip *chip)
 {
+	int i;
+
 	WARN_ON(chip->groups_cnt != 0);
+
 	if (chip->flags & TPM_CHIP_FLAG_TPM2)
 		chip->groups[chip->groups_cnt++] = &tpm2_dev_group;
 	else
 		chip->groups[chip->groups_cnt++] = &tpm1_dev_group;
+
+	/* add one group for each bank hash */
+	for (i = 0; i < chip->nr_allocated_banks; i++) {
+		switch (chip->allocated_banks[i].alg_id) {
+		case TPM_ALG_SHA1:
+			chip->groups[chip->groups_cnt++] = &pcr_group_sha1;
+			break;
+		case TPM_ALG_SHA256:
+			chip->groups[chip->groups_cnt++] = &pcr_group_sha256;
+			break;
+		case TPM_ALG_SHA384:
+			chip->groups[chip->groups_cnt++] = &pcr_group_sha384;
+			break;
+		case TPM_ALG_SHA512:
+			chip->groups[chip->groups_cnt++] = &pcr_group_sha512;
+			break;
+		case TPM_ALG_SM3_256:
+			chip->groups[chip->groups_cnt++] = &pcr_group_sm3;
+			break;
+		default:
+			/*
+			 * If triggers, send a patch to add both a
+			 * PCR_ATTR_BUILD() macro above for the
+			 * missing algorithm as well as an additional
+			 * case in this switch statement.
+			 */
+			dev_err(&chip->dev,
+				"TPM with unsupported bank algorithm 0x%04x",
+				chip->allocated_banks[i].alg_id);
+			break;
+		}
+	}
+
+	/*
+	 * This will only trigger if someone has added an additional
+	 * hash to the tpm_algorithms enum without incrementing
+	 * TPM_MAX_HASHES.
+	 */
+	WARN_ON(chip->groups_cnt > TPM_MAX_HASHES + 1);
 }
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 8f4ff39f51e7..ae2482510f8c 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -31,6 +31,7 @@ struct tpm_chip;
 struct trusted_key_payload;
 struct trusted_key_options;
 
+/* if you add a new hash to this, increment TPM_MAX_HASHES below */
 enum tpm_algorithms {
 	TPM_ALG_ERROR		= 0x0000,
 	TPM_ALG_SHA1		= 0x0004,
@@ -42,6 +43,12 @@ enum tpm_algorithms {
 	TPM_ALG_SM3_256		= 0x0012,
 };
 
+/*
+ * maximum number of hashing algorithms a TPM can have.  This is
+ * basically a count of every hash in tpm_algorithms above
+ */
+#define TPM_MAX_HASHES	5
+
 struct tpm_digest {
 	u16 alg_id;
 	u8 digest[TPM_MAX_DIGEST_SIZE];
@@ -146,7 +153,7 @@ struct tpm_chip {
 
 	struct dentry *bios_dir[TPM_NUM_EVENT_LOG_FILES];
 
-	const struct attribute_group *groups[3];
+	const struct attribute_group *groups[3 + TPM_MAX_HASHES];
 	unsigned int groups_cnt;
 
 	u32 nr_allocated_banks;
-- 
cgit v1.2.3


From 8c657a0590de585b1115847c17b34a58025f2f4b Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Fri, 29 Jan 2021 01:56:21 +0200
Subject: KEYS: trusted: Reserve TPM for seal and unseal operations

When TPM 2.0 trusted keys code was moved to the trusted keys subsystem,
the operations were unwrapped from tpm_try_get_ops() and tpm_put_ops(),
which are used to take temporarily the ownership of the TPM chip. The
ownership is only taken inside tpm_send(), but this is not sufficient,
as in the key load TPM2_CC_LOAD, TPM2_CC_UNSEAL and TPM2_FLUSH_CONTEXT
need to be done as a one single atom.

Take the TPM chip ownership before sending anything with
tpm_try_get_ops() and tpm_put_ops(), and use tpm_transmit_cmd() to send
TPM commands instead of tpm_send(), reverting back to the old behaviour.

Fixes: 2e19e10131a0 ("KEYS: trusted: Move TPM2 trusted keys code")
Reported-by: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: stable@vger.kernel.org
Cc: David Howells <dhowells@redhat.com>
Cc: Mimi Zohar <zohar@linux.ibm.com>
Cc: Sumit Garg <sumit.garg@linaro.org>
Acked-by Sumit Garg <sumit.garg@linaro.org>
Tested-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm.h                    |  4 ----
 include/linux/tpm.h                       |  5 ++++-
 security/keys/trusted-keys/trusted_tpm2.c | 22 ++++++++++++++++++----
 3 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index 947d1db0a5cc..283f78211c3a 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -164,8 +164,6 @@ extern const struct file_operations tpmrm_fops;
 extern struct idr dev_nums_idr;
 
 ssize_t tpm_transmit(struct tpm_chip *chip, u8 *buf, size_t bufsiz);
-ssize_t tpm_transmit_cmd(struct tpm_chip *chip, struct tpm_buf *buf,
-			 size_t min_rsp_body_length, const char *desc);
 int tpm_get_timeouts(struct tpm_chip *);
 int tpm_auto_startup(struct tpm_chip *chip);
 
@@ -194,8 +192,6 @@ static inline void tpm_msleep(unsigned int delay_msec)
 int tpm_chip_start(struct tpm_chip *chip);
 void tpm_chip_stop(struct tpm_chip *chip);
 struct tpm_chip *tpm_find_get_ops(struct tpm_chip *chip);
-__must_check int tpm_try_get_ops(struct tpm_chip *chip);
-void tpm_put_ops(struct tpm_chip *chip);
 
 struct tpm_chip *tpm_chip_alloc(struct device *dev,
 				const struct tpm_class_ops *ops);
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index ae2482510f8c..543aa3b1dedc 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -404,6 +404,10 @@ static inline u32 tpm2_rc_value(u32 rc)
 #if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE)
 
 extern int tpm_is_tpm2(struct tpm_chip *chip);
+extern __must_check int tpm_try_get_ops(struct tpm_chip *chip);
+extern void tpm_put_ops(struct tpm_chip *chip);
+extern ssize_t tpm_transmit_cmd(struct tpm_chip *chip, struct tpm_buf *buf,
+				size_t min_rsp_body_length, const char *desc);
 extern int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx,
 			struct tpm_digest *digest);
 extern int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
@@ -417,7 +421,6 @@ static inline int tpm_is_tpm2(struct tpm_chip *chip)
 {
 	return -ENODEV;
 }
-
 static inline int tpm_pcr_read(struct tpm_chip *chip, int pcr_idx,
 			       struct tpm_digest *digest)
 {
diff --git a/security/keys/trusted-keys/trusted_tpm2.c b/security/keys/trusted-keys/trusted_tpm2.c
index 08ec7f48f01d..e2a0ed5d02f0 100644
--- a/security/keys/trusted-keys/trusted_tpm2.c
+++ b/security/keys/trusted-keys/trusted_tpm2.c
@@ -83,6 +83,12 @@ int tpm2_seal_trusted(struct tpm_chip *chip,
 	if (rc)
 		return rc;
 
+	rc = tpm_buf_init(&buf, TPM2_ST_SESSIONS, TPM2_CC_CREATE);
+	if (rc) {
+		tpm_put_ops(chip);
+		return rc;
+	}
+
 	tpm_buf_append_u32(&buf, options->keyhandle);
 	tpm2_buf_append_auth(&buf, TPM2_RS_PW,
 			     NULL /* nonce */, 0,
@@ -130,7 +136,7 @@ int tpm2_seal_trusted(struct tpm_chip *chip,
 		goto out;
 	}
 
-	rc = tpm_send(chip, buf.data, tpm_buf_length(&buf));
+	rc = tpm_transmit_cmd(chip, &buf, 4, "sealing data");
 	if (rc)
 		goto out;
 
@@ -157,6 +163,7 @@ out:
 			rc = -EPERM;
 	}
 
+	tpm_put_ops(chip);
 	return rc;
 }
 
@@ -211,7 +218,7 @@ static int tpm2_load_cmd(struct tpm_chip *chip,
 		goto out;
 	}
 
-	rc = tpm_send(chip, buf.data, tpm_buf_length(&buf));
+	rc = tpm_transmit_cmd(chip, &buf, 4, "loading blob");
 	if (!rc)
 		*blob_handle = be32_to_cpup(
 			(__be32 *) &buf.data[TPM_HEADER_SIZE]);
@@ -260,7 +267,7 @@ static int tpm2_unseal_cmd(struct tpm_chip *chip,
 			     options->blobauth /* hmac */,
 			     TPM_DIGEST_SIZE);
 
-	rc = tpm_send(chip, buf.data, tpm_buf_length(&buf));
+	rc = tpm_transmit_cmd(chip, &buf, 6, "unsealing");
 	if (rc > 0)
 		rc = -EPERM;
 
@@ -304,12 +311,19 @@ int tpm2_unseal_trusted(struct tpm_chip *chip,
 	u32 blob_handle;
 	int rc;
 
-	rc = tpm2_load_cmd(chip, payload, options, &blob_handle);
+	rc = tpm_try_get_ops(chip);
 	if (rc)
 		return rc;
 
+	rc = tpm2_load_cmd(chip, payload, options, &blob_handle);
+	if (rc)
+		goto out;
+
 	rc = tpm2_unseal_cmd(chip, payload, options, blob_handle);
 	tpm2_flush_context(chip, blob_handle);
 
+out:
+	tpm_put_ops(chip);
+
 	return rc;
 }
-- 
cgit v1.2.3


From bfe3911a91047557eb0e620f95a370aee6a248c7 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 5 Feb 2021 22:00:12 +0000
Subject: kcmp: Support selection of SYS_kcmp without CHECKPOINT_RESTORE

Userspace has discovered the functionality offered by SYS_kcmp and has
started to depend upon it. In particular, Mesa uses SYS_kcmp for
os_same_file_description() in order to identify when two fd (e.g. device
or dmabuf) point to the same struct file. Since they depend on it for
core functionality, lift SYS_kcmp out of the non-default
CONFIG_CHECKPOINT_RESTORE into the selectable syscall category.

Rasmus Villemoes also pointed out that systemd uses SYS_kcmp to
deduplicate the per-service file descriptor store.

Note that some distributions such as Ubuntu are already enabling
CHECKPOINT_RESTORE in their configs and so, by extension, SYS_kcmp.

References: https://gitlab.freedesktop.org/drm/intel/-/issues/3046
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Lucas Stach <l.stach@pengutronix.de>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: stable@vger.kernel.org
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch> # DRM depends on kcmp
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> # systemd uses kcmp
Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210205220012.1983-1-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/Kconfig                       |  3 +++
 fs/eventpoll.c                                |  4 ++--
 include/linux/eventpoll.h                     |  2 +-
 init/Kconfig                                  | 11 +++++++++++
 kernel/Makefile                               |  2 +-
 tools/testing/selftests/seccomp/seccomp_bpf.c |  2 +-
 6 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 0973f408d75f..af6c6d214d91 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -15,6 +15,9 @@ menuconfig DRM
 	select I2C_ALGOBIT
 	select DMA_SHARED_BUFFER
 	select SYNC_FILE
+# gallium uses SYS_kcmp for os_same_file_description() to de-duplicate
+# device and dmabuf fd. Let's make sure that is available for our userspace.
+	select KCMP
 	help
 	  Kernel-level support for the Direct Rendering Infrastructure (DRI)
 	  introduced in XFree86 4.0. If you say Y here, you need to select
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a829af074eb5..3196474cbe24 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -979,7 +979,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 	return epir;
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#ifdef CONFIG_KCMP
 static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
 {
 	struct rb_node *rbp;
@@ -1021,7 +1021,7 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
 
 	return file_raw;
 }
-#endif /* CONFIG_CHECKPOINT_RESTORE */
+#endif /* CONFIG_KCMP */
 
 /**
  * Adds a new entry to the tail of the list in a lockless way, i.e.
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 0350393465d4..593322c946e6 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -18,7 +18,7 @@ struct file;
 
 #ifdef CONFIG_EPOLL
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#ifdef CONFIG_KCMP
 struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
 #endif
 
diff --git a/init/Kconfig b/init/Kconfig
index 29ad68325028..b7d3c6a12196 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1193,6 +1193,7 @@ endif # NAMESPACES
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support"
 	select PROC_CHILDREN
+	select KCMP
 	default n
 	help
 	  Enables additional kernel features in a sake of checkpoint/restore.
@@ -1736,6 +1737,16 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS
 config ARCH_HAS_MEMBARRIER_SYNC_CORE
 	bool
 
+config KCMP
+	bool "Enable kcmp() system call" if EXPERT
+	help
+	  Enable the kernel resource comparison system call. It provides
+	  user-space with the ability to compare two processes to see if they
+	  share a common resource, such as a file descriptor or even virtual
+	  memory space.
+
+	  If unsure, say N.
+
 config RSEQ
 	bool "Enable rseq() system call" if EXPERT
 	default y
diff --git a/kernel/Makefile b/kernel/Makefile
index aa7368c7eabf..320f1f3941b7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,7 +51,7 @@ obj-y += livepatch/
 obj-y += dma/
 obj-y += entry/
 
-obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
+obj-$(CONFIG_KCMP) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 26c72f2b61b1..1b6c7d33c4ff 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -315,7 +315,7 @@ TEST(kcmp)
 	ret = __filecmp(getpid(), getpid(), 1, 1);
 	EXPECT_EQ(ret, 0);
 	if (ret != 0 && errno == ENOSYS)
-		SKIP(return, "Kernel does not support kcmp() (missing CONFIG_CHECKPOINT_RESTORE?)");
+		SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
 }
 
 TEST(mode_strict_support)
-- 
cgit v1.2.3


From afd56e78dd179d5638333bb407d9f7da2863381a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 22 Jan 2021 15:41:14 +0100
Subject: libceph: deprecate [no]cephx_require_signatures options

These options were introduced in 3.19 with support for message signing
and are rather useless, as explained in commit a51983e4dd2d ("libceph:
add nocephx_sign_messages option").  Deprecate them.

In case there is someone out there with a cluster that lacks support
for MSG_AUTH feature (very unlikely but has to be considered since we
haven't formally raised the bar from argonaut to bobtail yet), make
nocephx_sign_messages also waive MSG_AUTH requirement.  This is probably
how it should have been done in the first place -- if we aren't going
to sign, requiring the signing feature makes no sense.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/ceph/libceph.h |  7 +++----
 net/ceph/ceph_common.c       | 11 +++++------
 2 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index eb9008bb3992..409d8c29bc4f 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -32,10 +32,9 @@
 #define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
 #define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
 #define CEPH_OPT_NOCRC            (1<<3) /* no data crc on writes (msgr1) */
-#define CEPH_OPT_NOMSGAUTH	  (1<<4) /* don't require msg signing feat */
-#define CEPH_OPT_TCP_NODELAY	  (1<<5) /* TCP_NODELAY on TCP sockets */
-#define CEPH_OPT_NOMSGSIGN	  (1<<6) /* don't sign msgs (msgr1) */
-#define CEPH_OPT_ABORT_ON_FULL	  (1<<7) /* abort w/ ENOSPC when full */
+#define CEPH_OPT_TCP_NODELAY      (1<<4) /* TCP_NODELAY on TCP sockets */
+#define CEPH_OPT_NOMSGSIGN        (1<<5) /* don't sign msgs (msgr1) */
+#define CEPH_OPT_ABORT_ON_FULL    (1<<6) /* abort w/ ENOSPC when full */
 
 #define CEPH_OPT_DEFAULT   (CEPH_OPT_TCP_NODELAY)
 
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 271287c5ec12..bec181181d41 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -307,7 +307,8 @@ static const struct constant_table ceph_param_ms_mode[] = {
 
 static const struct fs_parameter_spec ceph_parameters[] = {
 	fsparam_flag	("abort_on_full",		Opt_abort_on_full),
-	fsparam_flag_no ("cephx_require_signatures",	Opt_cephx_require_signatures),
+	__fsparam	(NULL, "cephx_require_signatures", Opt_cephx_require_signatures,
+			 fs_param_neg_with_no|fs_param_deprecated, NULL),
 	fsparam_flag_no ("cephx_sign_messages",		Opt_cephx_sign_messages),
 	fsparam_flag_no ("crc",				Opt_crc),
 	fsparam_string	("crush_location",		Opt_crush_location),
@@ -596,9 +597,9 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
 		break;
 	case Opt_cephx_require_signatures:
 		if (!result.negated)
-			opt->flags &= ~CEPH_OPT_NOMSGAUTH;
+			warn_plog(&log, "Ignoring cephx_require_signatures");
 		else
-			opt->flags |= CEPH_OPT_NOMSGAUTH;
+			warn_plog(&log, "Ignoring nocephx_require_signatures, use nocephx_sign_messages");
 		break;
 	case Opt_cephx_sign_messages:
 		if (!result.negated)
@@ -686,8 +687,6 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
 		seq_puts(m, "noshare,");
 	if (opt->flags & CEPH_OPT_NOCRC)
 		seq_puts(m, "nocrc,");
-	if (opt->flags & CEPH_OPT_NOMSGAUTH)
-		seq_puts(m, "nocephx_require_signatures,");
 	if (opt->flags & CEPH_OPT_NOMSGSIGN)
 		seq_puts(m, "nocephx_sign_messages,");
 	if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
@@ -756,7 +755,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
 	client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
 	client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT;
 
-	if (!ceph_test_opt(client, NOMSGAUTH))
+	if (!ceph_test_opt(client, NOMSGSIGN))
 		client->required_features |= CEPH_FEATURE_MSG_AUTH;
 
 	/* msgr */
-- 
cgit v1.2.3


From a6a217dddcd544f6b75f0e2a60b6e84c1d494b7e Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Tue, 9 Feb 2021 15:11:06 +0200
Subject: net/mlx5: Add new timestamp mode bits

These fields declare which timestamp mode is supported by the device
per RQ/SQ/QP.

In addition add the ts_format field to the select the mode for
RQ/SQ/QP.

Link: https://lore.kernel.org/r/20210209131107.698833-2-leon@kernel.org
Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 54 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cf692fc17f41..436d6f421dfd 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -932,11 +932,18 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         reserved_at_200[0x600];
 };
 
+enum {
+	MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING               = 0x0,
+	MLX5_QP_TIMESTAMP_FORMAT_CAP_REAL_TIME                  = 0x1,
+	MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2,
+};
+
 struct mlx5_ifc_roce_cap_bits {
 	u8         roce_apm[0x1];
 	u8         reserved_at_1[0x3];
 	u8         sw_r_roce_src_udp_port[0x1];
-	u8         reserved_at_5[0x1b];
+	u8         reserved_at_5[0x19];
+	u8	   qp_ts_format[0x2];
 
 	u8         reserved_at_20[0x60];
 
@@ -1253,6 +1260,18 @@ enum {
 	MLX5_STEERING_FORMAT_CONNECTX_6DX = 1,
 };
 
+enum {
+	MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING               = 0x0,
+	MLX5_SQ_TIMESTAMP_FORMAT_CAP_REAL_TIME                  = 0x1,
+	MLX5_SQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2,
+};
+
+enum {
+	MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING               = 0x0,
+	MLX5_RQ_TIMESTAMP_FORMAT_CAP_REAL_TIME                  = 0x1,
+	MLX5_RQ_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME = 0x2,
+};
+
 struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_0[0x1f];
 	u8         vhca_resource_manager[0x1];
@@ -1564,7 +1583,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         general_obj_types[0x40];
 
-	u8         reserved_at_440[0x4];
+	u8         sq_ts_format[0x2];
+	u8         rq_ts_format[0x2];
 	u8         steering_format_version[0x4];
 	u8         create_qp_start_hint[0x18];
 
@@ -2868,6 +2888,12 @@ enum {
 	MLX5_QPC_CS_RES_UP_TO_64B  = 0x2,
 };
 
+enum {
+	MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0,
+	MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT      = 0x1,
+	MLX5_QPC_TIMESTAMP_FORMAT_REAL_TIME    = 0x2,
+};
+
 struct mlx5_ifc_qpc_bits {
 	u8         state[0x4];
 	u8         lag_tx_port_affinity[0x4];
@@ -2896,7 +2922,9 @@ struct mlx5_ifc_qpc_bits {
 	u8         log_rq_stride[0x3];
 	u8         no_sq[0x1];
 	u8         log_sq_size[0x4];
-	u8         reserved_at_55[0x6];
+	u8         reserved_at_55[0x3];
+	u8	   ts_format[0x2];
+	u8         reserved_at_5a[0x1];
 	u8         rlky[0x1];
 	u8         ulp_stateless_offload_mode[0x4];
 
@@ -3312,6 +3340,12 @@ enum {
 	MLX5_SQC_STATE_ERR  = 0x3,
 };
 
+enum {
+	MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0,
+	MLX5_SQC_TIMESTAMP_FORMAT_DEFAULT      = 0x1,
+	MLX5_SQC_TIMESTAMP_FORMAT_REAL_TIME    = 0x2,
+};
+
 struct mlx5_ifc_sqc_bits {
 	u8         rlky[0x1];
 	u8         cd_master[0x1];
@@ -3323,7 +3357,9 @@ struct mlx5_ifc_sqc_bits {
 	u8         reg_umr[0x1];
 	u8         allow_swp[0x1];
 	u8         hairpin[0x1];
-	u8         reserved_at_f[0x11];
+	u8         reserved_at_f[0xb];
+	u8	   ts_format[0x2];
+	u8	   reserved_at_1c[0x4];
 
 	u8         reserved_at_20[0x8];
 	u8         user_index[0x18];
@@ -3414,6 +3450,12 @@ enum {
 	MLX5_RQC_STATE_ERR  = 0x3,
 };
 
+enum {
+	MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0,
+	MLX5_RQC_TIMESTAMP_FORMAT_DEFAULT      = 0x1,
+	MLX5_RQC_TIMESTAMP_FORMAT_REAL_TIME    = 0x2,
+};
+
 struct mlx5_ifc_rqc_bits {
 	u8         rlky[0x1];
 	u8	   delay_drop_en[0x1];
@@ -3424,7 +3466,9 @@ struct mlx5_ifc_rqc_bits {
 	u8         reserved_at_c[0x1];
 	u8         flush_in_error_en[0x1];
 	u8         hairpin[0x1];
-	u8         reserved_at_f[0x11];
+	u8         reserved_at_f[0xb];
+	u8	   ts_format[0x2];
+	u8	   reserved_at_1c[0x4];
 
 	u8         reserved_at_20[0x8];
 	u8         user_index[0x18];
-- 
cgit v1.2.3


From e0a912e8ddbaa0536352dd8318845cdfdbab7bab Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 16 Feb 2021 12:17:22 -0500
Subject: SUNRPC: Use TCP_CORK to optimise send performance on the server

Use a counter to keep track of how many requests are queued behind the
xprt->xpt_mutex, and keep TCP_CORK set until the queue is empty.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Link: https://lore.kernel.org/linux-nfs/20210213202532.23146-1-trondmy@kernel.org/T/#u
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svcsock.h | 2 ++
 net/sunrpc/svcsock.c           | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index b7ac7fe68306..bcc555c7ae9c 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -35,6 +35,8 @@ struct svc_sock {
 	/* Total length of the data (not including fragment headers)
 	 * received so far in the fragments making up this rpc: */
 	u32			sk_datalen;
+	/* Number of queued send requests */
+	atomic_t		sk_sendqlen;
 
 	struct page *		sk_pages[RPCSVC_MAXPAGES];	/* received data */
 };
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5a809c64dc7b..231f510a4830 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1171,18 +1171,23 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
 
 	svc_tcp_release_rqst(rqstp);
 
+	atomic_inc(&svsk->sk_sendqlen);
 	mutex_lock(&xprt->xpt_mutex);
 	if (svc_xprt_is_dead(xprt))
 		goto out_notconn;
+	tcp_sock_set_cork(svsk->sk_sk, true);
 	err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent);
 	xdr_free_bvec(xdr);
 	trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
 	if (err < 0 || sent != (xdr->len + sizeof(marker)))
 		goto out_close;
+	if (atomic_dec_and_test(&svsk->sk_sendqlen))
+		tcp_sock_set_cork(svsk->sk_sk, false);
 	mutex_unlock(&xprt->xpt_mutex);
 	return sent;
 
 out_notconn:
+	atomic_dec(&svsk->sk_sendqlen);
 	mutex_unlock(&xprt->xpt_mutex);
 	return -ENOTCONN;
 out_close:
@@ -1192,6 +1197,7 @@ out_close:
 		  (err < 0) ? err : sent, xdr->len);
 	set_bit(XPT_CLOSE, &xprt->xpt_flags);
 	svc_xprt_enqueue(xprt);
+	atomic_dec(&svsk->sk_sendqlen);
 	mutex_unlock(&xprt->xpt_mutex);
 	return -EAGAIN;
 }
@@ -1261,7 +1267,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 		svsk->sk_datalen = 0;
 		memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
 
-		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
+		tcp_sock_set_nodelay(sk);
 
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		switch (sk->sk_state) {
-- 
cgit v1.2.3


From 7232c132d13aafd178ba18c1099b2cb98d104b8c Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@nvidia.com>
Date: Thu, 11 Feb 2021 10:55:49 +0200
Subject: RDMA/mlx5: Allow CQ creation without attached EQs

The traditional DevX CQ creation flow goes through mlx5_core_create_cq()
which checks that the given EQN corresponds to an existing EQ and attaches
a devx handler to the EQN for the CQ.

In some cases the EQ will not be a kernel EQ, but will be controlled by
modify CQ, don't block creating these just because the EQN can't be found
in the kernel.

Link: https://lore.kernel.org/r/20210211085549.1277674-1-leon@kernel.org
Signed-off-by: Tal Gilboa <talgi@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/mlx5/devx.c | 13 ++++++++++++-
 include/linux/mlx5/mlx5_ifc.h     |  5 +++--
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index e39661db0d2f..ebc2a4355fa5 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1439,6 +1439,16 @@ out:
 	rcu_read_unlock();
 }
 
+static bool is_apu_thread_cq(struct mlx5_ib_dev *dev, const void *in)
+{
+	if (!MLX5_CAP_GEN(dev->mdev, apu) ||
+	    !MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context),
+		      apu_thread_cq))
+		return false;
+
+	return true;
+}
+
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 	struct uverbs_attr_bundle *attrs)
 {
@@ -1492,7 +1502,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 		obj->flags |= DEVX_OBJ_FLAGS_DCT;
 		err = mlx5_core_create_dct(dev, &obj->core_dct, cmd_in,
 					   cmd_in_len, cmd_out, cmd_out_len);
-	} else if (opcode == MLX5_CMD_OP_CREATE_CQ) {
+	} else if (opcode == MLX5_CMD_OP_CREATE_CQ &&
+		   !is_apu_thread_cq(dev, cmd_in)) {
 		obj->flags |= DEVX_OBJ_FLAGS_CQ;
 		obj->core_cq.comp = devx_cq_comp;
 		err = mlx5_core_create_cq(dev->mdev, &obj->core_cq,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cf692fc17f41..768e097c633b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1634,7 +1634,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         sf_set_partition[0x1];
 	u8         reserved_at_682[0x1];
 	u8         log_max_sf[0x5];
-	u8         reserved_at_688[0x8];
+	u8         apu[0x1];
+	u8         reserved_at_689[0x7];
 	u8         log_min_sf_size[0x8];
 	u8         max_num_sf_partitions[0x8];
 
@@ -3816,7 +3817,7 @@ struct mlx5_ifc_cqc_bits {
 	u8         status[0x4];
 	u8         reserved_at_4[0x2];
 	u8         dbr_umem_valid[0x1];
-	u8         reserved_at_7[0x1];
+	u8         apu_thread_cq[0x1];
 	u8         cqe_sz[0x3];
 	u8         cc[0x1];
 	u8         reserved_at_c[0x1];
-- 
cgit v1.2.3


From 75cfb200cd081d23eb7eaa68deba9e0ab9320070 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 12 Feb 2021 16:49:47 -0500
Subject: NFS: 'flags' field should be unsigned in struct nfs_server

The mount flags are all unsigned integers, so we should not be storing
them in a signed field.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/nfs_fs_sb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 38e60ec742df..962e8313f007 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -142,7 +142,7 @@ struct nfs_server {
 	struct nlm_host		*nlm_host;	/* NLM client handle */
 	struct nfs_iostats __percpu *io_stats;	/* I/O statistics */
 	atomic_long_t		writeback;	/* number of writeback pages */
-	int			flags;		/* various flags */
+	unsigned int		flags;		/* various flags */
 
 /* The following are for internal use only. Also see uapi/linux/nfs_mount.h */
 #define NFS_MOUNT_LOOKUP_CACHE_NONEG	0x10000
-- 
cgit v1.2.3


From ed7bcdb374d20fab9e9dc36853a6735c047ad1b1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 12 Feb 2021 16:49:48 -0500
Subject: NFS: Add support for eager writes

Support eager writing to the server, meaning that we write the data to
cache on the server, and wait for that to complete. This ensures that we
see ENOSPC errors immediately.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/file.c             | 19 +++++++++++++++++--
 fs/nfs/write.c            | 17 ++++++++++++-----
 include/linux/nfs_fs_sb.h |  2 ++
 3 files changed, 31 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 03fd1dcc96bd..16ad5050e046 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -606,8 +606,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
-	unsigned long written = 0;
-	ssize_t result;
+	unsigned int mntflags = NFS_SERVER(inode)->flags;
+	ssize_t result, written;
 	errseq_t since;
 	int error;
 
@@ -648,6 +648,21 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 
 	written = result;
 	iocb->ki_pos += written;
+
+	if (mntflags & NFS_MOUNT_WRITE_EAGER) {
+		result = filemap_fdatawrite_range(file->f_mapping,
+						  iocb->ki_pos - written,
+						  iocb->ki_pos - 1);
+		if (result < 0)
+			goto out;
+	}
+	if (mntflags & NFS_MOUNT_WRITE_WAIT) {
+		result = filemap_fdatawait_range(file->f_mapping,
+						 iocb->ki_pos - written,
+						 iocb->ki_pos - 1);
+		if (result < 0)
+			goto out;
+	}
 	result = generic_write_sync(iocb, written);
 	if (result < 0)
 		goto out;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6193350356a8..82bdcb982186 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -712,16 +712,23 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
 	struct nfs_pageio_descriptor pgio;
-	struct nfs_io_completion *ioc;
+	struct nfs_io_completion *ioc = NULL;
+	unsigned int mntflags = NFS_SERVER(inode)->flags;
+	int priority = 0;
 	int err;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
-	ioc = nfs_io_completion_alloc(GFP_KERNEL);
-	if (ioc)
-		nfs_io_completion_init(ioc, nfs_io_completion_commit, inode);
+	if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
+	    wbc->for_background || wbc->for_sync || wbc->for_reclaim) {
+		ioc = nfs_io_completion_alloc(GFP_KERNEL);
+		if (ioc)
+			nfs_io_completion_init(ioc, nfs_io_completion_commit,
+					       inode);
+		priority = wb_priority(wbc);
+	}
 
-	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
+	nfs_pageio_init_write(&pgio, inode, priority, false,
 				&nfs_async_write_completion_ops);
 	pgio.pg_io_completion = ioc;
 	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 962e8313f007..6f76b32a0238 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -153,6 +153,8 @@ struct nfs_server {
 #define NFS_MOUNT_LOCAL_FCNTL		0x200000
 #define NFS_MOUNT_SOFTERR		0x400000
 #define NFS_MOUNT_SOFTREVAL		0x800000
+#define NFS_MOUNT_WRITE_EAGER		0x01000000
+#define NFS_MOUNT_WRITE_WAIT		0x02000000
 
 	unsigned int		caps;		/* server capabilities */
 	unsigned int		rsize;		/* read size */
-- 
cgit v1.2.3


From ae02d41551d6f2a035d3e63ce4415e1b2ba3a7e6 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Fri, 12 Feb 2021 14:30:38 -0800
Subject: net/mlx5: Add register layout to support real-time time-stamp

Add needed structure layouts and defines for MTUTC (Management UTC)
register. MTUTC will be used for cyc2time HW translation.

In addition, add cyc2time modify capability bit and init segment HCA
real time address.

Finally, add capability bits indicating which time-stamping format is
supported per SQ and RQ. Add ts_format in the queue's context layout to
allow configuration.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h   |  5 ++++-
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 30 ++++++++++++++++++++++++++++--
 3 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index f1de49d64a98..fd694add6ff0 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -578,7 +578,10 @@ struct mlx5_init_seg {
 	__be32			internal_timer_l;
 	__be32			rsvd3[2];
 	__be32			health_counter;
-	__be32			rsvd4[1019];
+	__be32			rsvd4[11];
+	__be32			real_time_h;
+	__be32			real_time_l;
+	__be32			rsvd5[1006];
 	__be64			ieee1588_clk;
 	__be32			ieee1588_clk_type;
 	__be32			clr_intx;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f93bfe7473aa..2eb7755143d7 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -143,6 +143,7 @@ enum {
 	MLX5_REG_MPCNT		 = 0x9051,
 	MLX5_REG_MTPPS		 = 0x9053,
 	MLX5_REG_MTPPSE		 = 0x9054,
+	MLX5_REG_MTUTC		 = 0x9055,
 	MLX5_REG_MPEGC		 = 0x9056,
 	MLX5_REG_MCQS		 = 0x9060,
 	MLX5_REG_MCQI		 = 0x9061,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 436d6f421dfd..7059fcdf54a3 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9150,6 +9150,28 @@ struct mlx5_ifc_mpegc_reg_bits {
 	u8         reserved_at_60[0x100];
 };
 
+enum {
+	MLX5_MTUTC_OPERATION_SET_TIME_IMMEDIATE   = 0x1,
+	MLX5_MTUTC_OPERATION_ADJUST_TIME          = 0x2,
+	MLX5_MTUTC_OPERATION_ADJUST_FREQ_UTC      = 0x3,
+};
+
+struct mlx5_ifc_mtutc_reg_bits {
+	u8         reserved_at_0[0x1c];
+	u8         operation[0x4];
+
+	u8         freq_adjustment[0x20];
+
+	u8         reserved_at_40[0x40];
+
+	u8         utc_sec[0x20];
+
+	u8         reserved_at_a0[0x2];
+	u8         utc_nsec[0x1e];
+
+	u8         time_adjustment[0x20];
+};
+
 struct mlx5_ifc_pcam_enhanced_features_bits {
 	u8         reserved_at_0[0x68];
 	u8         fec_50G_per_lane_in_pplm[0x1];
@@ -9208,7 +9230,9 @@ struct mlx5_ifc_pcam_reg_bits {
 };
 
 struct mlx5_ifc_mcam_enhanced_features_bits {
-	u8         reserved_at_0[0x6e];
+	u8         reserved_at_0[0x6b];
+	u8         ptpcyc2realtime_modify[0x1];
+	u8         reserved_at_6c[0x2];
 	u8         pci_status_and_power[0x1];
 	u8         reserved_at_6f[0x5];
 	u8         mark_tx_action_cnp[0x1];
@@ -9231,7 +9255,8 @@ struct mlx5_ifc_mcam_access_reg_bits {
 
 	u8         regs_95_to_87[0x9];
 	u8         mpegc[0x1];
-	u8         regs_85_to_68[0x12];
+	u8         mtutc[0x1];
+	u8         regs_84_to_68[0x11];
 	u8         tracer_registers[0x4];
 
 	u8         regs_63_to_32[0x20];
@@ -9964,6 +9989,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_mcda_reg_bits mcda_reg;
 	struct mlx5_ifc_mirc_reg_bits mirc_reg;
 	struct mlx5_ifc_mfrl_reg_bits mfrl_reg;
+	struct mlx5_ifc_mtutc_reg_bits mtutc_reg;
 	u8         reserved_at_0[0x60e0];
 };
 
-- 
cgit v1.2.3


From d6f3dc8f509ce6288e2537eb4b0614ef444fd84a Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Fri, 12 Feb 2021 14:30:40 -0800
Subject: net/mlx5: Move all internal timer metadata into a dedicated struct

Internal timer mode (SW clock) requires some PTP clock related metadata
structs. Real time mode (HW clock) will not need these metadata structs.
This separation emphasize the different interfaces for HW clock and SW
clock.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Aya Levin <ayal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/lib/clock.c    | 107 ++++++++++++---------
 .../net/ethernet/mellanox/mlx5/core/lib/clock.h    |   3 +-
 include/linux/mlx5/driver.h                        |  12 ++-
 3 files changed, 71 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
index aaf7d837a967..dcacaa451b53 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
@@ -89,7 +89,8 @@ static u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev,
 
 static u64 read_internal_timer(const struct cyclecounter *cc)
 {
-	struct mlx5_clock *clock = container_of(cc, struct mlx5_clock, cycles);
+	struct mlx5_timer *timer = container_of(cc, struct mlx5_timer, cycles);
+	struct mlx5_clock *clock = container_of(timer, struct mlx5_clock, timer);
 	struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev,
 						  clock);
 
@@ -100,6 +101,7 @@ static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev)
 {
 	struct mlx5_ib_clock_info *clock_info = mdev->clock_info;
 	struct mlx5_clock *clock = &mdev->clock;
+	struct mlx5_timer *timer;
 	u32 sign;
 
 	if (!clock_info)
@@ -109,10 +111,11 @@ static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev)
 	smp_store_mb(clock_info->sign,
 		     sign | MLX5_IB_CLOCK_INFO_KERNEL_UPDATING);
 
-	clock_info->cycles = clock->tc.cycle_last;
-	clock_info->mult   = clock->cycles.mult;
-	clock_info->nsec   = clock->tc.nsec;
-	clock_info->frac   = clock->tc.frac;
+	timer = &clock->timer;
+	clock_info->cycles = timer->tc.cycle_last;
+	clock_info->mult   = timer->cycles.mult;
+	clock_info->nsec   = timer->tc.nsec;
+	clock_info->frac   = timer->tc.frac;
 
 	smp_store_release(&clock_info->sign,
 			  sign + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING * 2);
@@ -151,28 +154,32 @@ static void mlx5_timestamp_overflow(struct work_struct *work)
 {
 	struct delayed_work *dwork = to_delayed_work(work);
 	struct mlx5_core_dev *mdev;
+	struct mlx5_timer *timer;
 	struct mlx5_clock *clock;
 	unsigned long flags;
 
-	clock = container_of(dwork, struct mlx5_clock, overflow_work);
+	timer = container_of(dwork, struct mlx5_timer, overflow_work);
+	clock = container_of(timer, struct mlx5_clock, timer);
 	mdev = container_of(clock, struct mlx5_core_dev, clock);
+
 	write_seqlock_irqsave(&clock->lock, flags);
-	timecounter_read(&clock->tc);
+	timecounter_read(&timer->tc);
 	mlx5_update_clock_info_page(mdev);
 	write_sequnlock_irqrestore(&clock->lock, flags);
-	schedule_delayed_work(&clock->overflow_work, clock->overflow_period);
+	schedule_delayed_work(&timer->overflow_work, timer->overflow_period);
 }
 
 static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts)
 {
 	struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info);
+	struct mlx5_timer *timer = &clock->timer;
 	u64 ns = timespec64_to_ns(ts);
 	struct mlx5_core_dev *mdev;
 	unsigned long flags;
 
 	mdev = container_of(clock, struct mlx5_core_dev, clock);
 	write_seqlock_irqsave(&clock->lock, flags);
-	timecounter_init(&clock->tc, &clock->cycles, ns);
+	timecounter_init(&timer->tc, &timer->cycles, ns);
 	mlx5_update_clock_info_page(mdev);
 	write_sequnlock_irqrestore(&clock->lock, flags);
 
@@ -183,6 +190,7 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
 			     struct ptp_system_timestamp *sts)
 {
 	struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info);
+	struct mlx5_timer *timer = &clock->timer;
 	struct mlx5_core_dev *mdev;
 	unsigned long flags;
 	u64 cycles, ns;
@@ -190,7 +198,7 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
 	mdev = container_of(clock, struct mlx5_core_dev, clock);
 	write_seqlock_irqsave(&clock->lock, flags);
 	cycles = mlx5_read_internal_timer(mdev, sts);
-	ns = timecounter_cyc2time(&clock->tc, cycles);
+	ns = timecounter_cyc2time(&timer->tc, cycles);
 	write_sequnlock_irqrestore(&clock->lock, flags);
 
 	*ts = ns_to_timespec64(ns);
@@ -201,12 +209,13 @@ static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
 static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 {
 	struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info);
+	struct mlx5_timer *timer = &clock->timer;
 	struct mlx5_core_dev *mdev;
 	unsigned long flags;
 
 	mdev = container_of(clock, struct mlx5_core_dev, clock);
 	write_seqlock_irqsave(&clock->lock, flags);
-	timecounter_adjtime(&clock->tc, delta);
+	timecounter_adjtime(&timer->tc, delta);
 	mlx5_update_clock_info_page(mdev);
 	write_sequnlock_irqrestore(&clock->lock, flags);
 
@@ -216,27 +225,27 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta)
 {
 	struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info);
+	struct mlx5_timer *timer = &clock->timer;
 	struct mlx5_core_dev *mdev;
 	unsigned long flags;
 	int neg_adj = 0;
 	u32 diff;
 	u64 adj;
 
-
 	if (delta < 0) {
 		neg_adj = 1;
 		delta = -delta;
 	}
 
-	adj = clock->nominal_c_mult;
+	adj = timer->nominal_c_mult;
 	adj *= delta;
 	diff = div_u64(adj, 1000000000ULL);
 
 	mdev = container_of(clock, struct mlx5_core_dev, clock);
 	write_seqlock_irqsave(&clock->lock, flags);
-	timecounter_read(&clock->tc);
-	clock->cycles.mult = neg_adj ? clock->nominal_c_mult - diff :
-				       clock->nominal_c_mult + diff;
+	timecounter_read(&timer->tc);
+	timer->cycles.mult = neg_adj ? timer->nominal_c_mult - diff :
+				       timer->nominal_c_mult + diff;
 	mlx5_update_clock_info_page(mdev);
 	write_sequnlock_irqrestore(&clock->lock, flags);
 
@@ -313,6 +322,7 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp,
 			container_of(ptp, struct mlx5_clock, ptp_info);
 	struct mlx5_core_dev *mdev =
 			container_of(clock, struct mlx5_core_dev, clock);
+	struct mlx5_timer *timer = &clock->timer;
 	u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0};
 	u64 nsec_now, nsec_delta, time_stamp = 0;
 	u64 cycles_now, cycles_delta;
@@ -355,10 +365,10 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp,
 		ns = timespec64_to_ns(&ts);
 		cycles_now = mlx5_read_internal_timer(mdev, NULL);
 		write_seqlock_irqsave(&clock->lock, flags);
-		nsec_now = timecounter_cyc2time(&clock->tc, cycles_now);
+		nsec_now = timecounter_cyc2time(&timer->tc, cycles_now);
 		nsec_delta = ns - nsec_now;
-		cycles_delta = div64_u64(nsec_delta << clock->cycles.shift,
-					 clock->cycles.mult);
+		cycles_delta = div64_u64(nsec_delta << timer->cycles.shift,
+					 timer->cycles.mult);
 		write_sequnlock_irqrestore(&clock->lock, flags);
 		time_stamp = cycles_now + cycles_delta;
 		field_select = MLX5_MTPPS_FS_PIN_MODE |
@@ -541,6 +551,7 @@ static int mlx5_pps_event(struct notifier_block *nb,
 			  unsigned long type, void *data)
 {
 	struct mlx5_clock *clock = mlx5_nb_cof(nb, struct mlx5_clock, pps_nb);
+	struct mlx5_timer *timer = &clock->timer;
 	struct ptp_clock_event ptp_event;
 	u64 cycles_now, cycles_delta;
 	u64 nsec_now, nsec_delta, ns;
@@ -575,10 +586,10 @@ static int mlx5_pps_event(struct notifier_block *nb,
 		ts.tv_nsec = 0;
 		ns = timespec64_to_ns(&ts);
 		write_seqlock_irqsave(&clock->lock, flags);
-		nsec_now = timecounter_cyc2time(&clock->tc, cycles_now);
+		nsec_now = timecounter_cyc2time(&timer->tc, cycles_now);
 		nsec_delta = ns - nsec_now;
-		cycles_delta = div64_u64(nsec_delta << clock->cycles.shift,
-					 clock->cycles.mult);
+		cycles_delta = div64_u64(nsec_delta << timer->cycles.shift,
+					 timer->cycles.mult);
 		clock->pps_info.start[pin] = cycles_now + cycles_delta;
 		write_sequnlock_irqrestore(&clock->lock, flags);
 		schedule_work(&clock->pps_info.out_work);
@@ -594,17 +605,18 @@ static int mlx5_pps_event(struct notifier_block *nb,
 static void mlx5_timecounter_init(struct mlx5_core_dev *mdev)
 {
 	struct mlx5_clock *clock = &mdev->clock;
+	struct mlx5_timer *timer = &clock->timer;
 	u32 dev_freq;
 
 	dev_freq = MLX5_CAP_GEN(mdev, device_frequency_khz);
-	clock->cycles.read = read_internal_timer;
-	clock->cycles.shift = MLX5_CYCLES_SHIFT;
-	clock->cycles.mult = clocksource_khz2mult(dev_freq,
-						  clock->cycles.shift);
-	clock->nominal_c_mult = clock->cycles.mult;
-	clock->cycles.mask = CLOCKSOURCE_MASK(41);
-
-	timecounter_init(&clock->tc, &clock->cycles,
+	timer->cycles.read = read_internal_timer;
+	timer->cycles.shift = MLX5_CYCLES_SHIFT;
+	timer->cycles.mult = clocksource_khz2mult(dev_freq,
+						  timer->cycles.shift);
+	timer->nominal_c_mult = timer->cycles.mult;
+	timer->cycles.mask = CLOCKSOURCE_MASK(41);
+
+	timecounter_init(&timer->tc, &timer->cycles,
 			 ktime_to_ns(ktime_get_real()));
 }
 
@@ -612,6 +624,7 @@ static void mlx5_init_overflow_period(struct mlx5_clock *clock)
 {
 	struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock);
 	struct mlx5_ib_clock_info *clock_info = mdev->clock_info;
+	struct mlx5_timer *timer = &clock->timer;
 	u64 overflow_cycles;
 	u64 frac = 0;
 	u64 ns;
@@ -623,29 +636,30 @@ static void mlx5_init_overflow_period(struct mlx5_clock *clock)
 	 * multiplied by clock multiplier where the result doesn't exceed
 	 * 64bits.
 	 */
-	overflow_cycles = div64_u64(~0ULL >> 1, clock->cycles.mult);
-	overflow_cycles = min(overflow_cycles, div_u64(clock->cycles.mask, 3));
+	overflow_cycles = div64_u64(~0ULL >> 1, timer->cycles.mult);
+	overflow_cycles = min(overflow_cycles, div_u64(timer->cycles.mask, 3));
 
-	ns = cyclecounter_cyc2ns(&clock->cycles, overflow_cycles,
+	ns = cyclecounter_cyc2ns(&timer->cycles, overflow_cycles,
 				 frac, &frac);
 	do_div(ns, NSEC_PER_SEC / HZ);
-	clock->overflow_period = ns;
+	timer->overflow_period = ns;
 
-	INIT_DELAYED_WORK(&clock->overflow_work, mlx5_timestamp_overflow);
-	if (clock->overflow_period)
-		schedule_delayed_work(&clock->overflow_work, 0);
+	INIT_DELAYED_WORK(&timer->overflow_work, mlx5_timestamp_overflow);
+	if (timer->overflow_period)
+		schedule_delayed_work(&timer->overflow_work, 0);
 	else
 		mlx5_core_warn(mdev,
 			       "invalid overflow period, overflow_work is not scheduled\n");
 
 	if (clock_info)
-		clock_info->overflow_period = clock->overflow_period;
+		clock_info->overflow_period = timer->overflow_period;
 }
 
 static void mlx5_init_clock_info(struct mlx5_core_dev *mdev)
 {
 	struct mlx5_clock *clock = &mdev->clock;
 	struct mlx5_ib_clock_info *info;
+	struct mlx5_timer *timer;
 
 	mdev->clock_info = (struct mlx5_ib_clock_info *)get_zeroed_page(GFP_KERNEL);
 	if (!mdev->clock_info) {
@@ -654,13 +668,14 @@ static void mlx5_init_clock_info(struct mlx5_core_dev *mdev)
 	}
 
 	info = mdev->clock_info;
-
-	info->nsec = clock->tc.nsec;
-	info->cycles = clock->tc.cycle_last;
-	info->mask = clock->cycles.mask;
-	info->mult = clock->nominal_c_mult;
-	info->shift = clock->cycles.shift;
-	info->frac = clock->tc.frac;
+	timer = &clock->timer;
+
+	info->nsec = timer->tc.nsec;
+	info->cycles = timer->tc.cycle_last;
+	info->mask = timer->cycles.mask;
+	info->mult = timer->nominal_c_mult;
+	info->shift = timer->cycles.shift;
+	info->frac = timer->tc.frac;
 }
 
 void mlx5_init_clock(struct mlx5_core_dev *mdev)
@@ -714,7 +729,7 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev)
 	}
 
 	cancel_work_sync(&clock->pps_info.out_work);
-	cancel_delayed_work_sync(&clock->overflow_work);
+	cancel_delayed_work_sync(&clock->timer.overflow_work);
 
 	if (mdev->clock_info) {
 		free_page((unsigned long)mdev->clock_info);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
index 31600924bdc3..6e8804ebc773 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
@@ -45,12 +45,13 @@ static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev)
 static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock,
 						u64 timestamp)
 {
+	struct mlx5_timer *timer = &clock->timer;
 	unsigned int seq;
 	u64 nsec;
 
 	do {
 		seq = read_seqbegin(&clock->lock);
-		nsec = timecounter_cyc2time(&clock->tc, timestamp);
+		nsec = timecounter_cyc2time(&timer->tc, timestamp);
 	} while (read_seqretry(&clock->lock, seq));
 
 	return ns_to_ktime(nsec);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2eb7755143d7..2f4d6182df1b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -662,18 +662,22 @@ struct mlx5_pps {
 	u8                         enabled;
 };
 
-struct mlx5_clock {
-	struct mlx5_nb             pps_nb;
-	seqlock_t                  lock;
+struct mlx5_timer {
 	struct cyclecounter        cycles;
 	struct timecounter         tc;
-	struct hwtstamp_config     hwtstamp_config;
 	u32                        nominal_c_mult;
 	unsigned long              overflow_period;
 	struct delayed_work        overflow_work;
+};
+
+struct mlx5_clock {
+	struct mlx5_nb             pps_nb;
+	seqlock_t                  lock;
+	struct hwtstamp_config     hwtstamp_config;
 	struct ptp_clock          *ptp;
 	struct ptp_clock_info      ptp_info;
 	struct mlx5_pps            pps_info;
+	struct mlx5_timer          timer;
 };
 
 struct mlx5_dm;
-- 
cgit v1.2.3


From 32aeba1f7a98b0c69d4a5704a7d9cea42ba856ba Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 16 Feb 2021 11:08:37 -0800
Subject: tg3: Remove unused PHY_BRCM flags

The tg3 driver tried to communicate towards the PHY driver whether it
wanted RGMII in-band signaling enabled or disabled however there is
nothing that looks at those flags in drivers/net/phy/broadcom.c so this
does do not anything.

Suggested-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/tg3.c | 6 ------
 include/linux/brcmphy.h             | 9 +++------
 2 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 8936c2bc6286..d2381929931b 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -1580,12 +1580,6 @@ static int tg3_mdio_init(struct tg3 *tp)
 				     PHY_BRCM_RX_REFCLK_UNUSED |
 				     PHY_BRCM_DIS_TXCRXC_NOENRGY |
 				     PHY_BRCM_AUTO_PWRDWN_ENABLE;
-		if (tg3_flag(tp, RGMII_INBAND_DISABLE))
-			phydev->dev_flags |= PHY_BRCM_STD_IBND_DISABLE;
-		if (tg3_flag(tp, RGMII_EXT_IBND_RX_EN))
-			phydev->dev_flags |= PHY_BRCM_EXT_IBND_RX_ENABLE;
-		if (tg3_flag(tp, RGMII_EXT_IBND_TX_EN))
-			phydev->dev_flags |= PHY_BRCM_EXT_IBND_TX_ENABLE;
 		fallthrough;
 	case PHY_ID_RTL8211C:
 		phydev->interface = PHY_INTERFACE_MODE_RGMII;
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 16597d3fa011..8fe1d55371ac 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -63,12 +63,9 @@
 
 #define PHY_BRCM_AUTO_PWRDWN_ENABLE	0x00000001
 #define PHY_BRCM_RX_REFCLK_UNUSED	0x00000002
-#define PHY_BRCM_STD_IBND_DISABLE	0x00000004
-#define PHY_BRCM_EXT_IBND_RX_ENABLE	0x00000008
-#define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00000010
-#define PHY_BRCM_CLEAR_RGMII_MODE	0x00000020
-#define PHY_BRCM_DIS_TXCRXC_NOENRGY	0x00000040
-#define PHY_BRCM_EN_MASTER_MODE		0x00000080
+#define PHY_BRCM_CLEAR_RGMII_MODE	0x00000004
+#define PHY_BRCM_DIS_TXCRXC_NOENRGY	0x00000008
+#define PHY_BRCM_EN_MASTER_MODE		0x00000010
 
 /* Broadcom BCM7xxx specific workarounds */
 #define PHY_BRCM_7XXX_REV(x)		(((x) >> 8) & 0xff)
-- 
cgit v1.2.3


From 7331d1d4622ba7e668ec19cfba2ed7feb4a3084e Mon Sep 17 00:00:00 2001
From: Pavana Sharma <pavana.sharma@digi.com>
Date: Tue, 16 Feb 2021 20:20:53 +0100
Subject: net: phy: Add 5GBASER interface mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add 5GBASE-R phy interface mode

Signed-off-by: Pavana Sharma <pavana.sharma@digi.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/phy.rst | 6 ++++++
 include/linux/phy.h              | 4 ++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst
index 70136cc9e25e..06adfc2afcf0 100644
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -267,6 +267,12 @@ Some of the interface modes are described below:
     duplex, pause or other settings.  This is dependent on the MAC and/or
     PHY behaviour.
 
+``PHY_INTERFACE_MODE_5GBASER``
+    This is the IEEE 802.3 Clause 129 defined 5GBASE-R protocol. It is
+    identical to the 10GBASE-R protocol defined in Clause 49, with the
+    exception that it operates at half the frequency. Please refer to the
+    IEEE standard for the definition.
+
 ``PHY_INTERFACE_MODE_10GBASER``
     This is the IEEE 802.3 Clause 49 defined 10GBASE-R protocol used with
     various different mediums. Please refer to the IEEE standard for a
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 5d7c4084ade9..0d537f59b77f 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -107,6 +107,7 @@ extern const int phy_10gbit_features_array[1];
  * @PHY_INTERFACE_MODE_100BASEX: 100 BaseX
  * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX
  * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX
+ * @PHY_INTERFACE_MODE_5GBASER: 5G BaseR
  * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI
  * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface
  * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR
@@ -139,6 +140,7 @@ typedef enum {
 	PHY_INTERFACE_MODE_100BASEX,
 	PHY_INTERFACE_MODE_1000BASEX,
 	PHY_INTERFACE_MODE_2500BASEX,
+	PHY_INTERFACE_MODE_5GBASER,
 	PHY_INTERFACE_MODE_RXAUI,
 	PHY_INTERFACE_MODE_XAUI,
 	/* 10GBASE-R, XFI, SFI - single lane 10G Serdes */
@@ -209,6 +211,8 @@ static inline const char *phy_modes(phy_interface_t interface)
 		return "1000base-x";
 	case PHY_INTERFACE_MODE_2500BASEX:
 		return "2500base-x";
+	case PHY_INTERFACE_MODE_5GBASER:
+		return "5gbase-r";
 	case PHY_INTERFACE_MODE_RXAUI:
 		return "rxaui";
 	case PHY_INTERFACE_MODE_XAUI:
-- 
cgit v1.2.3


From d8ea7ff3995ead5193313c72c0d97c9c16c83be9 Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Tue, 16 Feb 2021 22:42:03 +0100
Subject: net: mscc: ocelot: Add support for MRP

Add basic support for MRP. The HW will just trap all MRP frames on the
ring ports to CPU and allow the SW to process them. In this way it is
possible to for this node to behave both as MRM and MRC.

Current limitations are:
- it doesn't support Interconnect roles.
- it supports only a single ring.
- the HW should be able to do forwarding of MRP Test frames so the SW
  will not need to do this. So it would be able to have the role MRC
  without SW support.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/Makefile     |   1 +
 drivers/net/ethernet/mscc/ocelot.c     |  10 +-
 drivers/net/ethernet/mscc/ocelot_mrp.c | 175 +++++++++++++++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_net.c |  60 +++++++++++
 include/linux/dsa/ocelot.h             |   5 +
 include/soc/mscc/ocelot.h              |  45 +++++++++
 6 files changed, 295 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mscc/ocelot_mrp.c

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mscc/Makefile b/drivers/net/ethernet/mscc/Makefile
index 346bba2730ad..722c27694b21 100644
--- a/drivers/net/ethernet/mscc/Makefile
+++ b/drivers/net/ethernet/mscc/Makefile
@@ -8,6 +8,7 @@ mscc_ocelot_switch_lib-y := \
 	ocelot_flower.o \
 	ocelot_ptp.o \
 	ocelot_devlink.o
+mscc_ocelot_switch_lib-$(CONFIG_BRIDGE_MRP) += ocelot_mrp.o
 obj-$(CONFIG_MSCC_OCELOT_SWITCH) += mscc_ocelot.o
 mscc_ocelot-y := \
 	ocelot_vsc7514.o \
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 5d13087c85d6..46e5c9136bac 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -687,7 +687,7 @@ static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh)
 int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
 {
 	struct skb_shared_hwtstamps *shhwtstamps;
-	u64 tod_in_ns, full_ts_in_ns;
+	u64 tod_in_ns, full_ts_in_ns, cpuq;
 	u64 timestamp, src_port, len;
 	u32 xfh[OCELOT_TAG_LEN / 4];
 	struct net_device *dev;
@@ -704,6 +704,7 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
 	ocelot_xfh_get_src_port(xfh, &src_port);
 	ocelot_xfh_get_len(xfh, &len);
 	ocelot_xfh_get_rew_val(xfh, &timestamp);
+	ocelot_xfh_get_cpuq(xfh, &cpuq);
 
 	if (WARN_ON(src_port >= ocelot->num_phys_ports))
 		return -EINVAL;
@@ -770,6 +771,13 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
 		skb->offload_fwd_mark = 1;
 
 	skb->protocol = eth_type_trans(skb, dev);
+
+#if IS_ENABLED(CONFIG_BRIDGE_MRP)
+	if (skb->protocol == cpu_to_be16(ETH_P_MRP) &&
+	    cpuq & BIT(OCELOT_MRP_CPUQ))
+		skb->offload_fwd_mark = 0;
+#endif
+
 	*nskb = skb;
 
 	return 0;
diff --git a/drivers/net/ethernet/mscc/ocelot_mrp.c b/drivers/net/ethernet/mscc/ocelot_mrp.c
new file mode 100644
index 000000000000..683da320bfd8
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_mrp.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/* Microsemi Ocelot Switch driver
+ *
+ * This contains glue logic between the switchdev driver operations and the
+ * mscc_ocelot_switch_lib.
+ *
+ * Copyright (c) 2017, 2019 Microsemi Corporation
+ * Copyright 2020-2021 NXP Semiconductors
+ */
+
+#include <linux/if_bridge.h>
+#include <linux/mrp_bridge.h>
+#include <soc/mscc/ocelot_vcap.h>
+#include <uapi/linux/mrp_bridge.h>
+#include "ocelot.h"
+#include "ocelot_vcap.h"
+
+static int ocelot_mrp_del_vcap(struct ocelot *ocelot, int port)
+{
+	struct ocelot_vcap_block *block_vcap_is2;
+	struct ocelot_vcap_filter *filter;
+
+	block_vcap_is2 = &ocelot->block[VCAP_IS2];
+	filter = ocelot_vcap_block_find_filter_by_id(block_vcap_is2, port,
+						     false);
+	if (!filter)
+		return 0;
+
+	return ocelot_vcap_filter_del(ocelot, filter);
+}
+
+int ocelot_mrp_add(struct ocelot *ocelot, int port,
+		   const struct switchdev_obj_mrp *mrp)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	struct ocelot_port_private *priv;
+	struct net_device *dev;
+
+	if (!ocelot_port)
+		return -EOPNOTSUPP;
+
+	priv = container_of(ocelot_port, struct ocelot_port_private, port);
+	dev = priv->dev;
+
+	if (mrp->p_port != dev && mrp->s_port != dev)
+		return 0;
+
+	if (ocelot->mrp_ring_id != 0 &&
+	    ocelot->mrp_s_port &&
+	    ocelot->mrp_p_port)
+		return -EINVAL;
+
+	if (mrp->p_port == dev)
+		ocelot->mrp_p_port = dev;
+
+	if (mrp->s_port == dev)
+		ocelot->mrp_s_port = dev;
+
+	ocelot->mrp_ring_id = mrp->ring_id;
+
+	return 0;
+}
+EXPORT_SYMBOL(ocelot_mrp_add);
+
+int ocelot_mrp_del(struct ocelot *ocelot, int port,
+		   const struct switchdev_obj_mrp *mrp)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	struct ocelot_port_private *priv;
+	struct net_device *dev;
+
+	if (!ocelot_port)
+		return -EOPNOTSUPP;
+
+	priv = container_of(ocelot_port, struct ocelot_port_private, port);
+	dev = priv->dev;
+
+	if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev)
+		return 0;
+
+	if (ocelot->mrp_ring_id == 0 &&
+	    !ocelot->mrp_s_port &&
+	    !ocelot->mrp_p_port)
+		return -EINVAL;
+
+	if (ocelot_mrp_del_vcap(ocelot, priv->chip_port))
+		return -EINVAL;
+
+	if (ocelot->mrp_p_port == dev)
+		ocelot->mrp_p_port = NULL;
+
+	if (ocelot->mrp_s_port == dev)
+		ocelot->mrp_s_port = NULL;
+
+	ocelot->mrp_ring_id = 0;
+
+	return 0;
+}
+EXPORT_SYMBOL(ocelot_mrp_del);
+
+int ocelot_mrp_add_ring_role(struct ocelot *ocelot, int port,
+			     const struct switchdev_obj_ring_role_mrp *mrp)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	struct ocelot_vcap_filter *filter;
+	struct ocelot_port_private *priv;
+	struct net_device *dev;
+	int err;
+
+	if (!ocelot_port)
+		return -EOPNOTSUPP;
+
+	priv = container_of(ocelot_port, struct ocelot_port_private, port);
+	dev = priv->dev;
+
+	if (ocelot->mrp_ring_id != mrp->ring_id)
+		return -EINVAL;
+
+	if (!mrp->sw_backup)
+		return -EOPNOTSUPP;
+
+	if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev)
+		return 0;
+
+	filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
+	if (!filter)
+		return -ENOMEM;
+
+	filter->key_type = OCELOT_VCAP_KEY_ETYPE;
+	filter->prio = 1;
+	filter->id.cookie = priv->chip_port;
+	filter->id.tc_offload = false;
+	filter->block_id = VCAP_IS2;
+	filter->type = OCELOT_VCAP_FILTER_OFFLOAD;
+	filter->ingress_port_mask = BIT(priv->chip_port);
+	*(__be16 *)filter->key.etype.etype.value = htons(ETH_P_MRP);
+	*(__be16 *)filter->key.etype.etype.mask = htons(0xffff);
+	filter->action.mask_mode = OCELOT_MASK_MODE_PERMIT_DENY;
+	filter->action.port_mask = 0x0;
+	filter->action.cpu_copy_ena = true;
+	filter->action.cpu_qu_num = OCELOT_MRP_CPUQ;
+
+	err = ocelot_vcap_filter_add(ocelot, filter, NULL);
+	if (err)
+		kfree(filter);
+
+	return err;
+}
+EXPORT_SYMBOL(ocelot_mrp_add_ring_role);
+
+int ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port,
+			     const struct switchdev_obj_ring_role_mrp *mrp)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	struct ocelot_port_private *priv;
+	struct net_device *dev;
+
+	if (!ocelot_port)
+		return -EOPNOTSUPP;
+
+	priv = container_of(ocelot_port, struct ocelot_port_private, port);
+	dev = priv->dev;
+
+	if (ocelot->mrp_ring_id != mrp->ring_id)
+		return -EINVAL;
+
+	if (!mrp->sw_backup)
+		return -EOPNOTSUPP;
+
+	if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev)
+		return 0;
+
+	return ocelot_mrp_del_vcap(ocelot, priv->chip_port);
+}
+EXPORT_SYMBOL(ocelot_mrp_del_ring_role);
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 6518262532f0..12cb6867a2d0 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -1010,6 +1010,52 @@ static int ocelot_port_obj_del_mdb(struct net_device *dev,
 	return ocelot_port_mdb_del(ocelot, port, mdb);
 }
 
+static int ocelot_port_obj_mrp_add(struct net_device *dev,
+				   const struct switchdev_obj_mrp *mrp)
+{
+	struct ocelot_port_private *priv = netdev_priv(dev);
+	struct ocelot_port *ocelot_port = &priv->port;
+	struct ocelot *ocelot = ocelot_port->ocelot;
+	int port = priv->chip_port;
+
+	return ocelot_mrp_add(ocelot, port, mrp);
+}
+
+static int ocelot_port_obj_mrp_del(struct net_device *dev,
+				   const struct switchdev_obj_mrp *mrp)
+{
+	struct ocelot_port_private *priv = netdev_priv(dev);
+	struct ocelot_port *ocelot_port = &priv->port;
+	struct ocelot *ocelot = ocelot_port->ocelot;
+	int port = priv->chip_port;
+
+	return ocelot_mrp_del(ocelot, port, mrp);
+}
+
+static int
+ocelot_port_obj_mrp_add_ring_role(struct net_device *dev,
+				  const struct switchdev_obj_ring_role_mrp *mrp)
+{
+	struct ocelot_port_private *priv = netdev_priv(dev);
+	struct ocelot_port *ocelot_port = &priv->port;
+	struct ocelot *ocelot = ocelot_port->ocelot;
+	int port = priv->chip_port;
+
+	return ocelot_mrp_add_ring_role(ocelot, port, mrp);
+}
+
+static int
+ocelot_port_obj_mrp_del_ring_role(struct net_device *dev,
+				  const struct switchdev_obj_ring_role_mrp *mrp)
+{
+	struct ocelot_port_private *priv = netdev_priv(dev);
+	struct ocelot_port *ocelot_port = &priv->port;
+	struct ocelot *ocelot = ocelot_port->ocelot;
+	int port = priv->chip_port;
+
+	return ocelot_mrp_del_ring_role(ocelot, port, mrp);
+}
+
 static int ocelot_port_obj_add(struct net_device *dev,
 			       const struct switchdev_obj *obj,
 			       struct netlink_ext_ack *extack)
@@ -1024,6 +1070,13 @@ static int ocelot_port_obj_add(struct net_device *dev,
 	case SWITCHDEV_OBJ_ID_PORT_MDB:
 		ret = ocelot_port_obj_add_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj));
 		break;
+	case SWITCHDEV_OBJ_ID_MRP:
+		ret = ocelot_port_obj_mrp_add(dev, SWITCHDEV_OBJ_MRP(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
+		ret = ocelot_port_obj_mrp_add_ring_role(dev,
+							SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -1044,6 +1097,13 @@ static int ocelot_port_obj_del(struct net_device *dev,
 	case SWITCHDEV_OBJ_ID_PORT_MDB:
 		ret = ocelot_port_obj_del_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj));
 		break;
+	case SWITCHDEV_OBJ_ID_MRP:
+		ret = ocelot_port_obj_mrp_del(dev, SWITCHDEV_OBJ_MRP(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
+		ret = ocelot_port_obj_mrp_del_ring_role(dev,
+							SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
index c6bc45ae5e03..4265f328681a 100644
--- a/include/linux/dsa/ocelot.h
+++ b/include/linux/dsa/ocelot.h
@@ -160,6 +160,11 @@ static inline void ocelot_xfh_get_src_port(void *extraction, u64 *src_port)
 	packing(extraction, src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0);
 }
 
+static inline void ocelot_xfh_get_cpuq(void *extraction, u64 *cpuq)
+{
+	packing(extraction, cpuq, 28, 20, OCELOT_TAG_LEN, UNPACK, 0);
+}
+
 static inline void ocelot_xfh_get_qos_class(void *extraction, u64 *qos_class)
 {
 	packing(extraction, qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0);
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 1f2d90976564..425ff29d9389 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -112,6 +112,8 @@
 #define REG_RESERVED_ADDR		0xffffffff
 #define REG_RESERVED(reg)		REG(reg, REG_RESERVED_ADDR)
 
+#define OCELOT_MRP_CPUQ			7
+
 enum ocelot_target {
 	ANA = 1,
 	QS,
@@ -677,6 +679,12 @@ struct ocelot {
 	/* Protects the PTP clock */
 	spinlock_t			ptp_clock_lock;
 	struct ptp_pin_desc		ptp_pins[OCELOT_PTP_PINS_NUM];
+
+#if IS_ENABLED(CONFIG_BRIDGE_MRP)
+	u16				mrp_ring_id;
+	struct net_device		*mrp_p_port;
+	struct net_device		*mrp_s_port;
+#endif
 };
 
 struct ocelot_policer {
@@ -874,4 +882,41 @@ int ocelot_sb_occ_tc_port_bind_get(struct ocelot *ocelot, int port,
 				   enum devlink_sb_pool_type pool_type,
 				   u32 *p_cur, u32 *p_max);
 
+#if IS_ENABLED(CONFIG_BRIDGE_MRP)
+int ocelot_mrp_add(struct ocelot *ocelot, int port,
+		   const struct switchdev_obj_mrp *mrp);
+int ocelot_mrp_del(struct ocelot *ocelot, int port,
+		   const struct switchdev_obj_mrp *mrp);
+int ocelot_mrp_add_ring_role(struct ocelot *ocelot, int port,
+			     const struct switchdev_obj_ring_role_mrp *mrp);
+int ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port,
+			     const struct switchdev_obj_ring_role_mrp *mrp);
+#else
+static inline int ocelot_mrp_add(struct ocelot *ocelot, int port,
+				 const struct switchdev_obj_mrp *mrp)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int ocelot_mrp_del(struct ocelot *ocelot, int port,
+				 const struct switchdev_obj_mrp *mrp)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int
+ocelot_mrp_add_ring_role(struct ocelot *ocelot, int port,
+			 const struct switchdev_obj_ring_role_mrp *mrp)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int
+ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port,
+			 const struct switchdev_obj_ring_role_mrp *mrp)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From 3afd0218992a8d1398e9791d6c2edd4c948ae7ee Mon Sep 17 00:00:00 2001
From: Robert Hancock <robert.hancock@calian.com>
Date: Tue, 16 Feb 2021 16:54:52 -0600
Subject: net: phy: broadcom: Set proper 1000BaseX/SGMII interface mode for
 BCM54616S

The default configuration for the BCM54616S PHY may not match the desired
mode when using 1000BaseX or SGMII interface modes, such as when it is on
an SFP module. Add code to explicitly set the correct mode using
programming sequences provided by Bel-Fuse:

https://www.belfuse.com/resources/datasheets/powersolutions/ds-bps-sfp-1gbt-05-series.pdf
https://www.belfuse.com/resources/datasheets/powersolutions/ds-bps-sfp-1gbt-06-series.pdf

Signed-off-by: Robert Hancock <robert.hancock@calian.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 84 +++++++++++++++++++++++++++++++++++++++-------
 include/linux/brcmphy.h    |  4 +++
 2 files changed, 76 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 91fbd26c809e..8e7fc3368380 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -103,6 +103,64 @@ static int bcm54612e_config_init(struct phy_device *phydev)
 	return 0;
 }
 
+static int bcm54616s_config_init(struct phy_device *phydev)
+{
+	int rc, val;
+
+	if (phydev->interface != PHY_INTERFACE_MODE_SGMII &&
+	    phydev->interface != PHY_INTERFACE_MODE_1000BASEX)
+		return 0;
+
+	/* Ensure proper interface mode is selected. */
+	/* Disable RGMII mode */
+	val = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
+	if (val < 0)
+		return val;
+	val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_EN;
+	val |= MII_BCM54XX_AUXCTL_MISC_WREN;
+	rc = bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
+				  val);
+	if (rc < 0)
+		return rc;
+
+	/* Select 1000BASE-X register set (primary SerDes) */
+	val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE);
+	if (val < 0)
+		return val;
+	val |= BCM54XX_SHD_MODE_1000BX;
+	rc = bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE, val);
+	if (rc < 0)
+		return rc;
+
+	/* Power down SerDes interface */
+	rc = phy_set_bits(phydev, MII_BMCR, BMCR_PDOWN);
+	if (rc < 0)
+		return rc;
+
+	/* Select proper interface mode */
+	val &= ~BCM54XX_SHD_INTF_SEL_MASK;
+	val |= phydev->interface == PHY_INTERFACE_MODE_SGMII ?
+		BCM54XX_SHD_INTF_SEL_SGMII :
+		BCM54XX_SHD_INTF_SEL_GBIC;
+	rc = bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE, val);
+	if (rc < 0)
+		return rc;
+
+	/* Power up SerDes interface */
+	rc = phy_clear_bits(phydev, MII_BMCR, BMCR_PDOWN);
+	if (rc < 0)
+		return rc;
+
+	/* Select copper register set */
+	val &= ~BCM54XX_SHD_MODE_1000BX;
+	rc = bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE, val);
+	if (rc < 0)
+		return rc;
+
+	/* Power up copper interface */
+	return phy_clear_bits(phydev, MII_BMCR, BMCR_PDOWN);
+}
+
 /* Needs SMDSP clock enabled via bcm54xx_phydsp_config() */
 static int bcm50610_a0_workaround(struct phy_device *phydev)
 {
@@ -283,15 +341,17 @@ static int bcm54xx_config_init(struct phy_device *phydev)
 
 	bcm54xx_adjust_rxrefclk(phydev);
 
-	if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54210E) {
+	switch (BRCM_PHY_MODEL(phydev)) {
+	case PHY_ID_BCM54210E:
 		err = bcm54210e_config_init(phydev);
-		if (err)
-			return err;
-	} else if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54612E) {
+		break;
+	case PHY_ID_BCM54612E:
 		err = bcm54612e_config_init(phydev);
-		if (err)
-			return err;
-	} else if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) {
+		break;
+	case PHY_ID_BCM54616S:
+		err = bcm54616s_config_init(phydev);
+		break;
+	case PHY_ID_BCM54810:
 		/* For BCM54810, we need to disable BroadR-Reach function */
 		val = bcm_phy_read_exp(phydev,
 				       BCM54810_EXP_BROADREACH_LRE_MISC_CTL);
@@ -299,9 +359,10 @@ static int bcm54xx_config_init(struct phy_device *phydev)
 		err = bcm_phy_write_exp(phydev,
 					BCM54810_EXP_BROADREACH_LRE_MISC_CTL,
 					val);
-		if (err < 0)
-			return err;
+		break;
 	}
+	if (err)
+		return err;
 
 	bcm54xx_phydsp_config(phydev);
 
@@ -390,7 +451,7 @@ struct bcm54616s_phy_priv {
 static int bcm54616s_probe(struct phy_device *phydev)
 {
 	struct bcm54616s_phy_priv *priv;
-	int val, intf_sel;
+	int val;
 
 	priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL);
 	if (!priv)
@@ -408,8 +469,7 @@ static int bcm54616s_probe(struct phy_device *phydev)
 	 * RGMII-1000Base-X is properly supported, but RGMII-100Base-FX
 	 * support is still missing as of now.
 	 */
-	intf_sel = (val & BCM54XX_SHD_INTF_SEL_MASK) >> 1;
-	if (intf_sel == 1) {
+	if ((val & BCM54XX_SHD_INTF_SEL_MASK) == BCM54XX_SHD_INTF_SEL_RGMII) {
 		val = bcm_phy_read_shadow(phydev, BCM54616S_SHD_100FX_CTRL);
 		if (val < 0)
 			return val;
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 8fe1d55371ac..c2c2147dfeb8 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -129,6 +129,7 @@
 
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC			0x07
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN	0x0010
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_EN	0x0080
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN	0x0100
 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX		0x0200
 #define MII_BCM54XX_AUXCTL_MISC_WREN			0x8000
@@ -216,6 +217,9 @@
 /* 11111: Mode Control Register */
 #define BCM54XX_SHD_MODE		0x1f
 #define BCM54XX_SHD_INTF_SEL_MASK	GENMASK(2, 1)	/* INTERF_SEL[1:0] */
+#define BCM54XX_SHD_INTF_SEL_RGMII	0x02
+#define BCM54XX_SHD_INTF_SEL_SGMII	0x04
+#define BCM54XX_SHD_INTF_SEL_GBIC	0x06
 #define BCM54XX_SHD_MODE_1000BX		BIT(0)	/* Enable 1000-X registers */
 
 /*
-- 
cgit v1.2.3


From b834489bceccc64641684eee5e93275cdf5f465b Mon Sep 17 00:00:00 2001
From: Robert Hancock <robert.hancock@calian.com>
Date: Tue, 16 Feb 2021 16:54:53 -0600
Subject: net: phy: Add is_on_sfp_module flag and phy_on_sfp helper

Add a flag and helper function to indicate that a PHY device is part of
an SFP module, which is set on attach. This can be used by PHY drivers
to handle SFP-specific quirks or behavior.

Signed-off-by: Robert Hancock <robert.hancock@calian.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c |  2 ++
 include/linux/phy.h          | 11 +++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 05261698bf74..d6ac3ed38197 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1377,6 +1377,8 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 
 		if (phydev->sfp_bus_attached)
 			dev->sfp_bus = phydev->sfp_bus;
+		else if (dev->sfp_bus)
+			phydev->is_on_sfp_module = true;
 	}
 
 	/* Some Ethernet drivers try to connect to a PHY device before
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 0d537f59b77f..1a12e4436b5b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -492,6 +492,7 @@ struct macsec_ops;
  * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal.
  * @loopback_enabled: Set true if this PHY has been loopbacked successfully.
  * @downshifted_rate: Set true if link speed has been downshifted.
+ * @is_on_sfp_module: Set true if PHY is located on an SFP module.
  * @state: State of the PHY for management purposes
  * @dev_flags: Device-specific flags used by the PHY driver.
  * @irq: IRQ number of the PHY's interrupt (-1 if none)
@@ -565,6 +566,7 @@ struct phy_device {
 	unsigned sysfs_links:1;
 	unsigned loopback_enabled:1;
 	unsigned downshifted_rate:1;
+	unsigned is_on_sfp_module:1;
 
 	unsigned autoneg:1;
 	/* The most recently read link state */
@@ -1296,6 +1298,15 @@ static inline bool phy_is_internal(struct phy_device *phydev)
 	return phydev->is_internal;
 }
 
+/**
+ * phy_on_sfp - Convenience function for testing if a PHY is on an SFP module
+ * @phydev: the phy_device struct
+ */
+static inline bool phy_on_sfp(struct phy_device *phydev)
+{
+	return phydev->is_on_sfp_module;
+}
+
 /**
  * phy_interface_mode_is_rgmii - Convenience function for testing if a
  * PHY interface mode is RGMII (all variants)
-- 
cgit v1.2.3


From 1f975074634a63f014e2b7e76852ee6d6005a91d Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 12 Feb 2021 18:10:43 +0100
Subject: libnvdimm: Make remove callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All drivers return 0 in their remove callback and the driver core ignores
the return value of nvdimm_bus_remove() anyhow. So simplify by changing
the driver remove callback to return void and return 0 unconditionally
to the upper layer.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20210212171043.2136580-2-u.kleine-koenig@pengutronix.de
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/pmem/compat.c |  3 +--
 drivers/nvdimm/blk.c      |  3 +--
 drivers/nvdimm/bus.c      | 13 +++++--------
 drivers/nvdimm/dimm.c     |  4 +---
 drivers/nvdimm/pmem.c     |  4 +---
 drivers/nvdimm/region.c   |  4 +---
 include/linux/nd.h        |  2 +-
 7 files changed, 11 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/pmem/compat.c b/drivers/dax/pmem/compat.c
index 863c114fd88c..d81dc35fd65d 100644
--- a/drivers/dax/pmem/compat.c
+++ b/drivers/dax/pmem/compat.c
@@ -41,10 +41,9 @@ static int dax_pmem_compat_release(struct device *dev, void *data)
 	return 0;
 }
 
-static int dax_pmem_compat_remove(struct device *dev)
+static void dax_pmem_compat_remove(struct device *dev)
 {
 	device_for_each_child(dev, NULL, dax_pmem_compat_release);
-	return 0;
 }
 
 static struct nd_device_driver dax_pmem_compat_driver = {
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 22e5617b2cea..8a53728e13e6 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -310,11 +310,10 @@ static int nd_blk_probe(struct device *dev)
 		return nsblk_attach_disk(nsblk);
 }
 
-static int nd_blk_remove(struct device *dev)
+static void nd_blk_remove(struct device *dev)
 {
 	if (is_nd_btt(dev))
 		nvdimm_namespace_detach_btt(to_nd_btt(dev));
-	return 0;
 }
 
 static struct nd_device_driver nd_blk_driver = {
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 2304c6183822..48f0985ca8a0 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -113,18 +113,17 @@ static int nvdimm_bus_remove(struct device *dev)
 	struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
 	struct module *provider = to_bus_provider(dev);
 	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
-	int rc = 0;
 
 	if (nd_drv->remove) {
 		debug_nvdimm_lock(dev);
-		rc = nd_drv->remove(dev);
+		nd_drv->remove(dev);
 		debug_nvdimm_unlock(dev);
 	}
 
-	dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
-			dev_name(dev), rc);
+	dev_dbg(&nvdimm_bus->dev, "%s.remove(%s)\n", dev->driver->name,
+			dev_name(dev));
 	module_put(provider);
-	return rc;
+	return 0;
 }
 
 static void nvdimm_bus_shutdown(struct device *dev)
@@ -427,7 +426,7 @@ static void free_badrange_list(struct list_head *badrange_list)
 	list_del_init(badrange_list);
 }
 
-static int nd_bus_remove(struct device *dev)
+static void nd_bus_remove(struct device *dev)
 {
 	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
 
@@ -446,8 +445,6 @@ static int nd_bus_remove(struct device *dev)
 	spin_unlock(&nvdimm_bus->badrange.lock);
 
 	nvdimm_bus_destroy_ndctl(nvdimm_bus);
-
-	return 0;
 }
 
 static int nd_bus_probe(struct device *dev)
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index 94be3ae1d29f..91d9163ee303 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -113,7 +113,7 @@ static int nvdimm_probe(struct device *dev)
 	return rc;
 }
 
-static int nvdimm_remove(struct device *dev)
+static void nvdimm_remove(struct device *dev)
 {
 	struct nvdimm_drvdata *ndd = dev_get_drvdata(dev);
 
@@ -121,8 +121,6 @@ static int nvdimm_remove(struct device *dev)
 	dev_set_drvdata(dev, NULL);
 	nvdimm_bus_unlock(dev);
 	put_ndd(ndd);
-
-	return 0;
 }
 
 static struct nd_device_driver nvdimm_driver = {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 875076b0ea6c..062f0f22bac9 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -564,7 +564,7 @@ static int nd_pmem_probe(struct device *dev)
 	return pmem_attach_disk(dev, ndns);
 }
 
-static int nd_pmem_remove(struct device *dev)
+static void nd_pmem_remove(struct device *dev)
 {
 	struct pmem_device *pmem = dev_get_drvdata(dev);
 
@@ -579,8 +579,6 @@ static int nd_pmem_remove(struct device *dev)
 		pmem->bb_state = NULL;
 	}
 	nvdimm_flush(to_nd_region(dev->parent), NULL);
-
-	return 0;
 }
 
 static void nd_pmem_shutdown(struct device *dev)
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index bfce87ed72ab..e0c34120df37 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -87,7 +87,7 @@ static int child_unregister(struct device *dev, void *data)
 	return 0;
 }
 
-static int nd_region_remove(struct device *dev)
+static void nd_region_remove(struct device *dev)
 {
 	struct nd_region *nd_region = to_nd_region(dev);
 
@@ -108,8 +108,6 @@ static int nd_region_remove(struct device *dev)
 	 */
 	sysfs_put(nd_region->bb_state);
 	nd_region->bb_state = NULL;
-
-	return 0;
 }
 
 static int child_notify(struct device *dev, void *data)
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 55c735997805..cec526c8043d 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -26,7 +26,7 @@ struct nd_device_driver {
 	struct device_driver drv;
 	unsigned long type;
 	int (*probe)(struct device *dev);
-	int (*remove)(struct device *dev);
+	void (*remove)(struct device *dev);
 	void (*shutdown)(struct device *dev);
 	void (*notify)(struct device *dev, enum nvdimm_event event);
 };
-- 
cgit v1.2.3


From 4cdadfd5e0a70017fec735b7b6d7f2f731842dc6 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 16 Feb 2021 20:09:50 -0800
Subject: cxl/mem: Introduce a driver for CXL-2.0-Type-3 endpoints

The CXL.mem protocol allows a device to act as a provider of "System
RAM" and/or "Persistent Memory" that is fully coherent as if the memory
was attached to the typical CPU memory controller.

With the CXL-2.0 specification a PCI endpoint can implement a "Type-3"
device interface and give the operating system control over "Host
Managed Device Memory". See section 2.3 Type 3 CXL Device.

The memory range exported by the device may optionally be described by
the platform firmware memory map, or by infrastructure like LIBNVDIMM to
provision persistent memory capacity from one, or more, CXL.mem devices.

A pre-requisite for Linux-managed memory-capacity provisioning is this
cxl_mem driver that can speak the mailbox protocol defined in section
8.2.8.4 Mailbox Registers.

For now just land the initial driver boiler-plate and Documentation/
infrastructure.

Signed-off-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: David Rientjes <rientjes@google.com> (v1)
Cc: Jonathan Corbet <corbet@lwn.net>
Link: https://www.computeexpresslink.org/download-the-specification
Link: https://lore.kernel.org/r/20210217040958.1354670-2-ben.widawsky@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/driver-api/cxl/index.rst          | 12 +++++
 Documentation/driver-api/cxl/memory-devices.rst | 15 ++++++
 Documentation/driver-api/index.rst              |  1 +
 drivers/Kconfig                                 |  1 +
 drivers/Makefile                                |  1 +
 drivers/cxl/Kconfig                             | 35 ++++++++++++++
 drivers/cxl/Makefile                            |  4 ++
 drivers/cxl/mem.c                               | 62 +++++++++++++++++++++++++
 drivers/cxl/pci.h                               | 17 +++++++
 include/linux/pci_ids.h                         |  1 +
 10 files changed, 149 insertions(+)
 create mode 100644 Documentation/driver-api/cxl/index.rst
 create mode 100644 Documentation/driver-api/cxl/memory-devices.rst
 create mode 100644 drivers/cxl/Kconfig
 create mode 100644 drivers/cxl/Makefile
 create mode 100644 drivers/cxl/mem.c
 create mode 100644 drivers/cxl/pci.h

(limited to 'include/linux')

diff --git a/Documentation/driver-api/cxl/index.rst b/Documentation/driver-api/cxl/index.rst
new file mode 100644
index 000000000000..036e49553542
--- /dev/null
+++ b/Documentation/driver-api/cxl/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Compute Express Link
+====================
+
+.. toctree::
+   :maxdepth: 1
+
+   memory-devices
+
+.. only::  subproject and html
diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst
new file mode 100644
index 000000000000..71617e9002f1
--- /dev/null
+++ b/Documentation/driver-api/cxl/memory-devices.rst
@@ -0,0 +1,15 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+===================================
+Compute Express Link Memory Devices
+===================================
+
+A Compute Express Link Memory Device is a CXL component that implements the
+CXL.mem protocol. It contains some amount of volatile memory, persistent memory,
+or both. It is enumerated as a PCI device for configuration and passing
+messages over an MMIO mailbox. Its contribution to the System Physical
+Address space is handled via HDM (Host Managed Device Memory) decoders
+that optionally define a device's contribution to an interleaved address
+range across multiple devices underneath a host-bridge or interleaved
+across host-bridges.
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 2456d0a97ed8..d246a18fd78f 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -35,6 +35,7 @@ available subsections can be seen below.
    usb/index
    firewire
    pci/index
+   cxl/index
    spi
    i2c
    ipmb
diff --git a/drivers/Kconfig b/drivers/Kconfig
index dcecc9f6e33f..62c753a73651 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -6,6 +6,7 @@ menu "Device Drivers"
 source "drivers/amba/Kconfig"
 source "drivers/eisa/Kconfig"
 source "drivers/pci/Kconfig"
+source "drivers/cxl/Kconfig"
 source "drivers/pcmcia/Kconfig"
 source "drivers/rapidio/Kconfig"
 
diff --git a/drivers/Makefile b/drivers/Makefile
index fd11b9ac4cc3..678ea810410f 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -73,6 +73,7 @@ obj-$(CONFIG_NVM)		+= lightnvm/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
 obj-$(CONFIG_DAX)		+= dax/
+obj-$(CONFIG_CXL_BUS)		+= cxl/
 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS)		+= nubus/
 obj-y				+= macintosh/
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
new file mode 100644
index 000000000000..9e80b311e928
--- /dev/null
+++ b/drivers/cxl/Kconfig
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig CXL_BUS
+	tristate "CXL (Compute Express Link) Devices Support"
+	depends on PCI
+	help
+	  CXL is a bus that is electrically compatible with PCI Express, but
+	  layers three protocols on that signalling (CXL.io, CXL.cache, and
+	  CXL.mem). The CXL.cache protocol allows devices to hold cachelines
+	  locally, the CXL.mem protocol allows devices to be fully coherent
+	  memory targets, the CXL.io protocol is equivalent to PCI Express.
+	  Say 'y' to enable support for the configuration and management of
+	  devices supporting these protocols.
+
+if CXL_BUS
+
+config CXL_MEM
+	tristate "CXL.mem: Memory Devices"
+	help
+	  The CXL.mem protocol allows a device to act as a provider of
+	  "System RAM" and/or "Persistent Memory" that is fully coherent
+	  as if the memory was attached to the typical CPU memory
+	  controller.
+
+	  Say 'y/m' to enable a driver (named "cxl_mem.ko" when built as
+	  a module) that will attach to CXL.mem devices for
+	  configuration, provisioning, and health monitoring. This
+	  driver is required for dynamic provisioning of CXL.mem
+	  attached memory which is a prerequisite for persistent memory
+	  support. Typically volatile memory is mapped by platform
+	  firmware and included in the platform memory map, but in some
+	  cases the OS is responsible for mapping that memory. See
+	  Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification.
+
+	  If unsure say 'm'.
+endif
diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile
new file mode 100644
index 000000000000..4a30f7c3fc4a
--- /dev/null
+++ b/drivers/cxl/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+
+cxl_mem-y := mem.o
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
new file mode 100644
index 000000000000..ce33c5ee77c9
--- /dev/null
+++ b/drivers/cxl/mem.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include "pci.h"
+
+static int cxl_mem_dvsec(struct pci_dev *pdev, int dvsec)
+{
+	int pos;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DVSEC);
+	if (!pos)
+		return 0;
+
+	while (pos) {
+		u16 vendor, id;
+
+		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vendor);
+		pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, &id);
+		if (vendor == PCI_DVSEC_VENDOR_ID_CXL && dvsec == id)
+			return pos;
+
+		pos = pci_find_next_ext_capability(pdev, pos,
+						   PCI_EXT_CAP_ID_DVSEC);
+	}
+
+	return 0;
+}
+
+static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+	int regloc;
+
+	regloc = cxl_mem_dvsec(pdev, PCI_DVSEC_ID_CXL_REGLOC_OFFSET);
+	if (!regloc) {
+		dev_err(dev, "register location dvsec not found\n");
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static const struct pci_device_id cxl_mem_pci_tbl[] = {
+	/* PCI class code for CXL.mem Type-3 Devices */
+	{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
+	{ /* terminate list */ },
+};
+MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
+
+static struct pci_driver cxl_mem_driver = {
+	.name			= KBUILD_MODNAME,
+	.id_table		= cxl_mem_pci_tbl,
+	.probe			= cxl_mem_probe,
+	.driver	= {
+		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
+	},
+};
+
+MODULE_LICENSE("GPL v2");
+module_pci_driver(cxl_mem_driver);
diff --git a/drivers/cxl/pci.h b/drivers/cxl/pci.h
new file mode 100644
index 000000000000..e464bea3f4d3
--- /dev/null
+++ b/drivers/cxl/pci.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#ifndef __CXL_PCI_H__
+#define __CXL_PCI_H__
+
+#define CXL_MEMORY_PROGIF	0x10
+
+/*
+ * See section 8.1 Configuration Space Registers in the CXL 2.0
+ * Specification
+ */
+#define PCI_DVSEC_VENDOR_ID_CXL		0x1E98
+#define PCI_DVSEC_ID_CXL		0x0
+
+#define PCI_DVSEC_ID_CXL_REGLOC_OFFSET		0x8
+
+#endif /* __CXL_PCI_H__ */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d8156a5dbee8..766260a9b247 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -51,6 +51,7 @@
 #define PCI_BASE_CLASS_MEMORY		0x05
 #define PCI_CLASS_MEMORY_RAM		0x0500
 #define PCI_CLASS_MEMORY_FLASH		0x0501
+#define PCI_CLASS_MEMORY_CXL		0x0502
 #define PCI_CLASS_MEMORY_OTHER		0x0580
 
 #define PCI_BASE_CLASS_BRIDGE		0x06
-- 
cgit v1.2.3


From 2d24dd5798d0474d9bf705bfca8725e7d20f9d54 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 29 Apr 2020 17:03:22 +0200
Subject: rbtree: Add generic add and find helpers

I've always been bothered by the endless (fragile) boilerplate for
rbtree, and I recently wrote some rbtree helpers for objtool and
figured I should lift them into the kernel and use them more widely.

Provide:

partial-order; less() based:
 - rb_add(): add a new entry to the rbtree
 - rb_add_cached(): like rb_add(), but for a rb_root_cached

total-order; cmp() based:
 - rb_find(): find an entry in an rbtree
 - rb_find_add(): find an entry, and add if not found

 - rb_find_first(): find the first (leftmost) matching entry
 - rb_next_match(): continue from rb_find_first()
 - rb_for_each(): iterate a sub-tree using the previous two

Inlining and constant propagation should see the compiler inline the
whole thing, including the various compare functions.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
---
 include/linux/rbtree.h       | 190 ++++++++++++++++++++++++++++++++++++++++++
 tools/include/linux/rbtree.h | 192 ++++++++++++++++++++++++++++++++++++++++++-
 tools/objtool/elf.c          |  73 +++-------------
 3 files changed, 392 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index d7db17996322..e0b300de8f3f 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -158,4 +158,194 @@ static inline void rb_replace_node_cached(struct rb_node *victim,
 	rb_replace_node(victim, new, &root->rb_root);
 }
 
+/*
+ * The below helper functions use 2 operators with 3 different
+ * calling conventions. The operators are related like:
+ *
+ *	comp(a->key,b) < 0  := less(a,b)
+ *	comp(a->key,b) > 0  := less(b,a)
+ *	comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
+ *
+ * If these operators define a partial order on the elements we make no
+ * guarantee on which of the elements matching the key is found. See
+ * rb_find().
+ *
+ * The reason for this is to allow the find() interface without requiring an
+ * on-stack dummy object, which might not be feasible due to object size.
+ */
+
+/**
+ * rb_add_cached() - insert @node into the leftmost cached tree @tree
+ * @node: node to insert
+ * @tree: leftmost cached tree to insert @node into
+ * @less: operator defining the (partial) node order
+ */
+static __always_inline void
+rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
+	      bool (*less)(struct rb_node *, const struct rb_node *))
+{
+	struct rb_node **link = &tree->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	bool leftmost = true;
+
+	while (*link) {
+		parent = *link;
+		if (less(node, parent)) {
+			link = &parent->rb_left;
+		} else {
+			link = &parent->rb_right;
+			leftmost = false;
+		}
+	}
+
+	rb_link_node(node, parent, link);
+	rb_insert_color_cached(node, tree, leftmost);
+}
+
+/**
+ * rb_add() - insert @node into @tree
+ * @node: node to insert
+ * @tree: tree to insert @node into
+ * @less: operator defining the (partial) node order
+ */
+static __always_inline void
+rb_add(struct rb_node *node, struct rb_root *tree,
+       bool (*less)(struct rb_node *, const struct rb_node *))
+{
+	struct rb_node **link = &tree->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		parent = *link;
+		if (less(node, parent))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(node, parent, link);
+	rb_insert_color(node, tree);
+}
+
+/**
+ * rb_find_add() - find equivalent @node in @tree, or add @node
+ * @node: node to look-for / insert
+ * @tree: tree to search / modify
+ * @cmp: operator defining the node order
+ *
+ * Returns the rb_node matching @node, or NULL when no match is found and @node
+ * is inserted.
+ */
+static __always_inline struct rb_node *
+rb_find_add(struct rb_node *node, struct rb_root *tree,
+	    int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+	struct rb_node **link = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	int c;
+
+	while (*link) {
+		parent = *link;
+		c = cmp(node, parent);
+
+		if (c < 0)
+			link = &parent->rb_left;
+		else if (c > 0)
+			link = &parent->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, link);
+	rb_insert_color(node, tree);
+	return NULL;
+}
+
+/**
+ * rb_find() - find @key in tree @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining the node order
+ *
+ * Returns the rb_node matching @key or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find(const void *key, const struct rb_root *tree,
+	int (*cmp)(const void *key, const struct rb_node *))
+{
+	struct rb_node *node = tree->rb_node;
+
+	while (node) {
+		int c = cmp(key, node);
+
+		if (c < 0)
+			node = node->rb_left;
+		else if (c > 0)
+			node = node->rb_right;
+		else
+			return node;
+	}
+
+	return NULL;
+}
+
+/**
+ * rb_find_first() - find the first @key in @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining node order
+ *
+ * Returns the leftmost node matching @key, or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find_first(const void *key, const struct rb_root *tree,
+	      int (*cmp)(const void *key, const struct rb_node *))
+{
+	struct rb_node *node = tree->rb_node;
+	struct rb_node *match = NULL;
+
+	while (node) {
+		int c = cmp(key, node);
+
+		if (c <= 0) {
+			if (!c)
+				match = node;
+			node = node->rb_left;
+		} else if (c > 0) {
+			node = node->rb_right;
+		}
+	}
+
+	return match;
+}
+
+/**
+ * rb_next_match() - find the next @key in @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining node order
+ *
+ * Returns the next node matching @key, or NULL.
+ */
+static __always_inline struct rb_node *
+rb_next_match(const void *key, struct rb_node *node,
+	      int (*cmp)(const void *key, const struct rb_node *))
+{
+	node = rb_next(node);
+	if (node && cmp(key, node))
+		node = NULL;
+	return node;
+}
+
+/**
+ * rb_for_each() - iterates a subtree matching @key
+ * @node: iterator
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining node order
+ */
+#define rb_for_each(node, key, tree, cmp) \
+	for ((node) = rb_find_first((key), (tree), (cmp)); \
+	     (node); (node) = rb_next_match((key), (node), (cmp)))
+
 #endif	/* _LINUX_RBTREE_H */
diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h
index 30dd21f976c3..2680f2edb837 100644
--- a/tools/include/linux/rbtree.h
+++ b/tools/include/linux/rbtree.h
@@ -152,4 +152,194 @@ static inline void rb_replace_node_cached(struct rb_node *victim,
 	rb_replace_node(victim, new, &root->rb_root);
 }
 
-#endif /* __TOOLS_LINUX_PERF_RBTREE_H */
+/*
+ * The below helper functions use 2 operators with 3 different
+ * calling conventions. The operators are related like:
+ *
+ *	comp(a->key,b) < 0  := less(a,b)
+ *	comp(a->key,b) > 0  := less(b,a)
+ *	comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
+ *
+ * If these operators define a partial order on the elements we make no
+ * guarantee on which of the elements matching the key is found. See
+ * rb_find().
+ *
+ * The reason for this is to allow the find() interface without requiring an
+ * on-stack dummy object, which might not be feasible due to object size.
+ */
+
+/**
+ * rb_add_cached() - insert @node into the leftmost cached tree @tree
+ * @node: node to insert
+ * @tree: leftmost cached tree to insert @node into
+ * @less: operator defining the (partial) node order
+ */
+static __always_inline void
+rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
+	      bool (*less)(struct rb_node *, const struct rb_node *))
+{
+	struct rb_node **link = &tree->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	bool leftmost = true;
+
+	while (*link) {
+		parent = *link;
+		if (less(node, parent)) {
+			link = &parent->rb_left;
+		} else {
+			link = &parent->rb_right;
+			leftmost = false;
+		}
+	}
+
+	rb_link_node(node, parent, link);
+	rb_insert_color_cached(node, tree, leftmost);
+}
+
+/**
+ * rb_add() - insert @node into @tree
+ * @node: node to insert
+ * @tree: tree to insert @node into
+ * @less: operator defining the (partial) node order
+ */
+static __always_inline void
+rb_add(struct rb_node *node, struct rb_root *tree,
+       bool (*less)(struct rb_node *, const struct rb_node *))
+{
+	struct rb_node **link = &tree->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		parent = *link;
+		if (less(node, parent))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(node, parent, link);
+	rb_insert_color(node, tree);
+}
+
+/**
+ * rb_find_add() - find equivalent @node in @tree, or add @node
+ * @node: node to look-for / insert
+ * @tree: tree to search / modify
+ * @cmp: operator defining the node order
+ *
+ * Returns the rb_node matching @node, or NULL when no match is found and @node
+ * is inserted.
+ */
+static __always_inline struct rb_node *
+rb_find_add(struct rb_node *node, struct rb_root *tree,
+	    int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+	struct rb_node **link = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	int c;
+
+	while (*link) {
+		parent = *link;
+		c = cmp(node, parent);
+
+		if (c < 0)
+			link = &parent->rb_left;
+		else if (c > 0)
+			link = &parent->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, link);
+	rb_insert_color(node, tree);
+	return NULL;
+}
+
+/**
+ * rb_find() - find @key in tree @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining the node order
+ *
+ * Returns the rb_node matching @key or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find(const void *key, const struct rb_root *tree,
+	int (*cmp)(const void *key, const struct rb_node *))
+{
+	struct rb_node *node = tree->rb_node;
+
+	while (node) {
+		int c = cmp(key, node);
+
+		if (c < 0)
+			node = node->rb_left;
+		else if (c > 0)
+			node = node->rb_right;
+		else
+			return node;
+	}
+
+	return NULL;
+}
+
+/**
+ * rb_find_first() - find the first @key in @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining node order
+ *
+ * Returns the leftmost node matching @key, or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find_first(const void *key, const struct rb_root *tree,
+	      int (*cmp)(const void *key, const struct rb_node *))
+{
+	struct rb_node *node = tree->rb_node;
+	struct rb_node *match = NULL;
+
+	while (node) {
+		int c = cmp(key, node);
+
+		if (c <= 0) {
+			if (!c)
+				match = node;
+			node = node->rb_left;
+		} else if (c > 0) {
+			node = node->rb_right;
+		}
+	}
+
+	return match;
+}
+
+/**
+ * rb_next_match() - find the next @key in @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining node order
+ *
+ * Returns the next node matching @key, or NULL.
+ */
+static __always_inline struct rb_node *
+rb_next_match(const void *key, struct rb_node *node,
+	      int (*cmp)(const void *key, const struct rb_node *))
+{
+	node = rb_next(node);
+	if (node && cmp(key, node))
+		node = NULL;
+	return node;
+}
+
+/**
+ * rb_for_each() - iterates a subtree matching @key
+ * @node: iterator
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining node order
+ */
+#define rb_for_each(node, key, tree, cmp) \
+	for ((node) = rb_find_first((key), (tree), (cmp)); \
+	     (node); (node) = rb_next_match((key), (node), (cmp)))
+
+#endif	/* __TOOLS_LINUX_PERF_RBTREE_H */
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index d8421e1d06be..e85988ce04f1 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -43,75 +43,24 @@ static void elf_hash_init(struct hlist_head *table)
 #define elf_hash_for_each_possible(name, obj, member, key)			\
 	hlist_for_each_entry(obj, &name[hash_min(key, elf_hash_bits())], member)
 
-static void rb_add(struct rb_root *tree, struct rb_node *node,
-		   int (*cmp)(struct rb_node *, const struct rb_node *))
-{
-	struct rb_node **link = &tree->rb_node;
-	struct rb_node *parent = NULL;
-
-	while (*link) {
-		parent = *link;
-		if (cmp(node, parent) < 0)
-			link = &parent->rb_left;
-		else
-			link = &parent->rb_right;
-	}
-
-	rb_link_node(node, parent, link);
-	rb_insert_color(node, tree);
-}
-
-static struct rb_node *rb_find_first(const struct rb_root *tree, const void *key,
-			       int (*cmp)(const void *key, const struct rb_node *))
-{
-	struct rb_node *node = tree->rb_node;
-	struct rb_node *match = NULL;
-
-	while (node) {
-		int c = cmp(key, node);
-		if (c <= 0) {
-			if (!c)
-				match = node;
-			node = node->rb_left;
-		} else if (c > 0) {
-			node = node->rb_right;
-		}
-	}
-
-	return match;
-}
-
-static struct rb_node *rb_next_match(struct rb_node *node, const void *key,
-				    int (*cmp)(const void *key, const struct rb_node *))
-{
-	node = rb_next(node);
-	if (node && cmp(key, node))
-		node = NULL;
-	return node;
-}
-
-#define rb_for_each(tree, node, key, cmp) \
-	for ((node) = rb_find_first((tree), (key), (cmp)); \
-	     (node); (node) = rb_next_match((node), (key), (cmp)))
-
-static int symbol_to_offset(struct rb_node *a, const struct rb_node *b)
+static bool symbol_to_offset(struct rb_node *a, const struct rb_node *b)
 {
 	struct symbol *sa = rb_entry(a, struct symbol, node);
 	struct symbol *sb = rb_entry(b, struct symbol, node);
 
 	if (sa->offset < sb->offset)
-		return -1;
+		return true;
 	if (sa->offset > sb->offset)
-		return 1;
+		return false;
 
 	if (sa->len < sb->len)
-		return -1;
+		return true;
 	if (sa->len > sb->len)
-		return 1;
+		return false;
 
 	sa->alias = sb;
 
-	return 0;
+	return false;
 }
 
 static int symbol_by_offset(const void *key, const struct rb_node *node)
@@ -165,7 +114,7 @@ struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset)
 {
 	struct rb_node *node;
 
-	rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) {
+	rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) {
 		struct symbol *s = rb_entry(node, struct symbol, node);
 
 		if (s->offset == offset && s->type != STT_SECTION)
@@ -179,7 +128,7 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset)
 {
 	struct rb_node *node;
 
-	rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) {
+	rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) {
 		struct symbol *s = rb_entry(node, struct symbol, node);
 
 		if (s->offset == offset && s->type == STT_FUNC)
@@ -193,7 +142,7 @@ struct symbol *find_symbol_containing(const struct section *sec, unsigned long o
 {
 	struct rb_node *node;
 
-	rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) {
+	rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) {
 		struct symbol *s = rb_entry(node, struct symbol, node);
 
 		if (s->type != STT_SECTION)
@@ -207,7 +156,7 @@ struct symbol *find_func_containing(struct section *sec, unsigned long offset)
 {
 	struct rb_node *node;
 
-	rb_for_each(&sec->symbol_tree, node, &offset, symbol_by_offset) {
+	rb_for_each(node, &offset, &sec->symbol_tree, symbol_by_offset) {
 		struct symbol *s = rb_entry(node, struct symbol, node);
 
 		if (s->type == STT_FUNC)
@@ -442,7 +391,7 @@ static int read_symbols(struct elf *elf)
 		sym->offset = sym->sym.st_value;
 		sym->len = sym->sym.st_size;
 
-		rb_add(&sym->sec->symbol_tree, &sym->node, symbol_to_offset);
+		rb_add(&sym->node, &sym->sec->symbol_tree, symbol_to_offset);
 		pnode = rb_prev(&sym->node);
 		if (pnode)
 			entry = &rb_entry(pnode, struct symbol, node)->list;
-- 
cgit v1.2.3


From 8ecca39483ed4e4e97096d0d6f8e25fdd323b189 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 29 Apr 2020 17:04:41 +0200
Subject: rbtree, sched/deadline: Use rb_add_cached()

Reduce rbtree boiler plate by using the new helpers.

Make rb_add_cached() / rb_erase_cached() return a pointer to the
leftmost node to aid in updating additional state.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
---
 include/linux/rbtree.h  | 18 +++++++++---
 kernel/sched/deadline.c | 77 ++++++++++++++++++-------------------------------
 2 files changed, 42 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index e0b300de8f3f..d31ecaf4fdd3 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -141,12 +141,18 @@ static inline void rb_insert_color_cached(struct rb_node *node,
 	rb_insert_color(node, &root->rb_root);
 }
 
-static inline void rb_erase_cached(struct rb_node *node,
-				   struct rb_root_cached *root)
+
+static inline struct rb_node *
+rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
 {
+	struct rb_node *leftmost = NULL;
+
 	if (root->rb_leftmost == node)
-		root->rb_leftmost = rb_next(node);
+		leftmost = root->rb_leftmost = rb_next(node);
+
 	rb_erase(node, &root->rb_root);
+
+	return leftmost;
 }
 
 static inline void rb_replace_node_cached(struct rb_node *victim,
@@ -179,8 +185,10 @@ static inline void rb_replace_node_cached(struct rb_node *victim,
  * @node: node to insert
  * @tree: leftmost cached tree to insert @node into
  * @less: operator defining the (partial) node order
+ *
+ * Returns @node when it is the new leftmost, or NULL.
  */
-static __always_inline void
+static __always_inline struct rb_node *
 rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
 	      bool (*less)(struct rb_node *, const struct rb_node *))
 {
@@ -200,6 +208,8 @@ rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
 
 	rb_link_node(node, parent, link);
 	rb_insert_color_cached(node, tree, leftmost);
+
+	return leftmost ? node : NULL;
 }
 
 /**
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5421782fe897..1508d126e88b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -517,58 +517,44 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 	update_dl_migration(dl_rq);
 }
 
+#define __node_2_pdl(node) \
+	rb_entry((node), struct task_struct, pushable_dl_tasks)
+
+static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
+{
+	return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
+}
+
 /*
  * The list of pushable -deadline task is not a plist, like in
  * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
  */
 static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
 {
-	struct dl_rq *dl_rq = &rq->dl;
-	struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct task_struct *entry;
-	bool leftmost = true;
+	struct rb_node *leftmost;
 
 	BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
 
-	while (*link) {
-		parent = *link;
-		entry = rb_entry(parent, struct task_struct,
-				 pushable_dl_tasks);
-		if (dl_entity_preempt(&p->dl, &entry->dl))
-			link = &parent->rb_left;
-		else {
-			link = &parent->rb_right;
-			leftmost = false;
-		}
-	}
-
+	leftmost = rb_add_cached(&p->pushable_dl_tasks,
+				 &rq->dl.pushable_dl_tasks_root,
+				 __pushable_less);
 	if (leftmost)
-		dl_rq->earliest_dl.next = p->dl.deadline;
-
-	rb_link_node(&p->pushable_dl_tasks, parent, link);
-	rb_insert_color_cached(&p->pushable_dl_tasks,
-			       &dl_rq->pushable_dl_tasks_root, leftmost);
+		rq->dl.earliest_dl.next = p->dl.deadline;
 }
 
 static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
 {
 	struct dl_rq *dl_rq = &rq->dl;
+	struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root;
+	struct rb_node *leftmost;
 
 	if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
 		return;
 
-	if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) {
-		struct rb_node *next_node;
-
-		next_node = rb_next(&p->pushable_dl_tasks);
-		if (next_node) {
-			dl_rq->earliest_dl.next = rb_entry(next_node,
-				struct task_struct, pushable_dl_tasks)->dl.deadline;
-		}
-	}
+	leftmost = rb_erase_cached(&p->pushable_dl_tasks, root);
+	if (leftmost)
+		dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
 
-	rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
 }
 
@@ -1478,29 +1464,21 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 	dec_dl_migration(dl_se, dl_rq);
 }
 
+#define __node_2_dle(node) \
+	rb_entry((node), struct sched_dl_entity, rb_node)
+
+static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
+{
+	return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
+}
+
 static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rb_node **link = &dl_rq->root.rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct sched_dl_entity *entry;
-	int leftmost = 1;
 
 	BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
 
-	while (*link) {
-		parent = *link;
-		entry = rb_entry(parent, struct sched_dl_entity, rb_node);
-		if (dl_time_before(dl_se->deadline, entry->deadline))
-			link = &parent->rb_left;
-		else {
-			link = &parent->rb_right;
-			leftmost = 0;
-		}
-	}
-
-	rb_link_node(&dl_se->rb_node, parent, link);
-	rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost);
+	rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
 
 	inc_dl_tasks(dl_se, dl_rq);
 }
@@ -1513,6 +1491,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
 		return;
 
 	rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
+
 	RB_CLEAR_NODE(&dl_se->rb_node);
 
 	dec_dl_tasks(dl_se, dl_rq);
-- 
cgit v1.2.3


From a3b89864554bbce1594b7abdb5739fc708c1ca95 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 29 Apr 2020 17:05:15 +0200
Subject: rbtree, perf: Use new rbtree helpers

Reduce rbtree boiler plate by using the new helpers.

One noteworthy change is unification of the various (partial) compare
functions. We construct a subtree match by forcing the sub-order to
always match, see __group_cmp().

Due to 'const' we had to touch cgroup_id().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
---
 include/linux/cgroup.h |   4 +-
 kernel/events/core.c   | 195 +++++++++++++++++++++++--------------------------
 2 files changed, 92 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 451c2d26a5db..4f2f79de083e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -307,7 +307,7 @@ void css_task_iter_end(struct css_task_iter *it);
  * Inline functions.
  */
 
-static inline u64 cgroup_id(struct cgroup *cgrp)
+static inline u64 cgroup_id(const struct cgroup *cgrp)
 {
 	return cgrp->kn->id;
 }
@@ -701,7 +701,7 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
 struct cgroup_subsys_state;
 struct cgroup;
 
-static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; }
+static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
 static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 55d18791a72d..3d890961f6e5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1595,50 +1595,91 @@ static void perf_event_groups_init(struct perf_event_groups *groups)
 	groups->index = 0;
 }
 
+static inline struct cgroup *event_cgroup(const struct perf_event *event)
+{
+	struct cgroup *cgroup = NULL;
+
+#ifdef CONFIG_CGROUP_PERF
+	if (event->cgrp)
+		cgroup = event->cgrp->css.cgroup;
+#endif
+
+	return cgroup;
+}
+
 /*
  * Compare function for event groups;
  *
  * Implements complex key that first sorts by CPU and then by virtual index
  * which provides ordering when rotating groups for the same CPU.
  */
-static bool
-perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+static __always_inline int
+perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
+		      const u64 left_group_index, const struct perf_event *right)
 {
-	if (left->cpu < right->cpu)
-		return true;
-	if (left->cpu > right->cpu)
-		return false;
+	if (left_cpu < right->cpu)
+		return -1;
+	if (left_cpu > right->cpu)
+		return 1;
 
 #ifdef CONFIG_CGROUP_PERF
-	if (left->cgrp != right->cgrp) {
-		if (!left->cgrp || !left->cgrp->css.cgroup) {
-			/*
-			 * Left has no cgroup but right does, no cgroups come
-			 * first.
-			 */
-			return true;
-		}
-		if (!right->cgrp || !right->cgrp->css.cgroup) {
-			/*
-			 * Right has no cgroup but left does, no cgroups come
-			 * first.
-			 */
-			return false;
-		}
-		/* Two dissimilar cgroups, order by id. */
-		if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
-			return true;
+	{
+		const struct cgroup *right_cgroup = event_cgroup(right);
 
-		return false;
+		if (left_cgroup != right_cgroup) {
+			if (!left_cgroup) {
+				/*
+				 * Left has no cgroup but right does, no
+				 * cgroups come first.
+				 */
+				return -1;
+			}
+			if (!right_cgroup) {
+				/*
+				 * Right has no cgroup but left does, no
+				 * cgroups come first.
+				 */
+				return 1;
+			}
+			/* Two dissimilar cgroups, order by id. */
+			if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
+				return -1;
+
+			return 1;
+		}
 	}
 #endif
 
-	if (left->group_index < right->group_index)
-		return true;
-	if (left->group_index > right->group_index)
-		return false;
+	if (left_group_index < right->group_index)
+		return -1;
+	if (left_group_index > right->group_index)
+		return 1;
 
-	return false;
+	return 0;
+}
+
+#define __node_2_pe(node) \
+	rb_entry((node), struct perf_event, group_node)
+
+static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
+{
+	struct perf_event *e = __node_2_pe(a);
+	return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
+				     __node_2_pe(b)) < 0;
+}
+
+struct __group_key {
+	int cpu;
+	struct cgroup *cgroup;
+};
+
+static inline int __group_cmp(const void *key, const struct rb_node *node)
+{
+	const struct __group_key *a = key;
+	const struct perf_event *b = __node_2_pe(node);
+
+	/* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
+	return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
 }
 
 /*
@@ -1650,27 +1691,9 @@ static void
 perf_event_groups_insert(struct perf_event_groups *groups,
 			 struct perf_event *event)
 {
-	struct perf_event *node_event;
-	struct rb_node *parent;
-	struct rb_node **node;
-
 	event->group_index = ++groups->index;
 
-	node = &groups->tree.rb_node;
-	parent = *node;
-
-	while (*node) {
-		parent = *node;
-		node_event = container_of(*node, struct perf_event, group_node);
-
-		if (perf_event_groups_less(event, node_event))
-			node = &parent->rb_left;
-		else
-			node = &parent->rb_right;
-	}
-
-	rb_link_node(&event->group_node, parent, node);
-	rb_insert_color(&event->group_node, &groups->tree);
+	rb_add(&event->group_node, &groups->tree, __group_less);
 }
 
 /*
@@ -1718,45 +1741,17 @@ static struct perf_event *
 perf_event_groups_first(struct perf_event_groups *groups, int cpu,
 			struct cgroup *cgrp)
 {
-	struct perf_event *node_event = NULL, *match = NULL;
-	struct rb_node *node = groups->tree.rb_node;
-#ifdef CONFIG_CGROUP_PERF
-	u64 node_cgrp_id, cgrp_id = 0;
-
-	if (cgrp)
-		cgrp_id = cgrp->kn->id;
-#endif
-
-	while (node) {
-		node_event = container_of(node, struct perf_event, group_node);
-
-		if (cpu < node_event->cpu) {
-			node = node->rb_left;
-			continue;
-		}
-		if (cpu > node_event->cpu) {
-			node = node->rb_right;
-			continue;
-		}
-#ifdef CONFIG_CGROUP_PERF
-		node_cgrp_id = 0;
-		if (node_event->cgrp && node_event->cgrp->css.cgroup)
-			node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+	struct __group_key key = {
+		.cpu = cpu,
+		.cgroup = cgrp,
+	};
+	struct rb_node *node;
 
-		if (cgrp_id < node_cgrp_id) {
-			node = node->rb_left;
-			continue;
-		}
-		if (cgrp_id > node_cgrp_id) {
-			node = node->rb_right;
-			continue;
-		}
-#endif
-		match = node_event;
-		node = node->rb_left;
-	}
+	node = rb_find_first(&key, &groups->tree, __group_cmp);
+	if (node)
+		return __node_2_pe(node);
 
-	return match;
+	return NULL;
 }
 
 /*
@@ -1765,27 +1760,17 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
 static struct perf_event *
 perf_event_groups_next(struct perf_event *event)
 {
-	struct perf_event *next;
-#ifdef CONFIG_CGROUP_PERF
-	u64 curr_cgrp_id = 0;
-	u64 next_cgrp_id = 0;
-#endif
-
-	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
-	if (next == NULL || next->cpu != event->cpu)
-		return NULL;
-
-#ifdef CONFIG_CGROUP_PERF
-	if (event->cgrp && event->cgrp->css.cgroup)
-		curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+	struct __group_key key = {
+		.cpu = event->cpu,
+		.cgroup = event_cgroup(event),
+	};
+	struct rb_node *next;
 
-	if (next->cgrp && next->cgrp->css.cgroup)
-		next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+	next = rb_next_match(&key, &event->group_node, __group_cmp);
+	if (next)
+		return __node_2_pe(next);
 
-	if (curr_cgrp_id != next_cgrp_id)
-		return NULL;
-#endif
-	return next;
+	return NULL;
 }
 
 /*
-- 
cgit v1.2.3


From ae18ad281e825993d190073d0ae2ea35dee27ee1 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Thu, 28 Jan 2021 14:10:38 +0100
Subject: sched: Remove MAX_USER_RT_PRIO

Commit d46523ea32a7 ("[PATCH] fix MAX_USER_RT_PRIO and MAX_RT_PRIO")
was introduced due to a a small time period in which the realtime patch
set was using different values for MAX_USER_RT_PRIO and MAX_RT_PRIO.

This is no longer true, i.e. now MAX_RT_PRIO == MAX_USER_RT_PRIO.

Get rid of MAX_USER_RT_PRIO and make everything use MAX_RT_PRIO
instead.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20210128131040.296856-2-dietmar.eggemann@arm.com
---
 include/linux/sched/prio.h | 9 +--------
 kernel/sched/core.c        | 7 +++----
 2 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index 7d64feafc408..d111f2fd77ea 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -11,16 +11,9 @@
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
  * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
  * values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space.  This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
  */
 
-#define MAX_USER_RT_PRIO	100
-#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+#define MAX_RT_PRIO		100
 
 #define MAX_PRIO		(MAX_RT_PRIO + NICE_WIDTH)
 #define DEFAULT_PRIO		(MAX_RT_PRIO + NICE_WIDTH / 2)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6c789dcb8e5a..f0b0b67fcdee 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5911,11 +5911,10 @@ recheck:
 
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
-	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+	 * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
 	 * SCHED_BATCH and SCHED_IDLE is 0.
 	 */
-	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
-	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
+	if (attr->sched_priority > MAX_RT_PRIO-1)
 		return -EINVAL;
 	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
 	    (rt_policy(policy) != (attr->sched_priority != 0)))
@@ -6983,7 +6982,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 	switch (policy) {
 	case SCHED_FIFO:
 	case SCHED_RR:
-		ret = MAX_USER_RT_PRIO-1;
+		ret = MAX_RT_PRIO-1;
 		break;
 	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
-- 
cgit v1.2.3


From 9d061ba6bc170045857f3efe0bba5def30188d4d Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Thu, 28 Jan 2021 14:10:39 +0100
Subject: sched: Remove USER_PRIO, TASK_USER_PRIO and MAX_USER_PRIO

The only remaining use of MAX_USER_PRIO (and USER_PRIO) is the
SCALE_PRIO() definition in the PowerPC Cell architecture's Synergistic
Processor Unit (SPU) scheduler. TASK_USER_PRIO isn't used anymore.

Commit fe443ef2ac42 ("[POWERPC] spusched: Dynamic timeslicing for
SCHED_OTHER") copied SCALE_PRIO() from the task scheduler in v2.6.23.

Commit a4ec24b48dde ("sched: tidy up SCHED_RR") removed it from the task
scheduler in v2.6.24.

Commit 3ee237dddcd8 ("sched/prio: Add 3 macros of MAX_NICE, MIN_NICE and
NICE_WIDTH in prio.h") introduced NICE_WIDTH much later.

With:

  MAX_USER_PRIO = USER_PRIO(MAX_PRIO)

                = MAX_PRIO - MAX_RT_PRIO

       MAX_PRIO = MAX_RT_PRIO + NICE_WIDTH

  MAX_USER_PRIO = MAX_RT_PRIO + NICE_WIDTH - MAX_RT_PRIO

  MAX_USER_PRIO = NICE_WIDTH

MAX_USER_PRIO can be replaced by NICE_WIDTH to be able to remove all the
{*_}USER_PRIO defines.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20210128131040.296856-3-dietmar.eggemann@arm.com
---
 arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
 include/linux/sched/prio.h                | 9 ---------
 kernel/sched/sched.h                      | 2 +-
 3 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index f18d5067cd0f..aeb7f3922106 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -72,7 +72,7 @@ static struct timer_list spuloadavg_timer;
 #define DEF_SPU_TIMESLICE	(100 * HZ / (1000 * SPUSCHED_TICK))
 
 #define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
+	max(x * (MAX_PRIO - prio) / (NICE_WIDTH / 2), MIN_SPU_TIMESLICE)
 
 /*
  * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index d111f2fd77ea..ab83d85e1183 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -26,15 +26,6 @@
 #define NICE_TO_PRIO(nice)	((nice) + DEFAULT_PRIO)
 #define PRIO_TO_NICE(prio)	((prio) - DEFAULT_PRIO)
 
-/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
-
 /*
  * Convert nice value [19,-20] to rlimit style value [1,40].
  */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f519aba2c542..2185b3b435a9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -140,7 +140,7 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
  * scale_load() and scale_load_down(w) to convert between them. The
  * following must be true:
  *
- *  scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
+ *  scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD
  *
  */
 #define NICE_0_LOAD		(1L << NICE_0_LOAD_SHIFT)
-- 
cgit v1.2.3


From 880cfed3a012d7863f42251791cea7fe78c39390 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 18 Jan 2021 15:12:18 +0100
Subject: static_call: Pull some static_call declarations to the type headers

Some static call declarations are going to be needed on low level header
files. Move the necessary material to the dedicated static call types
header to avoid inclusion dependency hell.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20210118141223.123667-4-frederic@kernel.org
---
 include/linux/static_call.h             | 21 ---------------------
 include/linux/static_call_types.h       | 27 +++++++++++++++++++++++++++
 tools/include/linux/static_call_types.h | 27 +++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 695da4c9b338..a2c064585c03 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -107,26 +107,10 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool
 
 #define STATIC_CALL_TRAMP_ADDR(name) &STATIC_CALL_TRAMP(name)
 
-/*
- * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from
- * the symbol table so that objtool can reference it when it generates the
- * .static_call_sites section.
- */
-#define __static_call(name)						\
-({									\
-	__ADDRESSABLE(STATIC_CALL_KEY(name));				\
-	&STATIC_CALL_TRAMP(name);					\
-})
-
 #else
 #define STATIC_CALL_TRAMP_ADDR(name) NULL
 #endif
 
-
-#define DECLARE_STATIC_CALL(name, func)					\
-	extern struct static_call_key STATIC_CALL_KEY(name);		\
-	extern typeof(func) STATIC_CALL_TRAMP(name);
-
 #define static_call_update(name, func)					\
 ({									\
 	BUILD_BUG_ON(!__same_type(*(func), STATIC_CALL_TRAMP(name)));	\
@@ -174,7 +158,6 @@ extern int static_call_text_reserved(void *start, void *end);
 	};								\
 	ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
 
-#define static_call(name)	__static_call(name)
 #define static_call_cond(name)	(void)__static_call(name)
 
 #define EXPORT_STATIC_CALL(name)					\
@@ -207,7 +190,6 @@ struct static_call_key {
 	};								\
 	ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
 
-#define static_call(name)	__static_call(name)
 #define static_call_cond(name)	(void)__static_call(name)
 
 static inline
@@ -252,9 +234,6 @@ struct static_call_key {
 		.func = NULL,						\
 	}
 
-#define static_call(name)						\
-	((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func))
-
 static inline void __static_call_nop(void) { }
 
 /*
diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h
index 89135bb35bf7..08f78b1b88b4 100644
--- a/include/linux/static_call_types.h
+++ b/include/linux/static_call_types.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 #include <linux/stringify.h>
+#include <linux/compiler.h>
 
 #define STATIC_CALL_KEY_PREFIX		__SCK__
 #define STATIC_CALL_KEY_PREFIX_STR	__stringify(STATIC_CALL_KEY_PREFIX)
@@ -32,4 +33,30 @@ struct static_call_site {
 	s32 key;
 };
 
+#define DECLARE_STATIC_CALL(name, func)					\
+	extern struct static_call_key STATIC_CALL_KEY(name);		\
+	extern typeof(func) STATIC_CALL_TRAMP(name);
+
+#ifdef CONFIG_HAVE_STATIC_CALL
+
+/*
+ * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from
+ * the symbol table so that objtool can reference it when it generates the
+ * .static_call_sites section.
+ */
+#define __static_call(name)						\
+({									\
+	__ADDRESSABLE(STATIC_CALL_KEY(name));				\
+	&STATIC_CALL_TRAMP(name);					\
+})
+
+#define static_call(name)	__static_call(name)
+
+#else
+
+#define static_call(name)						\
+	((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func))
+
+#endif /* CONFIG_HAVE_STATIC_CALL */
+
 #endif /* _STATIC_CALL_TYPES_H */
diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h
index 89135bb35bf7..08f78b1b88b4 100644
--- a/tools/include/linux/static_call_types.h
+++ b/tools/include/linux/static_call_types.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 #include <linux/stringify.h>
+#include <linux/compiler.h>
 
 #define STATIC_CALL_KEY_PREFIX		__SCK__
 #define STATIC_CALL_KEY_PREFIX_STR	__stringify(STATIC_CALL_KEY_PREFIX)
@@ -32,4 +33,30 @@ struct static_call_site {
 	s32 key;
 };
 
+#define DECLARE_STATIC_CALL(name, func)					\
+	extern struct static_call_key STATIC_CALL_KEY(name);		\
+	extern typeof(func) STATIC_CALL_TRAMP(name);
+
+#ifdef CONFIG_HAVE_STATIC_CALL
+
+/*
+ * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from
+ * the symbol table so that objtool can reference it when it generates the
+ * .static_call_sites section.
+ */
+#define __static_call(name)						\
+({									\
+	__ADDRESSABLE(STATIC_CALL_KEY(name));				\
+	&STATIC_CALL_TRAMP(name);					\
+})
+
+#define static_call(name)	__static_call(name)
+
+#else
+
+#define static_call(name)						\
+	((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func))
+
+#endif /* CONFIG_HAVE_STATIC_CALL */
+
 #endif /* _STATIC_CALL_TYPES_H */
-- 
cgit v1.2.3


From 3f2a8fc4b15de18644e8a80a09edda168676e22c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 18 Jan 2021 15:12:16 +0100
Subject: static_call/x86: Add __static_call_return0()

Provide a stub function that return 0 and wire up the static call site
patching to replace the CALL with a single 5 byte instruction that
clears %RAX, the return value register.

The function can be cast to any function pointer type that has a
single %RAX return (including pointers). Also provide a version that
returns an int for convenience. We are clearing the entire %RAX register
in any case, whether the return value is 32 or 64 bits, since %RAX is
always a scratch register anyway.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20210118141223.123667-2-frederic@kernel.org
---
 arch/x86/kernel/static_call.c | 17 +++++++++++++++--
 include/linux/static_call.h   | 12 ++++++++++++
 kernel/static_call.c          |  5 +++++
 3 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index ca9a380d9c0b..9442c4136c38 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -11,14 +11,26 @@ enum insn_type {
 	RET = 3,  /* tramp / site cond-tail-call */
 };
 
+/*
+ * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax
+ * The REX.W cancels the effect of any data16.
+ */
+static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+
 static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
 {
+	const void *emulate = NULL;
 	int size = CALL_INSN_SIZE;
 	const void *code;
 
 	switch (type) {
 	case CALL:
 		code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+		if (func == &__static_call_return0) {
+			emulate = code;
+			code = &xor5rax;
+		}
+
 		break;
 
 	case NOP:
@@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
 	if (unlikely(system_state == SYSTEM_BOOTING))
 		return text_poke_early(insn, code, size);
 
-	text_poke_bp(insn, code, size, NULL);
+	text_poke_bp(insn, code, size, emulate);
 }
 
 static void __static_call_validate(void *insn, bool tail)
@@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail)
 			return;
 	} else {
 		if (opcode == CALL_INSN_OPCODE ||
-		    !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5))
+		    !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) ||
+		    !memcmp(insn, xor5rax, 5))
 			return;
 	}
 
diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index a2c064585c03..bd6735d10e2e 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -142,6 +142,8 @@ extern void __static_call_update(struct static_call_key *key, void *tramp, void
 extern int static_call_mod_init(struct module *mod);
 extern int static_call_text_reserved(void *start, void *end);
 
+extern long __static_call_return0(void);
+
 #define DEFINE_STATIC_CALL(name, _func)					\
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_KEY(name) = {		\
@@ -206,6 +208,11 @@ static inline int static_call_text_reserved(void *start, void *end)
 	return 0;
 }
 
+static inline long __static_call_return0(void)
+{
+	return 0;
+}
+
 #define EXPORT_STATIC_CALL(name)					\
 	EXPORT_SYMBOL(STATIC_CALL_KEY(name));				\
 	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
@@ -222,6 +229,11 @@ struct static_call_key {
 	void *func;
 };
 
+static inline long __static_call_return0(void)
+{
+	return 0;
+}
+
 #define DEFINE_STATIC_CALL(name, _func)					\
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_KEY(name) = {		\
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 84565c2a41b8..0bc11b5ce681 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -438,6 +438,11 @@ int __init static_call_init(void)
 }
 early_initcall(static_call_init);
 
+long __static_call_return0(void)
+{
+	return 0;
+}
+
 #ifdef CONFIG_STATIC_CALL_SELFTEST
 
 static int func_a(int x)
-- 
cgit v1.2.3


From 29fd01944b7273bb630c649a2104b7f9e4ef3fa6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 18 Jan 2021 15:12:17 +0100
Subject: static_call: Provide DEFINE_STATIC_CALL_RET0()

DECLARE_STATIC_CALL() must pass the original function targeted for a
given static call. But DEFINE_STATIC_CALL() may want to initialize it as
off. In this case we can't pass NULL (for functions without return value)
or __static_call_return0 (for functions returning a value) directly
to DEFINE_STATIC_CALL() as that may trigger a static call redeclaration
with a different function prototype. Type casts neither can work around
that as they don't get along with typeof().

The proper way to do that for functions that don't return a value is
to use DEFINE_STATIC_CALL_NULL(). But functions returning a actual value
don't have an equivalent yet.

Provide DEFINE_STATIC_CALL_RET0() to solve this situation.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20210118141223.123667-3-frederic@kernel.org
---
 include/linux/static_call.h | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index bd6735d10e2e..d69dd8b976ca 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -144,13 +144,13 @@ extern int static_call_text_reserved(void *start, void *end);
 
 extern long __static_call_return0(void);
 
-#define DEFINE_STATIC_CALL(name, _func)					\
+#define __DEFINE_STATIC_CALL(name, _func, _func_init)			\
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_KEY(name) = {		\
-		.func = _func,						\
+		.func = _func_init,					\
 		.type = 1,						\
 	};								\
-	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
+	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func_init)
 
 #define DEFINE_STATIC_CALL_NULL(name, _func)				\
 	DECLARE_STATIC_CALL(name, _func);				\
@@ -178,12 +178,12 @@ struct static_call_key {
 	void *func;
 };
 
-#define DEFINE_STATIC_CALL(name, _func)					\
+#define __DEFINE_STATIC_CALL(name, _func, _func_init)			\
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_KEY(name) = {		\
-		.func = _func,						\
+		.func = _func_init,					\
 	};								\
-	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
+	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func_init)
 
 #define DEFINE_STATIC_CALL_NULL(name, _func)				\
 	DECLARE_STATIC_CALL(name, _func);				\
@@ -234,10 +234,10 @@ static inline long __static_call_return0(void)
 	return 0;
 }
 
-#define DEFINE_STATIC_CALL(name, _func)					\
+#define __DEFINE_STATIC_CALL(name, _func, _func_init)			\
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_KEY(name) = {		\
-		.func = _func,						\
+		.func = _func_init,					\
 	}
 
 #define DEFINE_STATIC_CALL_NULL(name, _func)				\
@@ -286,4 +286,10 @@ static inline int static_call_text_reserved(void *start, void *end)
 
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
+#define DEFINE_STATIC_CALL(name, _func)					\
+	__DEFINE_STATIC_CALL(name, _func, _func)
+
+#define DEFINE_STATIC_CALL_RET0(name, _func)				\
+	__DEFINE_STATIC_CALL(name, _func, __static_call_return0)
+
 #endif /* _LINUX_STATIC_CALL_H */
-- 
cgit v1.2.3


From b965f1ddb47daa5b8b2e2bc9c921431236830367 Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Mon, 18 Jan 2021 15:12:20 +0100
Subject: preempt/dynamic: Provide cond_resched() and might_resched() static
 calls

Provide static calls to control cond_resched() (called in !CONFIG_PREEMPT)
and might_resched() (called in CONFIG_PREEMPT_VOLUNTARY) to that we
can override their behaviour when preempt= is overriden.

Since the default behaviour is full preemption, both their calls are
ignored when preempt= isn't passed.

  [fweisbec: branch might_resched() directly to __cond_resched(), only
             define static calls when PREEMPT_DYNAMIC]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20210118141223.123667-6-frederic@kernel.org
---
 include/linux/kernel.h | 23 +++++++++++++++++++----
 include/linux/sched.h  | 27 ++++++++++++++++++++++++---
 kernel/sched/core.c    | 16 +++++++++++++---
 3 files changed, 56 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f7902d8c1048..cfd3d349f905 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -15,7 +15,7 @@
 #include <linux/typecheck.h>
 #include <linux/printk.h>
 #include <linux/build_bug.h>
-
+#include <linux/static_call_types.h>
 #include <asm/byteorder.h>
 
 #include <uapi/linux/kernel.h>
@@ -81,11 +81,26 @@ struct pt_regs;
 struct user;
 
 #ifdef CONFIG_PREEMPT_VOLUNTARY
-extern int _cond_resched(void);
-# define might_resched() _cond_resched()
+
+extern int __cond_resched(void);
+# define might_resched() __cond_resched()
+
+#elif defined(CONFIG_PREEMPT_DYNAMIC)
+
+extern int __cond_resched(void);
+
+DECLARE_STATIC_CALL(might_resched, __cond_resched);
+
+static __always_inline void might_resched(void)
+{
+	static_call(might_resched)();
+}
+
 #else
+
 # define might_resched() do { } while (0)
-#endif
+
+#endif /* CONFIG_PREEMPT_* */
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 extern void ___might_sleep(const char *file, int line, int preempt_offset);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e1152227bb79..2f35594b8b53 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1871,11 +1871,32 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
  * value indicates whether a reschedule was done in fact.
  * cond_resched_lock() will drop the spinlock before scheduling,
  */
-#ifndef CONFIG_PREEMPTION
-extern int _cond_resched(void);
+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+extern int __cond_resched(void);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+DECLARE_STATIC_CALL(cond_resched, __cond_resched);
+
+static __always_inline int _cond_resched(void)
+{
+	return static_call(cond_resched)();
+}
+
 #else
+
+static inline int _cond_resched(void)
+{
+	return __cond_resched();
+}
+
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+#else
+
 static inline int _cond_resched(void) { return 0; }
-#endif
+
+#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
 
 #define cond_resched() ({			\
 	___might_sleep(__FILE__, __LINE__, 0);	\
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4afbdd27f46d..f7c8fd8fa177 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6785,17 +6785,27 @@ SYSCALL_DEFINE0(sched_yield)
 	return 0;
 }
 
-#ifndef CONFIG_PREEMPTION
-int __sched _cond_resched(void)
+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+int __sched __cond_resched(void)
 {
 	if (should_resched(0)) {
 		preempt_schedule_common();
 		return 1;
 	}
+#ifndef CONFIG_PREEMPT_RCU
 	rcu_all_qs();
+#endif
 	return 0;
 }
-EXPORT_SYMBOL(_cond_resched);
+EXPORT_SYMBOL(__cond_resched);
+#endif
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
+EXPORT_STATIC_CALL(cond_resched);
+
+DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
+EXPORT_STATIC_CALL(might_resched);
 #endif
 
 /*
-- 
cgit v1.2.3


From 40607ee97e4eec5655cc0f76a720bdc4c63a6434 Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Mon, 18 Jan 2021 15:12:22 +0100
Subject: preempt/dynamic: Provide irqentry_exit_cond_resched() static call

Provide static call to control IRQ preemption (called in CONFIG_PREEMPT)
so that we can override its behaviour when preempt= is overriden.

Since the default behaviour is full preemption, its call is
initialized to provide IRQ preemption when preempt= isn't passed.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20210118141223.123667-8-frederic@kernel.org
---
 include/linux/entry-common.h |  4 ++++
 kernel/entry/common.c        | 10 +++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index a104b298019a..883acef895bc 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_ENTRYCOMMON_H
 #define __LINUX_ENTRYCOMMON_H
 
+#include <linux/static_call_types.h>
 #include <linux/tracehook.h>
 #include <linux/syscalls.h>
 #include <linux/seccomp.h>
@@ -454,6 +455,9 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
  * Conditional reschedule with additional sanity checks.
  */
 void irqentry_exit_cond_resched(void);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DECLARE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#endif
 
 /**
  * irqentry_exit - Handle return from exception that used irqentry_enter()
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f9d491b17b78..f09cae37ddd5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -385,6 +385,9 @@ void irqentry_exit_cond_resched(void)
 			preempt_schedule_irq();
 	}
 }
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#endif
 
 noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
 {
@@ -411,8 +414,13 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
 		}
 
 		instrumentation_begin();
-		if (IS_ENABLED(CONFIG_PREEMPTION))
+		if (IS_ENABLED(CONFIG_PREEMPTION)) {
+#ifdef CONFIG_PREEMT_DYNAMIC
+			static_call(irqentry_exit_cond_resched)();
+#else
 			irqentry_exit_cond_resched();
+#endif
+		}
 		/* Covers both tracing and lockdep */
 		trace_hardirqs_on();
 		instrumentation_end();
-- 
cgit v1.2.3


From 73f44fe19d359635a607e8e8daa0da4001c1cfc2 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 27 Jan 2021 17:18:37 -0600
Subject: static_call: Allow module use without exposing static_call_key

When exporting static_call_key; with EXPORT_STATIC_CALL*(), the module
can use static_call_update() to change the function called.  This is
not desirable in general.

Not exporting static_call_key however also disallows usage of
static_call(), since objtool needs the key to construct the
static_call_site.

Solve this by allowing objtool to create the static_call_site using
the trampoline address when it builds a module and cannot find the
static_call_key symbol. The module loader will then try and map the
trampole back to a key before it constructs the normal sites list.

Doing this requires a trampoline -> key associsation, so add another
magic section that keeps those.

Originally-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20210127231837.ifddpn7rhwdaepiu@treble
---
 arch/x86/include/asm/static_call.h      |  7 +++++
 include/asm-generic/vmlinux.lds.h       |  5 ++-
 include/linux/static_call.h             | 22 +++++++++++--
 include/linux/static_call_types.h       | 27 ++++++++++++++--
 kernel/static_call.c                    | 55 +++++++++++++++++++++++++++++++--
 tools/include/linux/static_call_types.h | 27 ++++++++++++++--
 tools/objtool/check.c                   | 17 ++++++++--
 7 files changed, 149 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index c37f11999d0c..cbb67b6030f9 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -37,4 +37,11 @@
 #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)			\
 	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
 
+
+#define ARCH_ADD_TRAMP_KEY(name)					\
+	asm(".pushsection .static_call_tramp_key, \"a\"		\n"	\
+	    ".long " STATIC_CALL_TRAMP_STR(name) " - .		\n"	\
+	    ".long " STATIC_CALL_KEY_STR(name) " - .		\n"	\
+	    ".popsection					\n")
+
 #endif /* _ASM_STATIC_CALL_H */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b97c628ad91f..3f747de1934d 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -393,7 +393,10 @@
 	. = ALIGN(8);							\
 	__start_static_call_sites = .;					\
 	KEEP(*(.static_call_sites))					\
-	__stop_static_call_sites = .;
+	__stop_static_call_sites = .;					\
+	__start_static_call_tramp_key = .;				\
+	KEEP(*(.static_call_tramp_key))					\
+	__stop_static_call_tramp_key = .;
 
 /*
  * Allow architectures to handle ro_after_init data on their
diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index d69dd8b976ca..85ecc789f4ff 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -138,6 +138,12 @@ struct static_call_key {
 	};
 };
 
+/* For finding the key associated with a trampoline */
+struct static_call_tramp_key {
+	s32 tramp;
+	s32 key;
+};
+
 extern void __static_call_update(struct static_call_key *key, void *tramp, void *func);
 extern int static_call_mod_init(struct module *mod);
 extern int static_call_text_reserved(void *start, void *end);
@@ -165,11 +171,18 @@ extern long __static_call_return0(void);
 #define EXPORT_STATIC_CALL(name)					\
 	EXPORT_SYMBOL(STATIC_CALL_KEY(name));				\
 	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
-
 #define EXPORT_STATIC_CALL_GPL(name)					\
 	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name));			\
 	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
 
+/* Leave the key unexported, so modules can't change static call targets: */
+#define EXPORT_STATIC_CALL_TRAMP(name)					\
+	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name));				\
+	ARCH_ADD_TRAMP_KEY(name)
+#define EXPORT_STATIC_CALL_TRAMP_GPL(name)				\
+	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name));			\
+	ARCH_ADD_TRAMP_KEY(name)
+
 #elif defined(CONFIG_HAVE_STATIC_CALL)
 
 static inline int static_call_init(void) { return 0; }
@@ -216,11 +229,16 @@ static inline long __static_call_return0(void)
 #define EXPORT_STATIC_CALL(name)					\
 	EXPORT_SYMBOL(STATIC_CALL_KEY(name));				\
 	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
-
 #define EXPORT_STATIC_CALL_GPL(name)					\
 	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name));			\
 	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
 
+/* Leave the key unexported, so modules can't change static call targets: */
+#define EXPORT_STATIC_CALL_TRAMP(name)					\
+	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
+#define EXPORT_STATIC_CALL_TRAMP_GPL(name)				\
+	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
+
 #else /* Generic implementation */
 
 static inline int static_call_init(void) { return 0; }
diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h
index 08f78b1b88b4..ae5662d368b9 100644
--- a/include/linux/static_call_types.h
+++ b/include/linux/static_call_types.h
@@ -10,6 +10,7 @@
 #define STATIC_CALL_KEY_PREFIX_STR	__stringify(STATIC_CALL_KEY_PREFIX)
 #define STATIC_CALL_KEY_PREFIX_LEN	(sizeof(STATIC_CALL_KEY_PREFIX_STR) - 1)
 #define STATIC_CALL_KEY(name)		__PASTE(STATIC_CALL_KEY_PREFIX, name)
+#define STATIC_CALL_KEY_STR(name)	__stringify(STATIC_CALL_KEY(name))
 
 #define STATIC_CALL_TRAMP_PREFIX	__SCT__
 #define STATIC_CALL_TRAMP_PREFIX_STR	__stringify(STATIC_CALL_TRAMP_PREFIX)
@@ -39,17 +40,39 @@ struct static_call_site {
 
 #ifdef CONFIG_HAVE_STATIC_CALL
 
+#define __raw_static_call(name)	(&STATIC_CALL_TRAMP(name))
+
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+
 /*
  * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from
  * the symbol table so that objtool can reference it when it generates the
  * .static_call_sites section.
  */
+#define __STATIC_CALL_ADDRESSABLE(name) \
+	__ADDRESSABLE(STATIC_CALL_KEY(name))
+
 #define __static_call(name)						\
 ({									\
-	__ADDRESSABLE(STATIC_CALL_KEY(name));				\
-	&STATIC_CALL_TRAMP(name);					\
+	__STATIC_CALL_ADDRESSABLE(name);				\
+	__raw_static_call(name);					\
 })
 
+#else /* !CONFIG_HAVE_STATIC_CALL_INLINE */
+
+#define __STATIC_CALL_ADDRESSABLE(name)
+#define __static_call(name)	__raw_static_call(name)
+
+#endif /* CONFIG_HAVE_STATIC_CALL_INLINE */
+
+#ifdef MODULE
+#define __STATIC_CALL_MOD_ADDRESSABLE(name)
+#define static_call_mod(name)	__raw_static_call(name)
+#else
+#define __STATIC_CALL_MOD_ADDRESSABLE(name) __STATIC_CALL_ADDRESSABLE(name)
+#define static_call_mod(name)	__static_call(name)
+#endif
+
 #define static_call(name)	__static_call(name)
 
 #else
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 0bc11b5ce681..6906c6ec4c97 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -12,6 +12,8 @@
 
 extern struct static_call_site __start_static_call_sites[],
 			       __stop_static_call_sites[];
+extern struct static_call_tramp_key __start_static_call_tramp_key[],
+				    __stop_static_call_tramp_key[];
 
 static bool static_call_initialized;
 
@@ -323,10 +325,59 @@ static int __static_call_mod_text_reserved(void *start, void *end)
 	return ret;
 }
 
+static unsigned long tramp_key_lookup(unsigned long addr)
+{
+	struct static_call_tramp_key *start = __start_static_call_tramp_key;
+	struct static_call_tramp_key *stop = __stop_static_call_tramp_key;
+	struct static_call_tramp_key *tramp_key;
+
+	for (tramp_key = start; tramp_key != stop; tramp_key++) {
+		unsigned long tramp;
+
+		tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp;
+		if (tramp == addr)
+			return (long)tramp_key->key + (long)&tramp_key->key;
+	}
+
+	return 0;
+}
+
 static int static_call_add_module(struct module *mod)
 {
-	return __static_call_init(mod, mod->static_call_sites,
-				  mod->static_call_sites + mod->num_static_call_sites);
+	struct static_call_site *start = mod->static_call_sites;
+	struct static_call_site *stop = start + mod->num_static_call_sites;
+	struct static_call_site *site;
+
+	for (site = start; site != stop; site++) {
+		unsigned long addr = (unsigned long)static_call_key(site);
+		unsigned long key;
+
+		/*
+		 * Is the key is exported, 'addr' points to the key, which
+		 * means modules are allowed to call static_call_update() on
+		 * it.
+		 *
+		 * Otherwise, the key isn't exported, and 'addr' points to the
+		 * trampoline so we need to lookup the key.
+		 *
+		 * We go through this dance to prevent crazy modules from
+		 * abusing sensitive static calls.
+		 */
+		if (!kernel_text_address(addr))
+			continue;
+
+		key = tramp_key_lookup(addr);
+		if (!key) {
+			pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n",
+				static_call_addr(site));
+			return -EINVAL;
+		}
+
+		site->key = (key - (long)&site->key) |
+			    (site->key & STATIC_CALL_SITE_FLAGS);
+	}
+
+	return __static_call_init(mod, start, stop);
 }
 
 static void static_call_del_module(struct module *mod)
diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h
index 08f78b1b88b4..ae5662d368b9 100644
--- a/tools/include/linux/static_call_types.h
+++ b/tools/include/linux/static_call_types.h
@@ -10,6 +10,7 @@
 #define STATIC_CALL_KEY_PREFIX_STR	__stringify(STATIC_CALL_KEY_PREFIX)
 #define STATIC_CALL_KEY_PREFIX_LEN	(sizeof(STATIC_CALL_KEY_PREFIX_STR) - 1)
 #define STATIC_CALL_KEY(name)		__PASTE(STATIC_CALL_KEY_PREFIX, name)
+#define STATIC_CALL_KEY_STR(name)	__stringify(STATIC_CALL_KEY(name))
 
 #define STATIC_CALL_TRAMP_PREFIX	__SCT__
 #define STATIC_CALL_TRAMP_PREFIX_STR	__stringify(STATIC_CALL_TRAMP_PREFIX)
@@ -39,17 +40,39 @@ struct static_call_site {
 
 #ifdef CONFIG_HAVE_STATIC_CALL
 
+#define __raw_static_call(name)	(&STATIC_CALL_TRAMP(name))
+
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+
 /*
  * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from
  * the symbol table so that objtool can reference it when it generates the
  * .static_call_sites section.
  */
+#define __STATIC_CALL_ADDRESSABLE(name) \
+	__ADDRESSABLE(STATIC_CALL_KEY(name))
+
 #define __static_call(name)						\
 ({									\
-	__ADDRESSABLE(STATIC_CALL_KEY(name));				\
-	&STATIC_CALL_TRAMP(name);					\
+	__STATIC_CALL_ADDRESSABLE(name);				\
+	__raw_static_call(name);					\
 })
 
+#else /* !CONFIG_HAVE_STATIC_CALL_INLINE */
+
+#define __STATIC_CALL_ADDRESSABLE(name)
+#define __static_call(name)	__raw_static_call(name)
+
+#endif /* CONFIG_HAVE_STATIC_CALL_INLINE */
+
+#ifdef MODULE
+#define __STATIC_CALL_MOD_ADDRESSABLE(name)
+#define static_call_mod(name)	__raw_static_call(name)
+#else
+#define __STATIC_CALL_MOD_ADDRESSABLE(name) __STATIC_CALL_ADDRESSABLE(name)
+#define static_call_mod(name)	__static_call(name)
+#endif
+
 #define static_call(name)	__static_call(name)
 
 #else
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 4bd30315eb62..f2e5e5ce1a05 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -502,8 +502,21 @@ static int create_static_call_sections(struct objtool_file *file)
 
 		key_sym = find_symbol_by_name(file->elf, tmp);
 		if (!key_sym) {
-			WARN("static_call: can't find static_call_key symbol: %s", tmp);
-			return -1;
+			if (!module) {
+				WARN("static_call: can't find static_call_key symbol: %s", tmp);
+				return -1;
+			}
+
+			/*
+			 * For modules(), the key might not be exported, which
+			 * means the module can make static calls but isn't
+			 * allowed to change them.
+			 *
+			 * In that case we temporarily set the key to be the
+			 * trampoline address.  This is fixed up in
+			 * static_call_add_module().
+			 */
+			key_sym = insn->call_dest;
 		}
 		free(key_name);
 
-- 
cgit v1.2.3


From ef72661e28c64ad610f89acc2832ec67b27ba438 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 25 Jan 2021 16:26:50 +0100
Subject: sched: Harden PREEMPT_DYNAMIC

Use the new EXPORT_STATIC_CALL_TRAMP() / static_call_mod() to unexport
the static_call_key for the PREEMPT_DYNAMIC calls such that modules
can no longer update these calls.

Having modules change/hi-jack the preemption calls would be horrible.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/preempt.h | 4 ++--
 include/linux/kernel.h         | 2 +-
 include/linux/sched.h          | 2 +-
 kernel/sched/core.c            | 8 ++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 9b12dce9bda5..0aa96f824af1 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -114,7 +114,7 @@ DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
 
 #define __preempt_schedule() \
 do { \
-	__ADDRESSABLE(STATIC_CALL_KEY(preempt_schedule)); \
+	__STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \
 	asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
 } while (0)
 
@@ -127,7 +127,7 @@ DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
 
 #define __preempt_schedule_notrace() \
 do { \
-	__ADDRESSABLE(STATIC_CALL_KEY(preempt_schedule_notrace)); \
+	__STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \
 	asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \
 } while (0)
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index cfd3d349f905..5b7ed6dc99ac 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -93,7 +93,7 @@ DECLARE_STATIC_CALL(might_resched, __cond_resched);
 
 static __always_inline void might_resched(void)
 {
-	static_call(might_resched)();
+	static_call_mod(might_resched)();
 }
 
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2f35594b8b53..4d568288abf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1880,7 +1880,7 @@ DECLARE_STATIC_CALL(cond_resched, __cond_resched);
 
 static __always_inline int _cond_resched(void)
 {
-	return static_call(cond_resched)();
+	return static_call_mod(cond_resched)();
 }
 
 #else
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4a17bb5f28b0..cec507be460c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5267,7 +5267,7 @@ EXPORT_SYMBOL(preempt_schedule);
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
 DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
-EXPORT_STATIC_CALL(preempt_schedule);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
 #endif
 
 
@@ -5325,7 +5325,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
 DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
-EXPORT_STATIC_CALL(preempt_schedule_notrace);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
 #endif
 
 #endif /* CONFIG_PREEMPTION */
@@ -6997,10 +6997,10 @@ EXPORT_SYMBOL(__cond_resched);
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
 DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
-EXPORT_STATIC_CALL(cond_resched);
+EXPORT_STATIC_CALL_TRAMP(cond_resched);
 
 DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
-EXPORT_STATIC_CALL(might_resched);
+EXPORT_STATIC_CALL_TRAMP(might_resched);
 #endif
 
 /*
-- 
cgit v1.2.3


From 43789ef3f7d61aa7bed0cb2764e588fc990c30ef Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 1 Feb 2021 00:05:45 +0100
Subject: rcu/nocb: Perform deferred wake up before last idle's need_resched()
 check

Entering RCU idle mode may cause a deferred wake up of an RCU NOCB_GP
kthread (rcuog) to be serviced.

Usually a local wake up happening while running the idle task is handled
in one of the need_resched() checks carefully placed within the idle
loop that can break to the scheduler.

Unfortunately the call to rcu_idle_enter() is already beyond the last
generic need_resched() check and we may halt the CPU with a resched
request unhandled, leaving the task hanging.

Fix this with splitting the rcuog wakeup handling from rcu_idle_enter()
and place it before the last generic need_resched() check in the idle
loop. It is then assumed that no call to call_rcu() will be performed
after that in the idle loop until the CPU is put in low power mode.

Fixes: 96d3fd0d315a (rcu: Break call_rcu() deadlock involving scheduler and perf)
Reported-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20210131230548.32970-3-frederic@kernel.org
---
 include/linux/rcupdate.h | 2 ++
 kernel/rcu/tree.c        | 3 ---
 kernel/rcu/tree_plugin.h | 5 +++++
 kernel/sched/idle.c      | 1 +
 4 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index fd02c5fa60cb..36c2119de702 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -110,8 +110,10 @@ static inline void rcu_user_exit(void) { }
 
 #ifdef CONFIG_RCU_NOCB_CPU
 void rcu_init_nohz(void);
+void rcu_nocb_flush_deferred_wakeup(void);
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
 static inline void rcu_init_nohz(void) { }
+static inline void rcu_nocb_flush_deferred_wakeup(void) { }
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
 /**
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 63032e5620b9..82838e93b498 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -671,10 +671,7 @@ static noinstr void rcu_eqs_enter(bool user)
  */
 void rcu_idle_enter(void)
 {
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
 	lockdep_assert_irqs_disabled();
-	do_nocb_deferred_wakeup(rdp);
 	rcu_eqs_enter(false);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7e291ce0a1d6..d5b38c28abd1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2187,6 +2187,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
 		do_nocb_deferred_wakeup_common(rdp);
 }
 
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+	do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+
 void __init rcu_init_nohz(void)
 {
 	int cpu;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 305727ea0677..7199e6f23789 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -285,6 +285,7 @@ static void do_idle(void)
 		}
 
 		arch_cpu_idle_enter();
+		rcu_nocb_flush_deferred_wakeup();
 
 		/*
 		 * In poll mode we reenable interrupts and spin. Also if we
-- 
cgit v1.2.3


From 4ae7dc97f726ea95c58ac58af71cc034ad22d7de Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 1 Feb 2021 00:05:48 +0100
Subject: entry/kvm: Explicitly flush pending rcuog wakeup before last
 rescheduling point

Following the idle loop model, cleanly check for pending rcuog wakeup
before the last rescheduling point upon resuming to guest mode. This
way we can avoid to do it from rcu_user_enter() with the last resort
self-IPI hack that enforces rescheduling.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20210131230548.32970-6-frederic@kernel.org
---
 arch/x86/kvm/x86.c        |  1 +
 include/linux/entry-kvm.h | 14 ++++++++++++++
 kernel/rcu/tree.c         | 44 ++++++++++++++++++++++++++++++++++----------
 kernel/rcu/tree_plugin.h  |  1 +
 4 files changed, 50 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b404e4d7dd8..b967c1c774a1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1782,6 +1782,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
 
 bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
 {
+	xfer_to_guest_mode_prepare();
 	return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
 		xfer_to_guest_mode_work_pending();
 }
diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 9b93f8584ff7..8b2b1d68b954 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -46,6 +46,20 @@ static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu,
  */
 int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
 
+/**
+ * xfer_to_guest_mode_prepare - Perform last minute preparation work that
+ *				need to be handled while IRQs are disabled
+ *				upon entering to guest.
+ *
+ * Has to be invoked with interrupts disabled before the last call
+ * to xfer_to_guest_mode_work_pending().
+ */
+static inline void xfer_to_guest_mode_prepare(void)
+{
+	lockdep_assert_irqs_disabled();
+	rcu_nocb_flush_deferred_wakeup();
+}
+
 /**
  * __xfer_to_guest_mode_work_pending - Check if work is pending
  *
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2ebc211fffcb..ce17b8477442 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -678,9 +678,10 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
 
 #ifdef CONFIG_NO_HZ_FULL
 
+#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
 /*
  * An empty function that will trigger a reschedule on
- * IRQ tail once IRQs get re-enabled on userspace resume.
+ * IRQ tail once IRQs get re-enabled on userspace/guest resume.
  */
 static void late_wakeup_func(struct irq_work *work)
 {
@@ -689,6 +690,37 @@ static void late_wakeup_func(struct irq_work *work)
 static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
 	IRQ_WORK_INIT(late_wakeup_func);
 
+/*
+ * If either:
+ *
+ * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work
+ * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry.
+ *
+ * In these cases the late RCU wake ups aren't supported in the resched loops and our
+ * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
+ * get re-enabled again.
+ */
+noinstr static void rcu_irq_work_resched(void)
+{
+	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+	if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
+		return;
+
+	if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
+		return;
+
+	instrumentation_begin();
+	if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
+		irq_work_queue(this_cpu_ptr(&late_wakeup_work));
+	}
+	instrumentation_end();
+}
+
+#else
+static inline void rcu_irq_work_resched(void) { }
+#endif
+
 /**
  * rcu_user_enter - inform RCU that we are resuming userspace.
  *
@@ -702,8 +734,6 @@ static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
  */
 noinstr void rcu_user_enter(void)
 {
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
 	lockdep_assert_irqs_disabled();
 
 	/*
@@ -711,13 +741,7 @@ noinstr void rcu_user_enter(void)
 	 * rescheduling opportunity in the entry code. Trigger a self IPI
 	 * that will fire and reschedule once we resume in user/guest mode.
 	 */
-	instrumentation_begin();
-	if (!IS_ENABLED(CONFIG_GENERIC_ENTRY) || (current->flags & PF_VCPU)) {
-		if (do_nocb_deferred_wakeup(rdp) && need_resched())
-			irq_work_queue(this_cpu_ptr(&late_wakeup_work));
-	}
-	instrumentation_end();
-
+	rcu_irq_work_resched();
 	rcu_eqs_enter(true);
 }
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 384856e4d13e..cdc1b7651c03 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2197,6 +2197,7 @@ void rcu_nocb_flush_deferred_wakeup(void)
 {
 	do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
 }
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
 
 void __init rcu_init_nohz(void)
 {
-- 
cgit v1.2.3


From 96313e1db8e5629cc2217616dca78f03e6463008 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 17 Feb 2021 12:25:57 -0800
Subject: net: mdio: Remove of_phy_attach()

We have no in-tree users, also update the sfp-phylink.rst documentation
to indicate that phy_attach_direct() is used instead of of_phy_attach().

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/sfp-phylink.rst |  2 +-
 drivers/net/mdio/of_mdio.c               | 30 ------------------------------
 include/linux/of_mdio.h                  | 10 ----------
 3 files changed, 1 insertion(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst
index 5aec7c8857d0..328f8d39eeb3 100644
--- a/Documentation/networking/sfp-phylink.rst
+++ b/Documentation/networking/sfp-phylink.rst
@@ -163,7 +163,7 @@ this documentation.
 	err = phylink_of_phy_connect(priv->phylink, node, flags);
 
    For the most part, ``flags`` can be zero; these flags are passed to
-   the of_phy_attach() inside this function call if a PHY is specified
+   the phy_attach_direct() inside this function call if a PHY is specified
    in the DT node ``node``.
 
    ``node`` should be the DT node which contains the network phy property,
diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c
index 4daf94bb56a5..ea9d5855fb52 100644
--- a/drivers/net/mdio/of_mdio.c
+++ b/drivers/net/mdio/of_mdio.c
@@ -462,36 +462,6 @@ struct phy_device *of_phy_get_and_connect(struct net_device *dev,
 }
 EXPORT_SYMBOL(of_phy_get_and_connect);
 
-/**
- * of_phy_attach - Attach to a PHY without starting the state machine
- * @dev: pointer to net_device claiming the phy
- * @phy_np: Node pointer for the PHY
- * @flags: flags to pass to the PHY
- * @iface: PHY data interface type
- *
- * If successful, returns a pointer to the phy_device with the embedded
- * struct device refcount incremented by one, or NULL on failure. The
- * refcount must be dropped by calling phy_disconnect() or phy_detach().
- */
-struct phy_device *of_phy_attach(struct net_device *dev,
-				 struct device_node *phy_np, u32 flags,
-				 phy_interface_t iface)
-{
-	struct phy_device *phy = of_phy_find_device(phy_np);
-	int ret;
-
-	if (!phy)
-		return NULL;
-
-	ret = phy_attach_direct(dev, phy, flags, iface);
-
-	/* refcount is held by phy_attach_direct() on success */
-	put_device(&phy->mdio.dev);
-
-	return ret ? NULL : phy;
-}
-EXPORT_SYMBOL(of_phy_attach);
-
 /*
  * of_phy_is_fixed_link() and of_phy_register_fixed_link() must
  * support two DT bindings:
diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h
index cfe8c607a628..2b05e7f7c238 100644
--- a/include/linux/of_mdio.h
+++ b/include/linux/of_mdio.h
@@ -26,9 +26,6 @@ of_phy_connect(struct net_device *dev, struct device_node *phy_np,
 struct phy_device *
 of_phy_get_and_connect(struct net_device *dev, struct device_node *np,
 		       void (*hndlr)(struct net_device *));
-struct phy_device *
-of_phy_attach(struct net_device *dev, struct device_node *phy_np,
-	      u32 flags, phy_interface_t iface);
 
 struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np);
 int of_phy_register_fixed_link(struct device_node *np);
@@ -100,13 +97,6 @@ of_phy_get_and_connect(struct net_device *dev, struct device_node *np,
 	return NULL;
 }
 
-static inline struct phy_device *of_phy_attach(struct net_device *dev,
-					       struct device_node *phy_np,
-					       u32 flags, phy_interface_t iface)
-{
-	return NULL;
-}
-
 static inline struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np)
 {
 	return NULL;
-- 
cgit v1.2.3


From 20e07e2c3cf310578ef19fb4f1e64dc9832abd9d Mon Sep 17 00:00:00 2001
From: Wong Vee Khee <vee.khee.wong@intel.com>
Date: Wed, 17 Feb 2021 17:57:05 +0800
Subject: net: stmmac: Add PCI bus info to ethtool driver query output

This patch populates the PCI bus info in the ethtool driver query data.

Users will be able to view PCI bus info using 'ethtool -i <interface>'.

Signed-off-by: Wong Vee Khee <vee.khee.wong@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c    | 1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 4 ++++
 include/linux/stmmac.h                               | 1 +
 3 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index 1c9c67b641a2..751dfdeec41c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -236,6 +236,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 	int ret;
 	int i;
 
+	plat->pdev = pdev;
 	plat->phy_addr = -1;
 	plat->clk_csr = 5;
 	plat->has_gmac = 0;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 9e54f953634b..c5642985ef95 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -268,6 +268,10 @@ static void stmmac_ethtool_getdrvinfo(struct net_device *dev,
 		strlcpy(info->driver, MAC100_ETHTOOL_NAME,
 			sizeof(info->driver));
 
+	if (priv->plat->pdev) {
+		strlcpy(info->bus_info, pci_name(priv->plat->pdev),
+			sizeof(info->bus_info));
+	}
 	strlcpy(info->version, DRV_MODULE_VERSION, sizeof(info->version));
 }
 
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 15ca6b4167cc..a302982de2d7 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -202,5 +202,6 @@ struct plat_stmmacenet_data {
 	bool vlan_fail_q_en;
 	u8 vlan_fail_q;
 	unsigned int eee_usecs_rate;
+	struct pci_dev *pdev;
 };
 #endif
-- 
cgit v1.2.3


From fade5cad9339a627c5ad029e3577582b6292df03 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 15 Jan 2021 13:46:03 +0800
Subject: initrd: Add the preprocessor guard in initrd.h

Add the preprocessor guard in initrd.h to prevent possible
build error from the multiple inclusion of same header file
multiple time.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 include/linux/initrd.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/initrd.h b/include/linux/initrd.h
index 8db6f8c8030b..fc30ac30e10e 100644
--- a/include/linux/initrd.h
+++ b/include/linux/initrd.h
@@ -1,5 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#ifndef __LINUX_INITRD_H
+#define __LINUX_INITRD_H
+
 #define INITRD_MINOR 250 /* shouldn't collide with /dev/ram* too soon ... */
 
 /* starting block # of image */
@@ -24,3 +27,5 @@ extern char __initramfs_start[];
 extern unsigned long __initramfs_size;
 
 void console_on_rootfs(void);
+
+#endif /* __LINUX_INITRD_H */
-- 
cgit v1.2.3


From c72160fe05fb978ad859ba053c4462c2bb960b13 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 15 Jan 2021 13:46:04 +0800
Subject: initramfs: Provide a common initrd reserve function

Some architectures(eg, ARM and riscv) have similar logic to
check and reserve the memory of initrd, let's provide a common
function reserve_initrd_mem() to reduce duplicated code.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 include/linux/initrd.h |  6 ++++++
 init/initramfs.c       | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/initrd.h b/include/linux/initrd.h
index fc30ac30e10e..85c15717af34 100644
--- a/include/linux/initrd.h
+++ b/include/linux/initrd.h
@@ -18,6 +18,12 @@ extern int initrd_below_start_ok;
 extern unsigned long initrd_start, initrd_end;
 extern void free_initrd_mem(unsigned long, unsigned long);
 
+#ifdef CONFIG_BLK_DEV_INITRD
+extern void __init reserve_initrd_mem(void);
+#else
+static inline void __init reserve_initrd_mem(void) {}
+#endif
+
 extern phys_addr_t phys_initrd_start;
 extern unsigned long phys_initrd_size;
 
diff --git a/init/initramfs.c b/init/initramfs.c
index 55b74d7e5260..f75c89e9d602 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -535,6 +535,51 @@ extern unsigned long __initramfs_size;
 #include <linux/initrd.h>
 #include <linux/kexec.h>
 
+void __init reserve_initrd_mem(void)
+{
+	phys_addr_t start;
+	unsigned long size;
+
+	/* Ignore the virtul address computed during device tree parsing */
+	initrd_start = initrd_end = 0;
+
+	if (!phys_initrd_size)
+		return;
+	/*
+	 * Round the memory region to page boundaries as per free_initrd_mem()
+	 * This allows us to detect whether the pages overlapping the initrd
+	 * are in use, but more importantly, reserves the entire set of pages
+	 * as we don't want these pages allocated for other purposes.
+	 */
+	start = round_down(phys_initrd_start, PAGE_SIZE);
+	size = phys_initrd_size + (phys_initrd_start - start);
+	size = round_up(size, PAGE_SIZE);
+
+	if (!memblock_is_region_memory(start, size)) {
+		pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region",
+		       (u64)start, size);
+		goto disable;
+	}
+
+	if (memblock_is_region_reserved(start, size)) {
+		pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region\n",
+		       (u64)start, size);
+		goto disable;
+	}
+
+	memblock_reserve(start, size);
+	/* Now convert initrd to virtual addresses */
+	initrd_start = (unsigned long)__va(phys_initrd_start);
+	initrd_end = initrd_start + phys_initrd_size;
+	initrd_below_start_ok = 1;
+
+	return;
+disable:
+	pr_cont(" - disabling initrd\n");
+	initrd_start = 0;
+	initrd_end = 0;
+}
+
 void __weak __init free_initrd_mem(unsigned long start, unsigned long end)
 {
 #ifdef CONFIG_ARCH_KEEP_MEMBLOCK
-- 
cgit v1.2.3


From af0bfab907a011e146304d20d81dddce4e4d62d0 Mon Sep 17 00:00:00 2001
From: Abanoub Sameh <abanoubsameh8@gmail.com>
Date: Fri, 11 Dec 2020 22:42:08 +0200
Subject: leds: led-core: Get rid of enum led_brightness

This gets rid of enum led_brightness in the main led files,
because it is deprecated, and an unsigned int can be used instead.

We can get rid of led_brightness completely and
patches can also be supplied for the other drivers' files.

Signed-off-by: Abanoub Sameh <abanoubsameh@protonmail.com>
Signed-off-by: Pavel Machek <pavel@ucw.cz>
---
 drivers/leds/led-class.c |  3 +--
 drivers/leds/led-core.c  | 20 +++++++-------------
 drivers/leds/leds.h      |  6 ++----
 include/linux/leds.h     | 12 +++++-------
 4 files changed, 15 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 131ca83f5fb3..2e495ff67856 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -145,8 +145,7 @@ static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev)
 	device_remove_file(led_cdev->dev, &dev_attr_brightness_hw_changed);
 }
 
-void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev,
-					       enum led_brightness brightness)
+void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev, unsigned int brightness)
 {
 	if (WARN_ON(!led_cdev->brightness_hw_changed_kn))
 		return;
diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index c4e780bdb385..8eb8054ef9c6 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -39,8 +39,7 @@ const char * const led_colors[LED_COLOR_ID_MAX] = {
 };
 EXPORT_SYMBOL_GPL(led_colors);
 
-static int __led_set_brightness(struct led_classdev *led_cdev,
-				enum led_brightness value)
+static int __led_set_brightness(struct led_classdev *led_cdev, unsigned int value)
 {
 	if (!led_cdev->brightness_set)
 		return -ENOTSUPP;
@@ -50,8 +49,7 @@ static int __led_set_brightness(struct led_classdev *led_cdev,
 	return 0;
 }
 
-static int __led_set_brightness_blocking(struct led_classdev *led_cdev,
-					 enum led_brightness value)
+static int __led_set_brightness_blocking(struct led_classdev *led_cdev, unsigned int value)
 {
 	if (!led_cdev->brightness_set_blocking)
 		return -ENOTSUPP;
@@ -240,8 +238,7 @@ void led_stop_software_blink(struct led_classdev *led_cdev)
 }
 EXPORT_SYMBOL_GPL(led_stop_software_blink);
 
-void led_set_brightness(struct led_classdev *led_cdev,
-			enum led_brightness brightness)
+void led_set_brightness(struct led_classdev *led_cdev, unsigned int brightness)
 {
 	/*
 	 * If software blink is active, delay brightness setting
@@ -253,7 +250,7 @@ void led_set_brightness(struct led_classdev *led_cdev,
 		 * work queue task to avoid problems in case we are called
 		 * from hard irq context.
 		 */
-		if (brightness == LED_OFF) {
+		if (!brightness) {
 			set_bit(LED_BLINK_DISABLE, &led_cdev->work_flags);
 			schedule_work(&led_cdev->set_brightness_work);
 		} else {
@@ -268,8 +265,7 @@ void led_set_brightness(struct led_classdev *led_cdev,
 }
 EXPORT_SYMBOL_GPL(led_set_brightness);
 
-void led_set_brightness_nopm(struct led_classdev *led_cdev,
-			      enum led_brightness value)
+void led_set_brightness_nopm(struct led_classdev *led_cdev, unsigned int value)
 {
 	/* Use brightness_set op if available, it is guaranteed not to sleep */
 	if (!__led_set_brightness(led_cdev, value))
@@ -281,8 +277,7 @@ void led_set_brightness_nopm(struct led_classdev *led_cdev,
 }
 EXPORT_SYMBOL_GPL(led_set_brightness_nopm);
 
-void led_set_brightness_nosleep(struct led_classdev *led_cdev,
-				enum led_brightness value)
+void led_set_brightness_nosleep(struct led_classdev *led_cdev, unsigned int value)
 {
 	led_cdev->brightness = min(value, led_cdev->max_brightness);
 
@@ -293,8 +288,7 @@ void led_set_brightness_nosleep(struct led_classdev *led_cdev,
 }
 EXPORT_SYMBOL_GPL(led_set_brightness_nosleep);
 
-int led_set_brightness_sync(struct led_classdev *led_cdev,
-			    enum led_brightness value)
+int led_set_brightness_sync(struct led_classdev *led_cdev, unsigned int value)
 {
 	if (led_cdev->blink_delay_on || led_cdev->blink_delay_off)
 		return -EBUSY;
diff --git a/drivers/leds/leds.h b/drivers/leds/leds.h
index 2d9eb48bbed9..345062ccabda 100644
--- a/drivers/leds/leds.h
+++ b/drivers/leds/leds.h
@@ -19,10 +19,8 @@ static inline int led_get_brightness(struct led_classdev *led_cdev)
 
 void led_init_core(struct led_classdev *led_cdev);
 void led_stop_software_blink(struct led_classdev *led_cdev);
-void led_set_brightness_nopm(struct led_classdev *led_cdev,
-				enum led_brightness value);
-void led_set_brightness_nosleep(struct led_classdev *led_cdev,
-				enum led_brightness value);
+void led_set_brightness_nopm(struct led_classdev *led_cdev, unsigned int value);
+void led_set_brightness_nosleep(struct led_classdev *led_cdev, unsigned int value);
 ssize_t led_trigger_read(struct file *filp, struct kobject *kobj,
 			struct bin_attribute *attr, char *buf,
 			loff_t pos, size_t count);
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 6a8d6409c993..329fd914cf24 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -63,8 +63,8 @@ struct led_hw_trigger_type {
 
 struct led_classdev {
 	const char		*name;
-	enum led_brightness	 brightness;
-	enum led_brightness	 max_brightness;
+	unsigned int brightness;
+	unsigned int max_brightness;
 	int			 flags;
 
 	/* Lower 16 bits reflect status */
@@ -253,8 +253,7 @@ void led_blink_set_oneshot(struct led_classdev *led_cdev,
  * software blink timer that implements blinking when the
  * hardware doesn't. This function is guaranteed not to sleep.
  */
-void led_set_brightness(struct led_classdev *led_cdev,
-			enum led_brightness brightness);
+void led_set_brightness(struct led_classdev *led_cdev, unsigned int brightness);
 
 /**
  * led_set_brightness_sync - set LED brightness synchronously
@@ -267,8 +266,7 @@ void led_set_brightness(struct led_classdev *led_cdev,
  *
  * Returns: 0 on success or negative error value on failure
  */
-int led_set_brightness_sync(struct led_classdev *led_cdev,
-			    enum led_brightness value);
+int led_set_brightness_sync(struct led_classdev *led_cdev, unsigned int value);
 
 /**
  * led_update_brightness - update LED brightness
@@ -565,7 +563,7 @@ static inline void ledtrig_cpu(enum cpu_led_event evt)
 
 #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED
 void led_classdev_notify_brightness_hw_changed(
-	struct led_classdev *led_cdev, enum led_brightness brightness);
+	struct led_classdev *led_cdev, unsigned int brightness);
 #else
 static inline void led_classdev_notify_brightness_hw_changed(
 	struct led_classdev *led_cdev, enum led_brightness brightness) { }
-- 
cgit v1.2.3


From 8e5c38a33c84935d66cfcf23c96960b6c4b484ef Mon Sep 17 00:00:00 2001
From: Gene Chen <gene_chen@richtek.com>
Date: Mon, 21 Dec 2020 18:45:50 +0800
Subject: leds: flash: Add flash registration with undefined
 CONFIG_LEDS_CLASS_FLASH

Add flash registration with undefined CONFIG_LEDS_CLASS_FLASH,
and move the same registration functions outside of #ifdef block.

Signed-off-by: Gene Chen <gene_chen@richtek.com>
Acked-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
Signed-off-by: Pavel Machek <pavel@ucw.cz>
---
 include/linux/led-class-flash.h | 42 ++++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/led-class-flash.h b/include/linux/led-class-flash.h
index 21a3358a1731..612b4cab3819 100644
--- a/include/linux/led-class-flash.h
+++ b/include/linux/led-class-flash.h
@@ -85,6 +85,7 @@ static inline struct led_classdev_flash *lcdev_to_flcdev(
 	return container_of(lcdev, struct led_classdev_flash, led_cdev);
 }
 
+#if IS_ENABLED(CONFIG_LEDS_CLASS_FLASH)
 /**
  * led_classdev_flash_register_ext - register a new object of LED class with
  *				     init data and with support for flash LEDs
@@ -98,12 +99,6 @@ int led_classdev_flash_register_ext(struct device *parent,
 				    struct led_classdev_flash *fled_cdev,
 				    struct led_init_data *init_data);
 
-static inline int led_classdev_flash_register(struct device *parent,
-					   struct led_classdev_flash *fled_cdev)
-{
-	return led_classdev_flash_register_ext(parent, fled_cdev, NULL);
-}
-
 /**
  * led_classdev_flash_unregister - unregisters an object of led_classdev class
  *				   with support for flash LEDs
@@ -118,15 +113,44 @@ int devm_led_classdev_flash_register_ext(struct device *parent,
 				     struct led_init_data *init_data);
 
 
+void devm_led_classdev_flash_unregister(struct device *parent,
+					struct led_classdev_flash *fled_cdev);
+
+#else
+
+static inline int led_classdev_flash_register_ext(struct device *parent,
+				    struct led_classdev_flash *fled_cdev,
+				    struct led_init_data *init_data)
+{
+	return 0;
+}
+
+static inline void led_classdev_flash_unregister(struct led_classdev_flash *fled_cdev) {};
+static inline int devm_led_classdev_flash_register_ext(struct device *parent,
+				     struct led_classdev_flash *fled_cdev,
+				     struct led_init_data *init_data)
+{
+	return 0;
+}
+
+static inline void devm_led_classdev_flash_unregister(struct device *parent,
+					struct led_classdev_flash *fled_cdev)
+{};
+
+#endif  /* IS_ENABLED(CONFIG_LEDS_CLASS_FLASH) */
+
+static inline int led_classdev_flash_register(struct device *parent,
+					   struct led_classdev_flash *fled_cdev)
+{
+	return led_classdev_flash_register_ext(parent, fled_cdev, NULL);
+}
+
 static inline int devm_led_classdev_flash_register(struct device *parent,
 				     struct led_classdev_flash *fled_cdev)
 {
 	return devm_led_classdev_flash_register_ext(parent, fled_cdev, NULL);
 }
 
-void devm_led_classdev_flash_unregister(struct device *parent,
-					struct led_classdev_flash *fled_cdev);
-
 /**
  * led_set_flash_strobe - setup flash strobe
  * @fled_cdev: the flash LED to set strobe on
-- 
cgit v1.2.3


From 6039b7e87be0b350a5f8fc135adfb5d1f4ba66ad Mon Sep 17 00:00:00 2001
From: Gene Chen <gene_chen@richtek.com>
Date: Mon, 21 Dec 2020 18:45:51 +0800
Subject: leds: flash: Fix multicolor no-ops registration by return 0

Fix multicolor no-ops registration by return 0,
and move the same registration functions outside of #ifdef block.

Signed-off-by: Gene Chen <gene_chen@richtek.com>
Acked-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
Signed-off-by: Pavel Machek <pavel@ucw.cz>
---
 include/linux/led-class-multicolor.h | 42 +++++++++++++-----------------------
 1 file changed, 15 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/led-class-multicolor.h b/include/linux/led-class-multicolor.h
index 5116f9a866cc..210d57bcd767 100644
--- a/include/linux/led-class-multicolor.h
+++ b/include/linux/led-class-multicolor.h
@@ -44,12 +44,6 @@ int led_classdev_multicolor_register_ext(struct device *parent,
 					    struct led_classdev_mc *mcled_cdev,
 					    struct led_init_data *init_data);
 
-static inline int led_classdev_multicolor_register(struct device *parent,
-					    struct led_classdev_mc *mcled_cdev)
-{
-	return led_classdev_multicolor_register_ext(parent, mcled_cdev, NULL);
-}
-
 /**
  * led_classdev_multicolor_unregister - unregisters an object of led_classdev
  *					class with support for multicolor LEDs
@@ -68,13 +62,6 @@ int devm_led_classdev_multicolor_register_ext(struct device *parent,
 					  struct led_classdev_mc *mcled_cdev,
 					  struct led_init_data *init_data);
 
-static inline int devm_led_classdev_multicolor_register(struct device *parent,
-				     struct led_classdev_mc *mcled_cdev)
-{
-	return devm_led_classdev_multicolor_register_ext(parent, mcled_cdev,
-							 NULL);
-}
-
 void devm_led_classdev_multicolor_unregister(struct device *parent,
 					    struct led_classdev_mc *mcled_cdev);
 #else
@@ -83,27 +70,33 @@ static inline int led_classdev_multicolor_register_ext(struct device *parent,
 					    struct led_classdev_mc *mcled_cdev,
 					    struct led_init_data *init_data)
 {
-	return -EINVAL;
-}
-
-static inline int led_classdev_multicolor_register(struct device *parent,
-					    struct led_classdev_mc *mcled_cdev)
-{
-	return led_classdev_multicolor_register_ext(parent, mcled_cdev, NULL);
+	return 0;
 }
 
 static inline void led_classdev_multicolor_unregister(struct led_classdev_mc *mcled_cdev) {};
 static inline int led_mc_calc_color_components(struct led_classdev_mc *mcled_cdev,
 					       enum led_brightness brightness)
 {
-	return -EINVAL;
+	return 0;
 }
 
 static inline int devm_led_classdev_multicolor_register_ext(struct device *parent,
 					  struct led_classdev_mc *mcled_cdev,
 					  struct led_init_data *init_data)
 {
-	return -EINVAL;
+	return 0;
+}
+
+static inline void devm_led_classdev_multicolor_unregister(struct device *parent,
+					    struct led_classdev_mc *mcled_cdev)
+{};
+
+#endif  /* IS_ENABLED(CONFIG_LEDS_CLASS_MULTICOLOR) */
+
+static inline int led_classdev_multicolor_register(struct device *parent,
+					    struct led_classdev_mc *mcled_cdev)
+{
+	return led_classdev_multicolor_register_ext(parent, mcled_cdev, NULL);
 }
 
 static inline int devm_led_classdev_multicolor_register(struct device *parent,
@@ -113,9 +106,4 @@ static inline int devm_led_classdev_multicolor_register(struct device *parent,
 							 NULL);
 }
 
-static inline void devm_led_classdev_multicolor_unregister(struct device *parent,
-					    struct led_classdev_mc *mcled_cdev)
-{};
-
-#endif  /* IS_ENABLED(CONFIG_LEDS_CLASS_MULTICOLOR) */
 #endif	/* _LINUX_MULTICOLOR_LEDS_H_INCLUDED */
-- 
cgit v1.2.3


From 36950f2da1ea4cb683be174f6f581e25b2d33e71 Mon Sep 17 00:00:00 2001
From: Jianxiong Gao <jxgao@google.com>
Date: Mon, 1 Feb 2021 10:30:15 -0800
Subject: driver core: add a min_align_mask field to struct
 device_dma_parameters

Some devices rely on the address offset in a page to function
correctly (NVMe driver as an example). These devices may use
a different page size than the Linux kernel. The address offset
has to be preserved upon mapping, and in order to do so, we
need to record the page_offset_mask first.

Signed-off-by: Jianxiong Gao <jxgao@google.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 include/linux/device.h      |  1 +
 include/linux/dma-mapping.h | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 1779f90eeb4c..7960bf516dd7 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -291,6 +291,7 @@ struct device_dma_parameters {
 	 * sg limitations.
 	 */
 	unsigned int max_segment_size;
+	unsigned int min_align_mask;
 	unsigned long segment_boundary_mask;
 };
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2e49996a8f39..9c26225754e7 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -500,6 +500,22 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
 	return -EIO;
 }
 
+static inline unsigned int dma_get_min_align_mask(struct device *dev)
+{
+	if (dev->dma_parms)
+		return dev->dma_parms->min_align_mask;
+	return 0;
+}
+
+static inline int dma_set_min_align_mask(struct device *dev,
+		unsigned int min_align_mask)
+{
+	if (WARN_ON_ONCE(!dev->dma_parms))
+		return -EIO;
+	dev->dma_parms->min_align_mask = min_align_mask;
+	return 0;
+}
+
 static inline int dma_get_cache_alignment(void)
 {
 #ifdef ARCH_DMA_MINALIGN
-- 
cgit v1.2.3


From b5d7ccb7aac3895c2138fe0980a109116ce15eff Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Feb 2021 11:18:40 +0100
Subject: swiotlb: add a IO_TLB_SIZE define

Add a new IO_TLB_SIZE define instead open coding it using
IO_TLB_SHIFT all over.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jianxiong Gao <jxgao@google.com>
Tested-by: Jianxiong Gao <jxgao@google.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 include/linux/swiotlb.h |  1 +
 kernel/dma/swiotlb.c    | 14 +++++++-------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index d9c9fc9ca5d2..5857a937c637 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -29,6 +29,7 @@ enum swiotlb_force {
  * controllable.
  */
 #define IO_TLB_SHIFT 11
+#define IO_TLB_SIZE (1 << IO_TLB_SHIFT)
 
 /* default to 64MB */
 #define IO_TLB_DEFAULT_SIZE (64UL<<20)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 7c42df6e6100..768187d0e17e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -171,7 +171,7 @@ void __init swiotlb_adjust_size(unsigned long new_size)
 	 * adjust/expand SWIOTLB size for their use.
 	 */
 	if (!io_tlb_nslabs) {
-		size = ALIGN(new_size, 1 << IO_TLB_SHIFT);
+		size = ALIGN(new_size, IO_TLB_SIZE);
 		io_tlb_nslabs = size >> IO_TLB_SHIFT;
 		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
 
@@ -491,20 +491,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
 
 	tbl_dma_addr &= mask;
 
-	offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	offset_slots = ALIGN(tbl_dma_addr, IO_TLB_SIZE) >> IO_TLB_SHIFT;
 
 	/*
 	 * Carefully handle integer overflow which can occur when mask == ~0UL.
 	 */
 	max_slots = mask + 1
-		    ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
+		    ? ALIGN(mask + 1, IO_TLB_SIZE) >> IO_TLB_SHIFT
 		    : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
 
 	/*
 	 * For mappings greater than or equal to a page, we limit the stride
 	 * (and hence alignment) to a page size.
 	 */
-	nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT;
 	if (alloc_size >= PAGE_SIZE)
 		stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
 	else
@@ -598,7 +598,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 			      enum dma_data_direction dir, unsigned long attrs)
 {
 	unsigned long flags;
-	int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	int i, count, nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT;
 	int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
 	phys_addr_t orig_addr = io_tlb_orig_addr[index];
 
@@ -649,7 +649,7 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
 
 	if (orig_addr == INVALID_PHYS_ADDR)
 		return;
-	orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1);
+	orig_addr += (unsigned long)tlb_addr & (IO_TLB_SIZE - 1);
 
 	switch (target) {
 	case SYNC_FOR_CPU:
@@ -707,7 +707,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
 
 size_t swiotlb_max_mapping_size(struct device *dev)
 {
-	return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
+	return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
 }
 
 bool is_swiotlb_active(void)
-- 
cgit v1.2.3


From 5aa75ed5b93f086c455a3c67239b0471ff5a1526 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Feb 2021 12:56:50 -0700
Subject: io_uring: tie async worker side to the task context

Move it outside of the io_ring_ctx, and tie it to the io_uring task
context.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c            | 84 ++++++++++++++++++++----------------------------
 include/linux/io_uring.h |  1 +
 2 files changed, 35 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d6c2ff6124fd..31402a19fca6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -366,9 +366,6 @@ struct io_ring_ctx {
 
 	struct io_rings	*rings;
 
-	/* IO offload */
-	struct io_wq		*io_wq;
-
 	/*
 	 * For SQPOLL usage - we hold a reference to the parent task, so we
 	 * have access to the ->files
@@ -1634,10 +1631,11 @@ static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *link = io_prep_linked_timeout(req);
+	struct io_uring_task *tctx = req->task->io_uring;
 
 	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
 					&req->work, req->flags);
-	io_wq_enqueue(ctx->io_wq, &req->work);
+	io_wq_enqueue(tctx->io_wq, &req->work);
 	return link;
 }
 
@@ -5960,12 +5958,15 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data)
 	return req->user_data == (unsigned long) data;
 }
 
-static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
+static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
 {
 	enum io_wq_cancel cancel_ret;
 	int ret = 0;
 
-	cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
+	if (!tctx->io_wq)
+		return -ENOENT;
+
+	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
 	switch (cancel_ret) {
 	case IO_WQ_CANCEL_OK:
 		ret = 0;
@@ -5988,7 +5989,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
 	unsigned long flags;
 	int ret;
 
-	ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
+	ret = io_async_cancel_one(req->task->io_uring,
+					(void *) (unsigned long) sqe_addr);
 	if (ret != -ENOENT) {
 		spin_lock_irqsave(&ctx->completion_lock, flags);
 		goto done;
@@ -7537,16 +7539,6 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 	}
 }
 
-static void io_finish_async(struct io_ring_ctx *ctx)
-{
-	io_sq_thread_stop(ctx);
-
-	if (ctx->io_wq) {
-		io_wq_destroy(ctx->io_wq);
-		ctx->io_wq = NULL;
-	}
-}
-
 #if defined(CONFIG_UNIX)
 /*
  * Ensure the UNIX gc is aware of our file set, so we are certain that
@@ -8105,11 +8097,10 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work)
 	return req ? &req->work : NULL;
 }
 
-static int io_init_wq_offload(struct io_ring_ctx *ctx)
+static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
 {
 	struct io_wq_data data;
 	unsigned int concurrency;
-	int ret = 0;
 
 	data.user = ctx->user;
 	data.free_work = io_free_work;
@@ -8118,16 +8109,11 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx)
 	/* Do QD, or 4 * CPUS, whatever is smallest */
 	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
 
-	ctx->io_wq = io_wq_create(concurrency, &data);
-	if (IS_ERR(ctx->io_wq)) {
-		ret = PTR_ERR(ctx->io_wq);
-		ctx->io_wq = NULL;
-	}
-
-	return ret;
+	return io_wq_create(concurrency, &data);
 }
 
-static int io_uring_alloc_task_context(struct task_struct *task)
+static int io_uring_alloc_task_context(struct task_struct *task,
+				       struct io_ring_ctx *ctx)
 {
 	struct io_uring_task *tctx;
 	int ret;
@@ -8142,6 +8128,14 @@ static int io_uring_alloc_task_context(struct task_struct *task)
 		return ret;
 	}
 
+	tctx->io_wq = io_init_wq_offload(ctx);
+	if (IS_ERR(tctx->io_wq)) {
+		ret = PTR_ERR(tctx->io_wq);
+		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx);
+		return ret;
+	}
+
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
 	tctx->last = NULL;
@@ -8214,7 +8208,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 			ctx->sq_thread_idle = HZ;
 
 		if (sqd->thread)
-			goto done;
+			return 0;
 
 		if (p->flags & IORING_SETUP_SQ_AFF) {
 			int cpu = p->sq_thread_cpu;
@@ -8236,7 +8230,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 			sqd->thread = NULL;
 			goto err;
 		}
-		ret = io_uring_alloc_task_context(sqd->thread);
+		ret = io_uring_alloc_task_context(sqd->thread, ctx);
 		if (ret)
 			goto err;
 	} else if (p->flags & IORING_SETUP_SQ_AFF) {
@@ -8245,14 +8239,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 		goto err;
 	}
 
-done:
-	ret = io_init_wq_offload(ctx);
-	if (ret)
-		goto err;
-
 	return 0;
 err:
-	io_finish_async(ctx);
+	io_sq_thread_stop(ctx);
 	return ret;
 }
 
@@ -8727,7 +8716,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	mutex_lock(&ctx->uring_lock);
 	mutex_unlock(&ctx->uring_lock);
 
-	io_finish_async(ctx);
+	io_sq_thread_stop(ctx);
 	io_sqe_buffers_unregister(ctx);
 
 	if (ctx->sqo_task) {
@@ -8870,13 +8859,6 @@ static void io_ring_exit_work(struct work_struct *work)
 	io_ring_ctx_free(ctx);
 }
 
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
-{
-	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
-	return req->ctx == data;
-}
-
 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
 	mutex_lock(&ctx->uring_lock);
@@ -8895,9 +8877,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	io_kill_timeouts(ctx, NULL, NULL);
 	io_poll_remove_all(ctx, NULL, NULL);
 
-	if (ctx->io_wq)
-		io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
-
 	/* if we failed setting up the ctx, we might not have any rings */
 	io_iopoll_try_reap_events(ctx);
 
@@ -8976,13 +8955,14 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 struct files_struct *files)
 {
 	struct io_task_cancel cancel = { .task = task, .files = files, };
+	struct io_uring_task *tctx = current->io_uring;
 
 	while (1) {
 		enum io_wq_cancel cret;
 		bool ret = false;
 
-		if (ctx->io_wq) {
-			cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
+		if (tctx && tctx->io_wq) {
+			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
 					       &cancel, true);
 			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
 		}
@@ -9094,7 +9074,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
 	int ret;
 
 	if (unlikely(!tctx)) {
-		ret = io_uring_alloc_task_context(current);
+		ret = io_uring_alloc_task_context(current, ctx);
 		if (unlikely(ret))
 			return ret;
 		tctx = current->io_uring;
@@ -9164,8 +9144,12 @@ void __io_uring_files_cancel(struct files_struct *files)
 		io_uring_cancel_task_requests(file->private_data, files);
 	atomic_dec(&tctx->in_idle);
 
-	if (files)
+	if (files) {
 		io_uring_remove_task_files(tctx);
+	} else if (tctx->io_wq && current->flags & PF_EXITING) {
+		io_wq_destroy(tctx->io_wq);
+		tctx->io_wq = NULL;
+	}
 }
 
 static s64 tctx_inflight(struct io_uring_task *tctx)
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 2eb6d19de336..0e95398998b6 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -36,6 +36,7 @@ struct io_uring_task {
 	struct xarray		xa;
 	struct wait_queue_head	wait;
 	struct file		*last;
+	void			*io_wq;
 	struct percpu_counter	inflight;
 	struct io_identity	__identity;
 	struct io_identity	*identity;
-- 
cgit v1.2.3


From 3bfe6106693b6b4ba175ad1f929c4660b8f59ca8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Feb 2021 14:15:30 -0700
Subject: io-wq: fork worker threads from original task

Instead of using regular kthread kernel threads, create kernel threads
that are like a real thread that the task would create. This ensures that
we get all the context that we need, without having to carry that state
around. This greatly reduces the code complexity, and the risk of missing
state for a given request type.

With the move away from kthread, we can also dump everything related to
assigned state to the new threads.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c            | 301 +++++++++++++++++---------------------------------
 fs/io-wq.h            |   3 +-
 fs/io_uring.c         |   7 ++
 include/linux/sched.h |   3 +
 4 files changed, 116 insertions(+), 198 deletions(-)

(limited to 'include/linux')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index ec7f1106b659..b53f569b5b4e 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -13,12 +13,9 @@
 #include <linux/sched/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
-#include <linux/kthread.h>
 #include <linux/rculist_nulls.h>
-#include <linux/fs_struct.h>
-#include <linux/blk-cgroup.h>
-#include <linux/audit.h>
 #include <linux/cpu.h>
+#include <linux/tracehook.h>
 
 #include "../kernel/sched/sched.h"
 #include "io-wq.h"
@@ -57,13 +54,6 @@ struct io_worker {
 	spinlock_t lock;
 
 	struct rcu_head rcu;
-	struct mm_struct *mm;
-#ifdef CONFIG_BLK_CGROUP
-	struct cgroup_subsys_state *blkcg_css;
-#endif
-	const struct cred *cur_creds;
-	const struct cred *saved_creds;
-	struct nsproxy *restore_nsproxy;
 };
 
 #if BITS_PER_LONG == 64
@@ -122,6 +112,8 @@ struct io_wq {
 	struct completion done;
 
 	struct hlist_node cpuhp_node;
+
+	pid_t task_pid;
 };
 
 static enum cpuhp_state io_wq_online;
@@ -137,61 +129,6 @@ static void io_worker_release(struct io_worker *worker)
 		wake_up_process(worker->task);
 }
 
-/*
- * Note: drops the wqe->lock if returning true! The caller must re-acquire
- * the lock in that case. Some callers need to restart handling if this
- * happens, so we can't just re-acquire the lock on behalf of the caller.
- */
-static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
-{
-	bool dropped_lock = false;
-
-	if (worker->saved_creds) {
-		revert_creds(worker->saved_creds);
-		worker->cur_creds = worker->saved_creds = NULL;
-	}
-
-	if (current->files) {
-		__acquire(&wqe->lock);
-		raw_spin_unlock_irq(&wqe->lock);
-		dropped_lock = true;
-
-		task_lock(current);
-		current->files = NULL;
-		current->nsproxy = worker->restore_nsproxy;
-		task_unlock(current);
-	}
-
-	if (current->fs)
-		current->fs = NULL;
-
-	/*
-	 * If we have an active mm, we need to drop the wq lock before unusing
-	 * it. If we do, return true and let the caller retry the idle loop.
-	 */
-	if (worker->mm) {
-		if (!dropped_lock) {
-			__acquire(&wqe->lock);
-			raw_spin_unlock_irq(&wqe->lock);
-			dropped_lock = true;
-		}
-		__set_current_state(TASK_RUNNING);
-		kthread_unuse_mm(worker->mm);
-		mmput(worker->mm);
-		worker->mm = NULL;
-	}
-
-#ifdef CONFIG_BLK_CGROUP
-	if (worker->blkcg_css) {
-		kthread_associate_blkcg(NULL);
-		worker->blkcg_css = NULL;
-	}
-#endif
-	if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
-		current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-	return dropped_lock;
-}
-
 static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
 						   struct io_wq_work *work)
 {
@@ -237,10 +174,6 @@ static void io_worker_exit(struct io_worker *worker)
 	raw_spin_lock_irq(&wqe->lock);
 	hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
-	if (__io_worker_unuse(wqe, worker)) {
-		__release(&wqe->lock);
-		raw_spin_lock_irq(&wqe->lock);
-	}
 	acct->nr_workers--;
 	raw_spin_unlock_irq(&wqe->lock);
 
@@ -323,14 +256,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
 
 static void io_worker_start(struct io_worker *worker)
 {
-	allow_kernel_signal(SIGINT);
-
-	current->flags |= PF_IO_WORKER;
-	current->fs = NULL;
-	current->files = NULL;
-
 	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
-	worker->restore_nsproxy = current->nsproxy;
 	io_wqe_inc_running(worker);
 }
 
@@ -387,7 +313,7 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
 		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 	}
 
-	return __io_worker_unuse(wqe, worker);
+	return false;
 }
 
 static inline unsigned int io_get_work_hash(struct io_wq_work *work)
@@ -426,96 +352,23 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 	return NULL;
 }
 
-static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
+static void io_flush_signals(void)
 {
-	if (worker->mm) {
-		kthread_unuse_mm(worker->mm);
-		mmput(worker->mm);
-		worker->mm = NULL;
+	if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) {
+		if (current->task_works)
+			task_work_run();
+		clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL);
 	}
-
-	if (mmget_not_zero(work->identity->mm)) {
-		kthread_use_mm(work->identity->mm);
-		worker->mm = work->identity->mm;
-		return;
-	}
-
-	/* failed grabbing mm, ensure work gets cancelled */
-	work->flags |= IO_WQ_WORK_CANCEL;
-}
-
-static inline void io_wq_switch_blkcg(struct io_worker *worker,
-				      struct io_wq_work *work)
-{
-#ifdef CONFIG_BLK_CGROUP
-	if (!(work->flags & IO_WQ_WORK_BLKCG))
-		return;
-	if (work->identity->blkcg_css != worker->blkcg_css) {
-		kthread_associate_blkcg(work->identity->blkcg_css);
-		worker->blkcg_css = work->identity->blkcg_css;
-	}
-#endif
-}
-
-static void io_wq_switch_creds(struct io_worker *worker,
-			       struct io_wq_work *work)
-{
-	const struct cred *old_creds = override_creds(work->identity->creds);
-
-	worker->cur_creds = work->identity->creds;
-	if (worker->saved_creds)
-		put_cred(old_creds); /* creds set by previous switch */
-	else
-		worker->saved_creds = old_creds;
-}
-
-static void io_impersonate_work(struct io_worker *worker,
-				struct io_wq_work *work)
-{
-	if ((work->flags & IO_WQ_WORK_FILES) &&
-	    current->files != work->identity->files) {
-		task_lock(current);
-		current->files = work->identity->files;
-		current->nsproxy = work->identity->nsproxy;
-		task_unlock(current);
-		if (!work->identity->files) {
-			/* failed grabbing files, ensure work gets cancelled */
-			work->flags |= IO_WQ_WORK_CANCEL;
-		}
-	}
-	if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
-		current->fs = work->identity->fs;
-	if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
-		io_wq_switch_mm(worker, work);
-	if ((work->flags & IO_WQ_WORK_CREDS) &&
-	    worker->cur_creds != work->identity->creds)
-		io_wq_switch_creds(worker, work);
-	if (work->flags & IO_WQ_WORK_FSIZE)
-		current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
-	else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
-		current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-	io_wq_switch_blkcg(worker, work);
-#ifdef CONFIG_AUDIT
-	current->loginuid = work->identity->loginuid;
-	current->sessionid = work->identity->sessionid;
-#endif
 }
 
 static void io_assign_current_work(struct io_worker *worker,
 				   struct io_wq_work *work)
 {
 	if (work) {
-		/* flush pending signals before assigning new work */
-		if (signal_pending(current))
-			flush_signals(current);
+		io_flush_signals();
 		cond_resched();
 	}
 
-#ifdef CONFIG_AUDIT
-	current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
-	current->sessionid = AUDIT_SID_UNSET;
-#endif
-
 	spin_lock_irq(&worker->lock);
 	worker->cur_work = work;
 	spin_unlock_irq(&worker->lock);
@@ -556,7 +409,6 @@ get_next:
 			unsigned int hash = io_get_work_hash(work);
 
 			next_hashed = wq_next_work(work);
-			io_impersonate_work(worker, work);
 			wq->do_work(work);
 			io_assign_current_work(worker, NULL);
 
@@ -608,10 +460,11 @@ loop:
 			goto loop;
 		}
 		raw_spin_unlock_irq(&wqe->lock);
-		if (signal_pending(current))
-			flush_signals(current);
+		io_flush_signals();
 		if (schedule_timeout(WORKER_IDLE_TIMEOUT))
 			continue;
+		if (fatal_signal_pending(current))
+			break;
 		/* timed out, exit unless we're the fixed worker */
 		if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
 		    !(worker->flags & IO_WORKER_F_FIXED))
@@ -635,8 +488,10 @@ loop:
  */
 void io_wq_worker_running(struct task_struct *tsk)
 {
-	struct io_worker *worker = kthread_data(tsk);
+	struct io_worker *worker = tsk->pf_io_worker;
 
+	if (!worker)
+		return;
 	if (!(worker->flags & IO_WORKER_F_UP))
 		return;
 	if (worker->flags & IO_WORKER_F_RUNNING)
@@ -652,9 +507,10 @@ void io_wq_worker_running(struct task_struct *tsk)
  */
 void io_wq_worker_sleeping(struct task_struct *tsk)
 {
-	struct io_worker *worker = kthread_data(tsk);
-	struct io_wqe *wqe = worker->wqe;
+	struct io_worker *worker = tsk->pf_io_worker;
 
+	if (!worker)
+		return;
 	if (!(worker->flags & IO_WORKER_F_UP))
 		return;
 	if (!(worker->flags & IO_WORKER_F_RUNNING))
@@ -662,32 +518,27 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
 
 	worker->flags &= ~IO_WORKER_F_RUNNING;
 
-	raw_spin_lock_irq(&wqe->lock);
+	raw_spin_lock_irq(&worker->wqe->lock);
 	io_wqe_dec_running(worker);
-	raw_spin_unlock_irq(&wqe->lock);
+	raw_spin_unlock_irq(&worker->wqe->lock);
 }
 
-static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+static int task_thread(void *data, int index)
 {
+	struct io_worker *worker = data;
+	struct io_wqe *wqe = worker->wqe;
 	struct io_wqe_acct *acct = &wqe->acct[index];
-	struct io_worker *worker;
+	struct io_wq *wq = wqe->wq;
+	char buf[TASK_COMM_LEN];
 
-	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
-	if (!worker)
-		return false;
+	sprintf(buf, "iou-wrk-%d", wq->task_pid);
+	set_task_comm(current, buf);
 
-	refcount_set(&worker->ref, 1);
-	worker->nulls_node.pprev = NULL;
-	worker->wqe = wqe;
-	spin_lock_init(&worker->lock);
+	current->pf_io_worker = worker;
+	worker->task = current;
 
-	worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
-				"io_wqe_worker-%d/%d", index, wqe->node);
-	if (IS_ERR(worker->task)) {
-		kfree(worker);
-		return false;
-	}
-	kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
+	set_cpus_allowed_ptr(current, cpumask_of_node(wqe->node));
+	current->flags |= PF_NO_SETAFFINITY;
 
 	raw_spin_lock_irq(&wqe->lock);
 	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
@@ -703,8 +554,58 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	if (index == IO_WQ_ACCT_UNBOUND)
 		atomic_inc(&wq->user->processes);
 
+	io_wqe_worker(data);
+	do_exit(0);
+}
+
+static int task_thread_bound(void *data)
+{
+	return task_thread(data, IO_WQ_ACCT_BOUND);
+}
+
+static int task_thread_unbound(void *data)
+{
+	return task_thread(data, IO_WQ_ACCT_UNBOUND);
+}
+
+static pid_t fork_thread(int (*fn)(void *), void *arg)
+{
+	unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+				CLONE_IO|SIGCHLD;
+	struct kernel_clone_args args = {
+		.flags		= ((lower_32_bits(flags) | CLONE_VM |
+				    CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
+		.stack		= (unsigned long)fn,
+		.stack_size	= (unsigned long)arg,
+	};
+
+	return kernel_clone(&args);
+}
+
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+{
+	struct io_worker *worker;
+	pid_t pid;
+
+	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
+	if (!worker)
+		return false;
+
+	refcount_set(&worker->ref, 1);
+	worker->nulls_node.pprev = NULL;
+	worker->wqe = wqe;
+	spin_lock_init(&worker->lock);
+
+	if (index == IO_WQ_ACCT_BOUND)
+		pid = fork_thread(task_thread_bound, worker);
+	else
+		pid = fork_thread(task_thread_unbound, worker);
+	if (pid < 0) {
+		kfree(worker);
+		return false;
+	}
 	refcount_inc(&wq->refs);
-	wake_up_process(worker->task);
 	return true;
 }
 
@@ -756,12 +657,17 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 static int io_wq_manager(void *data)
 {
 	struct io_wq *wq = data;
+	char buf[TASK_COMM_LEN];
 	int node;
 
-	refcount_set(&wq->refs, 1);
+	sprintf(buf, "iou-mgr-%d", wq->task_pid);
+	set_task_comm(current, buf);
+	current->flags |= PF_IO_WORKER;
+	wq->manager = current;
+
 	complete(&wq->done);
 
-	while (!kthread_should_stop()) {
+	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		for_each_node(node) {
 			struct io_wqe *wqe = wq->wqes[node];
 			bool fork_worker[2] = { false, false };
@@ -782,11 +688,13 @@ static int io_wq_manager(void *data)
 		}
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ);
+		if (fatal_signal_pending(current))
+			set_bit(IO_WQ_BIT_EXIT, &wq->state);
 	}
 
 	if (refcount_dec_and_test(&wq->refs)) {
 		complete(&wq->done);
-		return 0;
+		do_exit(0);
 	}
 	/* if ERROR is set and we get here, we have workers to wake */
 	if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
@@ -795,7 +703,7 @@ static int io_wq_manager(void *data)
 			io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
 		rcu_read_unlock();
 	}
-	return 0;
+	do_exit(0);
 }
 
 static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
@@ -919,7 +827,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 	spin_lock_irqsave(&worker->lock, flags);
 	if (worker->cur_work &&
 	    match->fn(worker->cur_work, match->data)) {
-		send_sig(SIGINT, worker->task, 1);
+		set_notify_signal(worker->task);
 		match->nr_running++;
 	}
 	spin_unlock_irqrestore(&worker->lock, flags);
@@ -1075,22 +983,21 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 		INIT_LIST_HEAD(&wqe->all_list);
 	}
 
+	wq->task_pid = current->pid;
 	init_completion(&wq->done);
+	refcount_set(&wq->refs, 1);
 
-	wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
-	if (!IS_ERR(wq->manager)) {
-		wake_up_process(wq->manager);
+	current->flags |= PF_IO_WORKER;
+	ret = fork_thread(io_wq_manager, wq);
+	current->flags &= ~PF_IO_WORKER;
+	if (ret >= 0) {
 		wait_for_completion(&wq->done);
-		if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
-			ret = -ENOMEM;
-			goto err;
-		}
 		reinit_completion(&wq->done);
 		return wq;
 	}
 
-	ret = PTR_ERR(wq->manager);
-	complete(&wq->done);
+	if (refcount_dec_and_test(&wq->refs))
+		complete(&wq->done);
 err:
 	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 	for_each_node(node)
@@ -1110,7 +1017,7 @@ void io_wq_destroy(struct io_wq *wq)
 
 	set_bit(IO_WQ_BIT_EXIT, &wq->state);
 	if (wq->manager)
-		kthread_stop(wq->manager);
+		wake_up_process(wq->manager);
 
 	rcu_read_lock();
 	for_each_node(node)
diff --git a/fs/io-wq.h b/fs/io-wq.h
index d2cf284b4641..83d56adabd16 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -137,6 +137,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk)
 
 static inline bool io_wq_current_is_worker(void)
 {
-	return in_task() && (current->flags & PF_IO_WORKER);
+	return in_task() && (current->flags & PF_IO_WORKER) &&
+		current->pf_io_worker;
 }
 #endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 31402a19fca6..9d22ec9d9406 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1633,6 +1633,9 @@ static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
 	struct io_kiocb *link = io_prep_linked_timeout(req);
 	struct io_uring_task *tctx = req->task->io_uring;
 
+	BUG_ON(!tctx);
+	BUG_ON(!tctx->io_wq);
+
 	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
 					&req->work, req->flags);
 	io_wq_enqueue(tctx->io_wq, &req->work);
@@ -9240,6 +9243,10 @@ static int io_uring_flush(struct file *file, void *data)
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_ring_ctx *ctx = file->private_data;
 
+	/* Ignore helper thread files exit */
+	if (current->flags & PF_IO_WORKER)
+		return 0;
+
 	if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
 		io_uring_cancel_task_requests(ctx, NULL);
 		io_req_caches_free(ctx, current);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26f499810dfa..ef00bb22164c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -895,6 +895,9 @@ struct task_struct {
 	/* CLONE_CHILD_CLEARTID: */
 	int __user			*clear_child_tid;
 
+	/* PF_IO_WORKER */
+	void				*pf_io_worker;
+
 	u64				utime;
 	u64				stime;
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
-- 
cgit v1.2.3


From 4379bf8bd70b5de6bba7d53015b0c36c57a634ee Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 15 Feb 2021 13:40:22 -0700
Subject: io_uring: remove io_identity

We are no longer grabbing state, so no need to maintain an IO identity
that we COW if there are changes.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c               |  26 ++++++++++++
 fs/io-wq.h               |   2 +-
 fs/io_uring.c            | 104 ++++++++++++-----------------------------------
 include/linux/io_uring.h |  19 ---------
 4 files changed, 52 insertions(+), 99 deletions(-)

(limited to 'include/linux')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 41042119bf0f..acc67ed3a52c 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -53,6 +53,9 @@ struct io_worker {
 	struct io_wq_work *cur_work;
 	spinlock_t lock;
 
+	const struct cred *cur_creds;
+	const struct cred *saved_creds;
+
 	struct rcu_head rcu;
 };
 
@@ -171,6 +174,11 @@ static void io_worker_exit(struct io_worker *worker)
 	worker->flags = 0;
 	preempt_enable();
 
+	if (worker->saved_creds) {
+		revert_creds(worker->saved_creds);
+		worker->cur_creds = worker->saved_creds = NULL;
+	}
+
 	raw_spin_lock_irq(&wqe->lock);
 	hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
@@ -312,6 +320,10 @@ static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
 		worker->flags |= IO_WORKER_F_FREE;
 		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 	}
+	if (worker->saved_creds) {
+		revert_creds(worker->saved_creds);
+		worker->cur_creds = worker->saved_creds = NULL;
+	}
 }
 
 static inline unsigned int io_get_work_hash(struct io_wq_work *work)
@@ -359,6 +371,18 @@ static void io_flush_signals(void)
 	}
 }
 
+static void io_wq_switch_creds(struct io_worker *worker,
+			       struct io_wq_work *work)
+{
+	const struct cred *old_creds = override_creds(work->creds);
+
+	worker->cur_creds = work->creds;
+	if (worker->saved_creds)
+		put_cred(old_creds); /* creds set by previous switch */
+	else
+		worker->saved_creds = old_creds;
+}
+
 static void io_assign_current_work(struct io_worker *worker,
 				   struct io_wq_work *work)
 {
@@ -407,6 +431,8 @@ get_next:
 			unsigned int hash = io_get_work_hash(work);
 
 			next_hashed = wq_next_work(work);
+			if (work->creds && worker->cur_creds != work->creds)
+				io_wq_switch_creds(worker, work);
 			wq->do_work(work);
 			io_assign_current_work(worker, NULL);
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index bbe05dd54716..584f0bd5a83d 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -78,7 +78,7 @@ static inline void wq_list_del(struct io_wq_work_list *list,
 
 struct io_wq_work {
 	struct io_wq_work_node list;
-	struct io_identity *identity;
+	const struct cred *creds;
 	unsigned flags;
 };
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6e88295758b5..6d851033e48d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1094,7 +1094,7 @@ static bool io_match_task(struct io_kiocb *head,
 			continue;
 		if (req->file && req->file->f_op == &io_uring_fops)
 			return true;
-		if (req->work.identity->files == files)
+		if (req->task->files == files)
 			return true;
 	}
 	return false;
@@ -1218,31 +1218,6 @@ static inline void req_set_fail_links(struct io_kiocb *req)
 		req->flags |= REQ_F_FAIL_LINK;
 }
 
-/*
- * None of these are dereferenced, they are simply used to check if any of
- * them have changed. If we're under current and check they are still the
- * same, we're fine to grab references to them for actual out-of-line use.
- */
-static void io_init_identity(struct io_identity *id)
-{
-	id->files = current->files;
-	id->mm = current->mm;
-#ifdef CONFIG_BLK_CGROUP
-	rcu_read_lock();
-	id->blkcg_css = blkcg_css();
-	rcu_read_unlock();
-#endif
-	id->creds = current_cred();
-	id->nsproxy = current->nsproxy;
-	id->fs = current->fs;
-	id->fsize = rlimit(RLIMIT_FSIZE);
-#ifdef CONFIG_AUDIT
-	id->loginuid = current->loginuid;
-	id->sessionid = current->sessionid;
-#endif
-	refcount_set(&id->count, 1);
-}
-
 static inline void __io_req_init_async(struct io_kiocb *req)
 {
 	memset(&req->work, 0, sizeof(req->work));
@@ -1255,17 +1230,10 @@ static inline void __io_req_init_async(struct io_kiocb *req)
  */
 static inline void io_req_init_async(struct io_kiocb *req)
 {
-	struct io_uring_task *tctx = current->io_uring;
-
 	if (req->flags & REQ_F_WORK_INITIALIZED)
 		return;
 
 	__io_req_init_async(req);
-
-	/* Grab a ref if this isn't our static identity */
-	req->work.identity = tctx->identity;
-	if (tctx->identity != &tctx->__identity)
-		refcount_inc(&req->work.identity->count);
 }
 
 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -1350,19 +1318,15 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
 	return false;
 }
 
-static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
-{
-	if (req->work.identity == &tctx->__identity)
-		return;
-	if (refcount_dec_and_test(&req->work.identity->count))
-		kfree(req->work.identity);
-}
-
 static void io_req_clean_work(struct io_kiocb *req)
 {
 	if (!(req->flags & REQ_F_WORK_INITIALIZED))
 		return;
 
+	if (req->work.creds) {
+		put_cred(req->work.creds);
+		req->work.creds = NULL;
+	}
 	if (req->flags & REQ_F_INFLIGHT) {
 		struct io_ring_ctx *ctx = req->ctx;
 		struct io_uring_task *tctx = req->task->io_uring;
@@ -1377,7 +1341,6 @@ static void io_req_clean_work(struct io_kiocb *req)
 	}
 
 	req->flags &= ~REQ_F_WORK_INITIALIZED;
-	io_put_identity(req->task->io_uring, req);
 }
 
 static void io_req_track_inflight(struct io_kiocb *req)
@@ -1411,6 +1374,8 @@ static void io_prep_async_work(struct io_kiocb *req)
 		if (def->unbound_nonreg_file)
 			req->work.flags |= IO_WQ_WORK_UNBOUND;
 	}
+	if (!req->work.creds)
+		req->work.creds = get_current_cred();
 }
 
 static void io_prep_async_link(struct io_kiocb *req)
@@ -6376,9 +6341,9 @@ static void __io_queue_sqe(struct io_kiocb *req)
 	const struct cred *old_creds = NULL;
 	int ret;
 
-	if ((req->flags & REQ_F_WORK_INITIALIZED) &&
-	    req->work.identity->creds != current_cred())
-		old_creds = override_creds(req->work.identity->creds);
+	if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+	    req->work.creds != current_cred())
+		old_creds = override_creds(req->work.creds);
 
 	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
 
@@ -6508,16 +6473,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		struct io_identity *iod;
-
-		iod = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!iod))
-			return -EINVAL;
-		refcount_inc(&iod->count);
-
 		__io_req_init_async(req);
-		get_cred(iod->creds);
-		req->work.identity = iod;
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds))
+			return -EINVAL;
+		get_cred(req->work.creds);
 	}
 
 	state = &ctx->submit_state;
@@ -7936,8 +7896,6 @@ static int io_uring_alloc_task_context(struct task_struct *task,
 	tctx->last = NULL;
 	atomic_set(&tctx->in_idle, 0);
 	tctx->sqpoll = false;
-	io_init_identity(&tctx->__identity);
-	tctx->identity = &tctx->__identity;
 	task->io_uring = tctx;
 	spin_lock_init(&tctx->task_lock);
 	INIT_WQ_LIST(&tctx->task_list);
@@ -7951,9 +7909,6 @@ void __io_uring_free(struct task_struct *tsk)
 	struct io_uring_task *tctx = tsk->io_uring;
 
 	WARN_ON_ONCE(!xa_empty(&tctx->xa));
-	WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
-	if (tctx->identity != &tctx->__identity)
-		kfree(tctx->identity);
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
 	tsk->io_uring = NULL;
@@ -8593,13 +8548,11 @@ static int io_uring_fasync(int fd, struct file *file, int on)
 
 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 {
-	struct io_identity *iod;
+	const struct cred *creds;
 
-	iod = idr_remove(&ctx->personality_idr, id);
-	if (iod) {
-		put_cred(iod->creds);
-		if (refcount_dec_and_test(&iod->count))
-			kfree(iod);
+	creds = idr_remove(&ctx->personality_idr, id);
+	if (creds) {
+		put_cred(creds);
 		return 0;
 	}
 
@@ -9300,8 +9253,7 @@ out_fput:
 #ifdef CONFIG_PROC_FS
 static int io_uring_show_cred(int id, void *p, void *data)
 {
-	struct io_identity *iod = p;
-	const struct cred *cred = iod->creds;
+	const struct cred *cred = p;
 	struct seq_file *m = data;
 	struct user_namespace *uns = seq_user_ns(m);
 	struct group_info *gi;
@@ -9732,21 +9684,15 @@ out:
 
 static int io_register_personality(struct io_ring_ctx *ctx)
 {
-	struct io_identity *id;
+	const struct cred *creds;
 	int ret;
 
-	id = kmalloc(sizeof(*id), GFP_KERNEL);
-	if (unlikely(!id))
-		return -ENOMEM;
-
-	io_init_identity(id);
-	id->creds = get_current_cred();
+	creds = get_current_cred();
 
-	ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
-	if (ret < 0) {
-		put_cred(id->creds);
-		kfree(id);
-	}
+	ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
+				USHRT_MAX, GFP_KERNEL);
+	if (ret < 0)
+		put_cred(creds);
 	return ret;
 }
 
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 0e95398998b6..c48fcbdc2ea8 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,23 +5,6 @@
 #include <linux/sched.h>
 #include <linux/xarray.h>
 
-struct io_identity {
-	struct files_struct		*files;
-	struct mm_struct		*mm;
-#ifdef CONFIG_BLK_CGROUP
-	struct cgroup_subsys_state	*blkcg_css;
-#endif
-	const struct cred		*creds;
-	struct nsproxy			*nsproxy;
-	struct fs_struct		*fs;
-	unsigned long			fsize;
-#ifdef CONFIG_AUDIT
-	kuid_t				loginuid;
-	unsigned int			sessionid;
-#endif
-	refcount_t			count;
-};
-
 struct io_wq_work_node {
 	struct io_wq_work_node *next;
 };
@@ -38,8 +21,6 @@ struct io_uring_task {
 	struct file		*last;
 	void			*io_wq;
 	struct percpu_counter	inflight;
-	struct io_identity	__identity;
-	struct io_identity	*identity;
 	atomic_t		in_idle;
 	bool			sqpoll;
 
-- 
cgit v1.2.3


From 2596b6ae412be3d29632efc63976a2132032e620 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 19 Feb 2021 14:51:42 -0500
Subject: kexec: move machine_kexec_post_load() to public interface

The kernel test robot reports the following compiler warning:

  | arch/arm64/kernel/machine_kexec.c:62:5: warning: no previous prototype for
  | function 'machine_kexec_post_load' [-Wmissing-prototypes]
  |    int machine_kexec_post_load(struct kimage *kimage)

Fix it by moving the declaration of machine_kexec_post_load() from
kexec_internal.h to the public header instead.

Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/linux-arm-kernel/202102030727.gqTokACH-lkp@intel.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Link: https://lore.kernel.org/r/20210219195142.13571-1-pasha.tatashin@soleen.com
Fixes: 4c3c31230c91 ("arm64: kexec: move relocation function setup")
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/kexec.h   | 2 ++
 kernel/kexec_internal.h | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 9e93bef52968..3671b845cf28 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -309,6 +309,8 @@ extern void machine_kexec_cleanup(struct kimage *image);
 extern int kernel_kexec(void);
 extern struct page *kimage_alloc_control_pages(struct kimage *image,
 						unsigned int order);
+int machine_kexec_post_load(struct kimage *image);
+
 extern void __crash_kexec(struct pt_regs *);
 extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 39d30ccf8d87..48aaf2ac0d0d 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -13,8 +13,6 @@ void kimage_terminate(struct kimage *image);
 int kimage_is_destination_range(struct kimage *image,
 				unsigned long start, unsigned long end);
 
-int machine_kexec_post_load(struct kimage *image);
-
 extern struct mutex kexec_mutex;
 
 #ifdef CONFIG_KEXEC_FILE
-- 
cgit v1.2.3


From 9fb407179c6fd910005040bebb040094ef959b6c Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Sun, 21 Feb 2021 18:28:05 -0800
Subject: block: Remove unused blk_pm_*() function definitions

Commit a1ce35fa4985 ("block: remove dead elevator code") removed the last
callers of blk_pm_requeue_request(), blk_pm_add_request() and
blk_pm_put_request(). Hence remove the definitions of these functions.
Removing these functions removes all users of the struct request nr_pending
member. Hence also remove 'nr_pending'. Note: 'nr_pending' is no longer
used since commit 7cedffec8e75 ("block: Make blk_get_request() block for
non-PM requests while suspended").

Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-pm.h         | 38 --------------------------------------
 include/linux/blkdev.h |  1 -
 2 files changed, 39 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-pm.h b/block/blk-pm.h
index a2283cc9f716..8a5a0d4b357f 100644
--- a/block/blk-pm.h
+++ b/block/blk-pm.h
@@ -21,31 +21,6 @@ static inline void blk_pm_mark_last_busy(struct request *rq)
 	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
 		pm_runtime_mark_last_busy(rq->q->dev);
 }
-
-static inline void blk_pm_requeue_request(struct request *rq)
-{
-	lockdep_assert_held(&rq->q->queue_lock);
-
-	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
-		rq->q->nr_pending--;
-}
-
-static inline void blk_pm_add_request(struct request_queue *q,
-				      struct request *rq)
-{
-	lockdep_assert_held(&q->queue_lock);
-
-	if (q->dev && !(rq->rq_flags & RQF_PM))
-		q->nr_pending++;
-}
-
-static inline void blk_pm_put_request(struct request *rq)
-{
-	lockdep_assert_held(&rq->q->queue_lock);
-
-	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
-		--rq->q->nr_pending;
-}
 #else
 static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
 {
@@ -55,19 +30,6 @@ static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
 static inline void blk_pm_mark_last_busy(struct request *rq)
 {
 }
-
-static inline void blk_pm_requeue_request(struct request *rq)
-{
-}
-
-static inline void blk_pm_add_request(struct request_queue *q,
-				      struct request *rq)
-{
-}
-
-static inline void blk_pm_put_request(struct request *rq)
-{
-}
 #endif
 
 #endif /* _BLOCK_BLK_PM_H_ */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9149f4a5adb3..f2e0697258b8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -462,7 +462,6 @@ struct request_queue {
 #ifdef CONFIG_PM
 	struct device		*dev;
 	enum rpm_status		rpm_status;
-	unsigned int		nr_pending;
 #endif
 
 	/*
-- 
cgit v1.2.3


From 179d1600723670dc0d6ae8ce572e0e2c44b64763 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Sun, 21 Feb 2021 21:29:55 -0800
Subject: block: remove superfluous param in blk_fill_rwbs()

The last parameter for the function blk_fill_rwbs() was added in
5782138e47 ("tracing/events: convert block trace points to
TRACE_EVENT()") in order to signal read request and use of that parameter
was replaced with using switch case REQ_OP_READ with
1b9a9ab78b0 ("blktrace: use op accessors"), but the parameter was never
removed.

Remove the unused parameter and adjust the respective call sites.

Fixes: 1b9a9ab78b0 ("blktrace: use op accessors")
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blktrace_api.h  |  2 +-
 include/trace/events/bcache.h | 10 +++++-----
 include/trace/events/block.h  | 16 ++++++++--------
 kernel/trace/blktrace.c       |  2 +-
 4 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 05556573b896..11484f1d19a1 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -119,7 +119,7 @@ struct compat_blk_user_trace_setup {
 
 #endif
 
-extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes);
+extern void blk_fill_rwbs(char *rwbs, unsigned int op);
 
 static inline sector_t blk_rq_trace_sector(struct request *rq)
 {
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index e41c611d6d3b..899fdacf57b9 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(bcache_request,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->orig_sector	= bio->bi_iter.bi_sector - 16;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 	),
 
 	TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)",
@@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(bcache_bio,
 		__entry->dev		= bio_dev(bio);
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 	),
 
 	TP_printk("%d,%d  %s %llu + %u",
@@ -137,7 +137,7 @@ TRACE_EVENT(bcache_read,
 		__entry->dev		= bio_dev(bio);
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 		__entry->cache_hit = hit;
 		__entry->bypass = bypass;
 	),
@@ -168,7 +168,7 @@ TRACE_EVENT(bcache_write,
 		__entry->inode		= inode;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 		__entry->writeback = writeback;
 		__entry->bypass = bypass;
 	),
@@ -238,7 +238,7 @@ TRACE_EVENT(bcache_journal_write,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
 		__entry->nr_keys	= keys;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 	),
 
 	TP_printk("%d,%d  %s %llu + %u keys %u",
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 0d782663a005..879cba8bdfca 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -89,7 +89,7 @@ TRACE_EVENT(block_rq_requeue,
 		__entry->sector    = blk_rq_trace_sector(rq);
 		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
 
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 		__get_str(cmd)[0] = '\0';
 	),
 
@@ -133,7 +133,7 @@ TRACE_EVENT(block_rq_complete,
 		__entry->nr_sector = nr_bytes >> 9;
 		__entry->error     = error;
 
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 		__get_str(cmd)[0] = '\0';
 	),
 
@@ -166,7 +166,7 @@ DECLARE_EVENT_CLASS(block_rq,
 		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
 		__entry->bytes     = blk_rq_bytes(rq);
 
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 		__get_str(cmd)[0] = '\0';
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
@@ -249,7 +249,7 @@ TRACE_EVENT(block_bio_complete,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
 		__entry->error		= blk_status_to_errno(bio->bi_status);
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 	),
 
 	TP_printk("%d,%d %s %llu + %u [%d]",
@@ -276,7 +276,7 @@ DECLARE_EVENT_CLASS(block_bio,
 		__entry->dev		= bio_dev(bio);
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -433,7 +433,7 @@ TRACE_EVENT(block_split,
 		__entry->dev		= bio_dev(bio);
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->new_sector	= new_sector;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -474,7 +474,7 @@ TRACE_EVENT(block_bio_remap,
 		__entry->nr_sector	= bio_sectors(bio);
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
-		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
 	),
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
@@ -518,7 +518,7 @@ TRACE_EVENT(block_rq_remap,
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
 		__entry->nr_bios	= blk_rq_count_bios(rq);
-		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
 	),
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e9ee4945043..8a2591c7aa41 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1867,7 +1867,7 @@ void blk_trace_remove_sysfs(struct device *dev)
 
 #ifdef CONFIG_EVENT_TRACING
 
-void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
+void blk_fill_rwbs(char *rwbs, unsigned int op)
 {
 	int i = 0;
 
-- 
cgit v1.2.3


From 1f83bb4b491472310ae7aeca505ed3725149906c Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Sun, 21 Feb 2021 21:29:56 -0800
Subject: blktrace: add blk_fill_rwbs documentation comment

blk_fill_rwbs() is an expoted function, add kernel style documentation
comment.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blktrace_api.h |  2 +-
 kernel/trace/blktrace.c      | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 11484f1d19a1..e17d04abf6a3 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -119,7 +119,7 @@ struct compat_blk_user_trace_setup {
 
 #endif
 
-extern void blk_fill_rwbs(char *rwbs, unsigned int op);
+void blk_fill_rwbs(char *rwbs, unsigned int op);
 
 static inline sector_t blk_rq_trace_sector(struct request *rq)
 {
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8a2591c7aa41..d7ebef83771c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1867,6 +1867,16 @@ void blk_trace_remove_sysfs(struct device *dev)
 
 #ifdef CONFIG_EVENT_TRACING
 
+/**
+ * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
+ * @rwbs	buffer to be filled
+ * @op:		REQ_OP_XXX for the tracepoint
+ *
+ * Description:
+ *     Maps the REQ_OP_XXX to character and fills the buffer provided by the
+ *     caller with resulting string.
+ *
+ **/
 void blk_fill_rwbs(char *rwbs, unsigned int op)
 {
 	int i = 0;
-- 
cgit v1.2.3


From 4a42d848db9544e3108875390886dc490d9c101e Mon Sep 17 00:00:00 2001
From: David Stevens <stevensd@chromium.org>
Date: Mon, 22 Feb 2021 11:45:22 +0900
Subject: KVM: x86/mmu: Consider the hva in mmu_notifier retry

Track the range being invalidated by mmu_notifier and skip page fault
retries if the fault address is not affected by the in-progress
invalidation. Handle concurrent invalidations by finding the minimal
range which includes all ranges being invalidated. Although the combined
range may include unrelated addresses and cannot be shrunk as individual
invalidation operations complete, it is unlikely the marginal gains of
proper range tracking are worth the additional complexity.

The primary benefit of this change is the reduction in the likelihood of
extreme latency when handing a page fault due to another thread having
been preempted while modifying host virtual addresses.

Signed-off-by: David Stevens <stevensd@chromium.org>
Message-Id: <20210222024522.1751719-3-stevensd@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c    |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c |  2 +-
 arch/x86/kvm/mmu/mmu.c                 | 23 +++++++++++++++++------
 arch/x86/kvm/mmu/paging_tmpl.h         | 14 +++++++++++---
 include/linux/kvm_host.h               | 25 ++++++++++++++++++++++++-
 virt/kvm/kvm_main.c                    | 29 +++++++++++++++++++++++++----
 6 files changed, 79 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 38ea396a23d6..8e06cd3f759c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
 	} else {
 		/* Call KVM generic code to do the slow-path check */
 		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-					   writing, &write_ok);
+					   writing, &write_ok, NULL);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
 		page = NULL;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index bb35490400e9..e603de7ade52 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 
 		/* Call KVM generic code to do the slow-path check */
 		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-					   writing, upgrade_p);
+					   writing, upgrade_p, NULL);
 		if (is_error_noslot_pfn(pfn))
 			return -EFAULT;
 		page = NULL;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 752b4b7ab01b..d75524bc8423 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2734,6 +2734,13 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	if (sp->role.level > PG_LEVEL_4K)
 		return;
 
+	/*
+	 * If addresses are being invalidated, skip prefetching to avoid
+	 * accidentally prefetching those addresses.
+	 */
+	if (unlikely(vcpu->kvm->mmu_notifier_count))
+		return;
+
 	__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
@@ -3640,8 +3647,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
-			 bool *writable)
+			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
+			 bool write, bool *writable)
 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 	bool async;
@@ -3654,7 +3661,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 	}
 
 	async = false;
-	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
+				    write, writable, hva);
 	if (!async)
 		return false; /* *pfn has correct page already */
 
@@ -3668,7 +3676,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			return true;
 	}
 
-	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
+				    write, writable, hva);
 	return false;
 }
 
@@ -3681,6 +3690,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	kvm_pfn_t pfn;
+	hva_t hva;
 	int r;
 
 	if (page_fault_handle_page_track(vcpu, error_code, gfn))
@@ -3699,7 +3709,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
+			 write, &map_writable))
 		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
@@ -3712,7 +3723,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	else
 		write_lock(&vcpu->kvm->mmu_lock);
 
-	if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
 		goto out_unlock;
 	r = make_mmu_pages_available(vcpu);
 	if (r)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5844d3979bb8..55d7b473ac44 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -601,6 +601,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 	if (sp->role.level > PG_LEVEL_4K)
 		return;
 
+	/*
+	 * If addresses are being invalidated, skip prefetching to avoid
+	 * accidentally prefetching those addresses.
+	 */
+	if (unlikely(vcpu->kvm->mmu_notifier_count))
+		return;
+
 	if (sp->role.direct)
 		return __direct_pte_prefetch(vcpu, sp, sptep);
 
@@ -790,6 +797,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	struct guest_walker walker;
 	int r;
 	kvm_pfn_t pfn;
+	hva_t hva;
 	unsigned long mmu_seq;
 	bool map_writable, is_self_change_mapping;
 	int max_level;
@@ -840,8 +848,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
-			 &map_writable))
+	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+			 write_fault, &map_writable))
 		return RET_PF_RETRY;
 
 	if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
@@ -869,7 +877,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 
 	r = RET_PF_RETRY;
 	write_lock(&vcpu->kvm->mmu_lock);
-	if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq))
+	if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e126ebda36d0..1b65e7204344 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -11,6 +11,7 @@
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/bug.h>
+#include <linux/minmax.h>
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/preempt.h>
@@ -506,6 +507,8 @@ struct kvm {
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
+	unsigned long mmu_notifier_range_start;
+	unsigned long mmu_notifier_range_end;
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
@@ -733,7 +736,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
-			       bool *writable);
+			       bool *writable, hva_t *hva);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn);
 void kvm_release_pfn_dirty(kvm_pfn_t pfn);
@@ -1207,6 +1210,26 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 		return 1;
 	return 0;
 }
+
+static inline int mmu_notifier_retry_hva(struct kvm *kvm,
+					 unsigned long mmu_seq,
+					 unsigned long hva)
+{
+	lockdep_assert_held(&kvm->mmu_lock);
+	/*
+	 * If mmu_notifier_count is non-zero, then the range maintained by
+	 * kvm_mmu_notifier_invalidate_range_start contains all addresses that
+	 * might be being invalidated. Note that it may include some false
+	 * positives, due to shortcuts when handing concurrent invalidations.
+	 */
+	if (unlikely(kvm->mmu_notifier_count) &&
+	    hva >= kvm->mmu_notifier_range_start &&
+	    hva < kvm->mmu_notifier_range_end)
+		return 1;
+	if (kvm->mmu_notifier_seq != mmu_seq)
+		return 1;
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 001b9de4e727..383df23514b9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -486,6 +486,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * count is also read inside the mmu_lock critical section.
 	 */
 	kvm->mmu_notifier_count++;
+	if (likely(kvm->mmu_notifier_count == 1)) {
+		kvm->mmu_notifier_range_start = range->start;
+		kvm->mmu_notifier_range_end = range->end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has dimishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_notifier_range_start =
+			min(kvm->mmu_notifier_range_start, range->start);
+		kvm->mmu_notifier_range_end =
+			max(kvm->mmu_notifier_range_end, range->end);
+	}
 	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
 					     range->flags);
 	/* we've to flush the tlb before the pages can be freed */
@@ -2023,10 +2041,13 @@ exit:
 
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
-			       bool *writable)
+			       bool *writable, hva_t *hva)
 {
 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
+	if (hva)
+		*hva = addr;
+
 	if (addr == KVM_HVA_ERR_RO_BAD) {
 		if (writable)
 			*writable = false;
@@ -2054,19 +2075,19 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable)
 {
 	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
-				    write_fault, writable);
+				    write_fault, writable, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 
-- 
cgit v1.2.3


From fd70a406a344e084ac680c3f14e71d37d6023883 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 5 Jan 2021 12:31:59 +0200
Subject: vdpa: Extend routine to accept vdpa device name

In a subsequent patch, when user initiated command creates a vdpa device,
the user chooses the name of the vdpa device.
To support it, extend the device allocation API to consider this name
specified by the caller driver.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210105103203.82508-3-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/ifcvf/ifcvf_main.c   |  2 +-
 drivers/vdpa/mlx5/net/mlx5_vnet.c |  2 +-
 drivers/vdpa/vdpa.c               | 36 ++++++++++++++++++++++++++++++++----
 drivers/vdpa/vdpa_sim/vdpa_sim.c  |  2 +-
 include/linux/vdpa.h              |  7 +++----
 5 files changed, 38 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index fa1af301cf55..7c8bbfcf6c3e 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -432,7 +432,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
 				    dev, &ifc_vdpa_ops,
-				    IFCVF_MAX_QUEUE_PAIRS * 2);
+				    IFCVF_MAX_QUEUE_PAIRS * 2, NULL);
 	if (adapter == NULL) {
 		IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
 		return -ENOMEM;
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b5fe6d2ad22f..dc88559a8d49 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1982,7 +1982,7 @@ static int mlx5v_probe(struct auxiliary_device *adev,
 	max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
 
 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
-				 2 * mlx5_vdpa_max_qps(max_vqs));
+				 2 * mlx5_vdpa_max_qps(max_vqs), NULL);
 	if (IS_ERR(ndev))
 		return PTR_ERR(ndev);
 
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index c0825650c055..7414bbd9057c 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -12,6 +12,8 @@
 #include <linux/slab.h>
 #include <linux/vdpa.h>
 
+/* A global mutex that protects vdpa management device and device level operations. */
+static DEFINE_MUTEX(vdpa_dev_mutex);
 static DEFINE_IDA(vdpa_index_ida);
 
 static int vdpa_dev_probe(struct device *d)
@@ -63,6 +65,7 @@ static void vdpa_release_dev(struct device *d)
  * @config: the bus operations that is supported by this device
  * @nvqs: number of virtqueues supported by this device
  * @size: size of the parent structure that contains private data
+ * @name: name of the vdpa device; optional.
  *
  * Driver should use vdpa_alloc_device() wrapper macro instead of
  * using this directly.
@@ -72,8 +75,7 @@ static void vdpa_release_dev(struct device *d)
  */
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 					const struct vdpa_config_ops *config,
-					int nvqs,
-					size_t size)
+					int nvqs, size_t size, const char *name)
 {
 	struct vdpa_device *vdev;
 	int err = -EINVAL;
@@ -101,7 +103,10 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 	vdev->features_valid = false;
 	vdev->nvqs = nvqs;
 
-	err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index);
+	if (name)
+		err = dev_set_name(&vdev->dev, "%s", name);
+	else
+		err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index);
 	if (err)
 		goto err_name;
 
@@ -118,6 +123,13 @@ err:
 }
 EXPORT_SYMBOL_GPL(__vdpa_alloc_device);
 
+static int vdpa_name_match(struct device *dev, const void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+
+	return (strcmp(dev_name(&vdev->dev), data) == 0);
+}
+
 /**
  * vdpa_register_device - register a vDPA device
  * Callers must have a succeed call of vdpa_alloc_device() before.
@@ -127,7 +139,21 @@ EXPORT_SYMBOL_GPL(__vdpa_alloc_device);
  */
 int vdpa_register_device(struct vdpa_device *vdev)
 {
-	return device_add(&vdev->dev);
+	struct device *dev;
+	int err;
+
+	mutex_lock(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match);
+	if (dev) {
+		put_device(dev);
+		err = -EEXIST;
+		goto name_err;
+	}
+
+	err = device_add(&vdev->dev);
+name_err:
+	mutex_unlock(&vdpa_dev_mutex);
+	return err;
 }
 EXPORT_SYMBOL_GPL(vdpa_register_device);
 
@@ -137,7 +163,9 @@ EXPORT_SYMBOL_GPL(vdpa_register_device);
  */
 void vdpa_unregister_device(struct vdpa_device *vdev)
 {
+	mutex_lock(&vdpa_dev_mutex);
 	device_unregister(&vdev->dev);
+	mutex_unlock(&vdpa_dev_mutex);
 }
 EXPORT_SYMBOL_GPL(vdpa_unregister_device);
 
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index b3fcc67bfdf0..db1636a99ba4 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -235,7 +235,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
 		ops = &vdpasim_config_ops;
 
 	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
-				    dev_attr->nvqs);
+				    dev_attr->nvqs, NULL);
 	if (!vdpasim)
 		goto err_alloc;
 
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 0fefeb976877..5700baa22356 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -245,15 +245,14 @@ struct vdpa_config_ops {
 
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 					const struct vdpa_config_ops *config,
-					int nvqs,
-					size_t size);
+					int nvqs, size_t size, const char *name);
 
-#define vdpa_alloc_device(dev_struct, member, parent, config, nvqs)   \
+#define vdpa_alloc_device(dev_struct, member, parent, config, nvqs, name)   \
 			  container_of(__vdpa_alloc_device( \
 				       parent, config, nvqs, \
 				       sizeof(dev_struct) + \
 				       BUILD_BUG_ON_ZERO(offsetof( \
-				       dev_struct, member))), \
+				       dev_struct, member)), name), \
 				       dev_struct, member)
 
 int vdpa_register_device(struct vdpa_device *vdev);
-- 
cgit v1.2.3


From 33b347503f014ebf76257327cbc7001c6b721956 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 5 Jan 2021 12:32:00 +0200
Subject: vdpa: Define vdpa mgmt device, ops and a netlink interface

To add one or more VDPA devices, define a management device which
allows adding or removing vdpa device. A management device defines
set of callbacks to manage vdpa devices.

To begin with, it defines add and remove callbacks through which a user
defined vdpa device can be added or removed.

A unique management device is identified by its unique handle identified
by management device name and optionally the bus name.

Hence, introduce routine through which driver can register a
management device and its callback operations for adding and remove
a vdpa device.

Introduce vdpa netlink socket family so that user can query management
device and its attributes.

Example of show vdpa management device which allows creating vdpa device of
networking class (device id = 0x1) of virtio specification 1.1
section 5.1.1.

$ vdpa mgmtdev show
vdpasim_net:
  supported_classes:
    net

Example of showing vdpa management device in JSON format.

$ vdpa mgmtdev show -jp
{
    "show": {
        "vdpasim_net": {
            "supported_classes": [ "net" ]
        }
    }
}

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210105103203.82508-4-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

Including a bugfix:

vpda: correctly size vdpa_nl_policy

We need to ensure last entry of vdpa_nl_policy[]
is zero, otherwise out-of-bounds access is hurting us.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Eli Cohen <elic@nvidia.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20210210134911.4119555-1-eric.dumazet@gmail.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/Kconfig      |   1 +
 drivers/vdpa/vdpa.c       | 213 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/vdpa.h      |  31 +++++++
 include/uapi/linux/vdpa.h |  31 +++++++
 4 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/vdpa.h

(limited to 'include/linux')

diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index 92a6396f8a73..ffd1e098bfd2 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 menuconfig VDPA
 	tristate "vDPA drivers"
+	depends on NET
 	help
 	  Enable this module to support vDPA device that uses a
 	  datapath which complies with virtio specifications with
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 7414bbd9057c..586e66b7f671 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -11,11 +11,17 @@
 #include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/vdpa.h>
+#include <uapi/linux/vdpa.h>
+#include <net/genetlink.h>
+#include <linux/mod_devicetable.h>
 
+static LIST_HEAD(mdev_head);
 /* A global mutex that protects vdpa management device and device level operations. */
 static DEFINE_MUTEX(vdpa_dev_mutex);
 static DEFINE_IDA(vdpa_index_ida);
 
+static struct genl_family vdpa_nl_family;
+
 static int vdpa_dev_probe(struct device *d)
 {
 	struct vdpa_device *vdev = dev_to_vdpa(d);
@@ -195,13 +201,218 @@ void vdpa_unregister_driver(struct vdpa_driver *drv)
 }
 EXPORT_SYMBOL_GPL(vdpa_unregister_driver);
 
+/**
+ * vdpa_mgmtdev_register - register a vdpa management device
+ *
+ * @mdev: Pointer to vdpa management device
+ * vdpa_mgmtdev_register() register a vdpa management device which supports
+ * vdpa device management.
+ */
+int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev)
+{
+	if (!mdev->device || !mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del)
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&mdev->list);
+	mutex_lock(&vdpa_dev_mutex);
+	list_add_tail(&mdev->list, &mdev_head);
+	mutex_unlock(&vdpa_dev_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register);
+
+void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev)
+{
+	mutex_lock(&vdpa_dev_mutex);
+	list_del(&mdev->list);
+	mutex_unlock(&vdpa_dev_mutex);
+}
+EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister);
+
+static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev,
+				 const char *busname, const char *devname)
+{
+	/* Bus name is optional for simulated management device, so ignore the
+	 * device with bus if bus attribute is provided.
+	 */
+	if ((busname && !mdev->device->bus) || (!busname && mdev->device->bus))
+		return false;
+
+	if (!busname && strcmp(dev_name(mdev->device), devname) == 0)
+		return true;
+
+	if (busname && (strcmp(mdev->device->bus->name, busname) == 0) &&
+	    (strcmp(dev_name(mdev->device), devname) == 0))
+		return true;
+
+	return false;
+}
+
+static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs)
+{
+	struct vdpa_mgmt_dev *mdev;
+	const char *busname = NULL;
+	const char *devname;
+
+	if (!attrs[VDPA_ATTR_MGMTDEV_DEV_NAME])
+		return ERR_PTR(-EINVAL);
+	devname = nla_data(attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]);
+	if (attrs[VDPA_ATTR_MGMTDEV_BUS_NAME])
+		busname = nla_data(attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]);
+
+	list_for_each_entry(mdev, &mdev_head, list) {
+		if (mgmtdev_handle_match(mdev, busname, devname))
+			return mdev;
+	}
+	return ERR_PTR(-ENODEV);
+}
+
+static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev)
+{
+	if (mdev->device->bus &&
+	    nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, mdev->device->bus->name))
+		return -EMSGSIZE;
+	if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(mdev->device)))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg,
+			     u32 portid, u32 seq, int flags)
+{
+	u64 supported_classes = 0;
+	void *hdr;
+	int i = 0;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW);
+	if (!hdr)
+		return -EMSGSIZE;
+	err = vdpa_nl_mgmtdev_handle_fill(msg, mdev);
+	if (err)
+		goto msg_err;
+
+	while (mdev->id_table[i].device) {
+		supported_classes |= BIT(mdev->id_table[i].device);
+		i++;
+	}
+
+	if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,
+			      supported_classes, VDPA_ATTR_UNSPEC)) {
+		err = -EMSGSIZE;
+		goto msg_err;
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+msg_err:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	mutex_lock(&vdpa_dev_mutex);
+	mdev = vdpa_mgmtdev_get_from_attr(info->attrs);
+	if (IS_ERR(mdev)) {
+		mutex_unlock(&vdpa_dev_mutex);
+		NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device");
+		err = PTR_ERR(mdev);
+		goto out;
+	}
+
+	err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0);
+	mutex_unlock(&vdpa_dev_mutex);
+	if (err)
+		goto out;
+	err = genlmsg_reply(msg, info);
+	return err;
+
+out:
+	nlmsg_free(msg);
+	return err;
+}
+
+static int
+vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct vdpa_mgmt_dev *mdev;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&vdpa_dev_mutex);
+	list_for_each_entry(mdev, &mdev_head, list) {
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+		err = vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid,
+					cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err)
+			goto out;
+		idx++;
+	}
+out:
+	mutex_unlock(&vdpa_dev_mutex);
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
+	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
+	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
+};
+
+static const struct genl_ops vdpa_nl_ops[] = {
+	{
+		.cmd = VDPA_CMD_MGMTDEV_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_mgmtdev_get_doit,
+		.dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit,
+	},
+};
+
+static struct genl_family vdpa_nl_family __ro_after_init = {
+	.name = VDPA_GENL_NAME,
+	.version = VDPA_GENL_VERSION,
+	.maxattr = VDPA_ATTR_MAX,
+	.policy = vdpa_nl_policy,
+	.netnsok = false,
+	.module = THIS_MODULE,
+	.ops = vdpa_nl_ops,
+	.n_ops = ARRAY_SIZE(vdpa_nl_ops),
+};
+
 static int vdpa_init(void)
 {
-	return bus_register(&vdpa_bus);
+	int err;
+
+	err = bus_register(&vdpa_bus);
+	if (err)
+		return err;
+	err = genl_register_family(&vdpa_nl_family);
+	if (err)
+		goto err;
+	return 0;
+
+err:
+	bus_unregister(&vdpa_bus);
+	return err;
 }
 
 static void __exit vdpa_exit(void)
 {
+	genl_unregister_family(&vdpa_nl_family);
 	bus_unregister(&vdpa_bus);
 	ida_destroy(&vdpa_index_ida);
 }
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 5700baa22356..6b8b4222bca6 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -35,6 +35,8 @@ struct vdpa_vq_state {
 	u16	avail_index;
 };
 
+struct vdpa_mgmt_dev;
+
 /**
  * vDPA device - representation of a vDPA device
  * @dev: underlying device
@@ -335,4 +337,33 @@ static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset,
 	ops->get_config(vdev, offset, buf, len);
 }
 
+/**
+ * vdpa_mgmtdev_ops - vdpa device ops
+ * @dev_add:	Add a vdpa device using alloc and register
+ *		@mdev: parent device to use for device addition
+ *		@name: name of the new vdpa device
+ *		Driver need to add a new device using _vdpa_register_device()
+ *		after fully initializing the vdpa device. Driver must return 0
+ *		on success or appropriate error code.
+ * @dev_del:	Remove a vdpa device using unregister
+ *		@mdev: parent device to use for device removal
+ *		@dev: vdpa device to remove
+ *		Driver need to remove the specified device by calling
+ *		_vdpa_unregister_device().
+ */
+struct vdpa_mgmtdev_ops {
+	int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name);
+	void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev);
+};
+
+struct vdpa_mgmt_dev {
+	struct device *device;
+	const struct vdpa_mgmtdev_ops *ops;
+	const struct virtio_device_id *id_table; /* supported ids */
+	struct list_head list;
+};
+
+int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev);
+void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev);
+
 #endif /* _LINUX_VDPA_H */
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
new file mode 100644
index 000000000000..d44d82e567b1
--- /dev/null
+++ b/include/uapi/linux/vdpa.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * vdpa device management interface
+ * Copyright (c) 2020 Mellanox Technologies Ltd. All rights reserved.
+ */
+
+#ifndef _UAPI_LINUX_VDPA_H_
+#define _UAPI_LINUX_VDPA_H_
+
+#define VDPA_GENL_NAME "vdpa"
+#define VDPA_GENL_VERSION 0x1
+
+enum vdpa_command {
+	VDPA_CMD_UNSPEC,
+	VDPA_CMD_MGMTDEV_NEW,
+	VDPA_CMD_MGMTDEV_GET,		/* can dump */
+};
+
+enum vdpa_attr {
+	VDPA_ATTR_UNSPEC,
+
+	/* bus name (optional) + dev name together make the parent device handle */
+	VDPA_ATTR_MGMTDEV_BUS_NAME,		/* string */
+	VDPA_ATTR_MGMTDEV_DEV_NAME,		/* string */
+	VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,	/* u64 */
+
+	/* new attributes must be added above here */
+	VDPA_ATTR_MAX,
+};
+
+#endif
-- 
cgit v1.2.3


From 903f7bcaedb84ca47998e609015a34ddde93742e Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 5 Jan 2021 12:32:01 +0200
Subject: vdpa: Enable a user to add and delete a vdpa device

Add the ability to add and delete a vdpa device.

Examples:
Create a vdpa device of type network named "foo2" from
the management device vdpasim:

$ vdpa dev add mgmtdev vdpasim_net name foo2

Delete the vdpa device after its use:
$ vdpa dev del foo2

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210105103203.82508-5-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa.c       | 143 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/vdpa.h      |   6 ++
 include/uapi/linux/vdpa.h |   4 ++
 3 files changed, 143 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 586e66b7f671..f1228189814f 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -136,6 +136,37 @@ static int vdpa_name_match(struct device *dev, const void *data)
 	return (strcmp(dev_name(&vdev->dev), data) == 0);
 }
 
+static int __vdpa_register_device(struct vdpa_device *vdev)
+{
+	struct device *dev;
+
+	lockdep_assert_held(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match);
+	if (dev) {
+		put_device(dev);
+		return -EEXIST;
+	}
+	return device_add(&vdev->dev);
+}
+
+/**
+ * _vdpa_register_device - register a vDPA device with vdpa lock held
+ * Caller must have a succeed call of vdpa_alloc_device() before.
+ * Caller must invoke this routine in the management device dev_add()
+ * callback after setting up valid mgmtdev for this vdpa device.
+ * @vdev: the vdpa device to be registered to vDPA bus
+ *
+ * Returns an error when fail to add device to vDPA bus
+ */
+int _vdpa_register_device(struct vdpa_device *vdev)
+{
+	if (!vdev->mdev)
+		return -EINVAL;
+
+	return __vdpa_register_device(vdev);
+}
+EXPORT_SYMBOL_GPL(_vdpa_register_device);
+
 /**
  * vdpa_register_device - register a vDPA device
  * Callers must have a succeed call of vdpa_alloc_device() before.
@@ -145,24 +176,29 @@ static int vdpa_name_match(struct device *dev, const void *data)
  */
 int vdpa_register_device(struct vdpa_device *vdev)
 {
-	struct device *dev;
 	int err;
 
 	mutex_lock(&vdpa_dev_mutex);
-	dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match);
-	if (dev) {
-		put_device(dev);
-		err = -EEXIST;
-		goto name_err;
-	}
-
-	err = device_add(&vdev->dev);
-name_err:
+	err = __vdpa_register_device(vdev);
 	mutex_unlock(&vdpa_dev_mutex);
 	return err;
 }
 EXPORT_SYMBOL_GPL(vdpa_register_device);
 
+/**
+ * _vdpa_unregister_device - unregister a vDPA device
+ * Caller must invoke this routine as part of management device dev_del()
+ * callback.
+ * @vdev: the vdpa device to be unregisted from vDPA bus
+ */
+void _vdpa_unregister_device(struct vdpa_device *vdev)
+{
+	lockdep_assert_held(&vdpa_dev_mutex);
+	WARN_ON(!vdev->mdev);
+	device_unregister(&vdev->dev);
+}
+EXPORT_SYMBOL_GPL(_vdpa_unregister_device);
+
 /**
  * vdpa_unregister_device - unregister a vDPA device
  * @vdev: the vdpa device to be unregisted from vDPA bus
@@ -221,10 +257,25 @@ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev)
 }
 EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register);
 
+static int vdpa_match_remove(struct device *dev, void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+	struct vdpa_mgmt_dev *mdev = vdev->mdev;
+
+	if (mdev == data)
+		mdev->ops->dev_del(mdev, vdev);
+	return 0;
+}
+
 void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev)
 {
 	mutex_lock(&vdpa_dev_mutex);
+
 	list_del(&mdev->list);
+
+	/* Filter out all the entries belong to this management device and delete it. */
+	bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove);
+
 	mutex_unlock(&vdpa_dev_mutex);
 }
 EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister);
@@ -368,9 +419,69 @@ out:
 	return msg->len;
 }
 
+static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	const char *name;
+	int err = 0;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+
+	name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+
+	mutex_lock(&vdpa_dev_mutex);
+	mdev = vdpa_mgmtdev_get_from_attr(info->attrs);
+	if (IS_ERR(mdev)) {
+		NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device");
+		err = PTR_ERR(mdev);
+		goto err;
+	}
+
+	err = mdev->ops->dev_add(mdev, name);
+err:
+	mutex_unlock(&vdpa_dev_mutex);
+	return err;
+}
+
+static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_mgmt_dev *mdev;
+	struct vdpa_device *vdev;
+	struct device *dev;
+	const char *name;
+	int err = 0;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+	name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+
+	mutex_lock(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match);
+	if (!dev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "device not found");
+		err = -ENODEV;
+		goto dev_err;
+	}
+	vdev = container_of(dev, struct vdpa_device, dev);
+	if (!vdev->mdev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user");
+		err = -EINVAL;
+		goto mdev_err;
+	}
+	mdev = vdev->mdev;
+	mdev->ops->dev_del(mdev, vdev);
+mdev_err:
+	put_device(dev);
+dev_err:
+	mutex_unlock(&vdpa_dev_mutex);
+	return err;
+}
+
 static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
+	[VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING },
 };
 
 static const struct genl_ops vdpa_nl_ops[] = {
@@ -380,6 +491,18 @@ static const struct genl_ops vdpa_nl_ops[] = {
 		.doit = vdpa_nl_cmd_mgmtdev_get_doit,
 		.dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit,
 	},
+	{
+		.cmd = VDPA_CMD_DEV_NEW,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_dev_add_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = VDPA_CMD_DEV_DEL,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_dev_del_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 static struct genl_family vdpa_nl_family __ro_after_init = {
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 6b8b4222bca6..4ab5494503a8 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -45,6 +45,8 @@ struct vdpa_mgmt_dev;
  * @index: device index
  * @features_valid: were features initialized? for legacy guests
  * @nvqs: maximum number of supported virtqueues
+ * @mdev: management device pointer; caller must setup when registering device as part
+ *	  of dev_add() mgmtdev ops callback before invoking _vdpa_register_device().
  */
 struct vdpa_device {
 	struct device dev;
@@ -53,6 +55,7 @@ struct vdpa_device {
 	unsigned int index;
 	bool features_valid;
 	int nvqs;
+	struct vdpa_mgmt_dev *mdev;
 };
 
 /**
@@ -260,6 +263,9 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 int vdpa_register_device(struct vdpa_device *vdev);
 void vdpa_unregister_device(struct vdpa_device *vdev);
 
+int _vdpa_register_device(struct vdpa_device *vdev);
+void _vdpa_unregister_device(struct vdpa_device *vdev);
+
 /**
  * vdpa_driver - operations for a vDPA driver
  * @driver: underlying device driver
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
index d44d82e567b1..bb4a1f00eb1c 100644
--- a/include/uapi/linux/vdpa.h
+++ b/include/uapi/linux/vdpa.h
@@ -14,6 +14,8 @@ enum vdpa_command {
 	VDPA_CMD_UNSPEC,
 	VDPA_CMD_MGMTDEV_NEW,
 	VDPA_CMD_MGMTDEV_GET,		/* can dump */
+	VDPA_CMD_DEV_NEW,
+	VDPA_CMD_DEV_DEL,
 };
 
 enum vdpa_attr {
@@ -24,6 +26,8 @@ enum vdpa_attr {
 	VDPA_ATTR_MGMTDEV_DEV_NAME,		/* string */
 	VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES,	/* u64 */
 
+	VDPA_ATTR_DEV_NAME,			/* string */
+
 	/* new attributes must be added above here */
 	VDPA_ATTR_MAX,
 };
-- 
cgit v1.2.3


From fd502729fbbf6a76fdb7acae4506486bfbb7c4f6 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Mon, 4 Jan 2021 14:55:00 +0800
Subject: virtio-pci: introduce modern device module

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210104065503.199631-17-jasowang@redhat.com

Including a bugfix:

virtio: don't prompt CONFIG_VIRTIO_PCI_MODERN

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Anders Roxell <anders.roxell@linaro.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Fixes: 86b87c9d858b6 ("virtio-pci: introduce modern device module")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20210223061905.422659-2-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/Kconfig                 |   9 +
 drivers/virtio/Makefile                |   1 +
 drivers/virtio/virtio_pci_common.h     |  27 +-
 drivers/virtio/virtio_pci_modern.c     | 617 ---------------------------------
 drivers/virtio/virtio_pci_modern_dev.c | 599 ++++++++++++++++++++++++++++++++
 include/linux/virtio_pci_modern.h      | 111 ++++++
 6 files changed, 721 insertions(+), 643 deletions(-)
 create mode 100644 drivers/virtio/virtio_pci_modern_dev.c
 create mode 100644 include/linux/virtio_pci_modern.h

(limited to 'include/linux')

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 7b41130d3f35..ce1b3f6ec325 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -12,6 +12,14 @@ config ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
 	  This option is selected if the architecture may need to enforce
 	  VIRTIO_F_ACCESS_PLATFORM
 
+config VIRTIO_PCI_LIB
+	tristate
+	help
+	  Modern PCI device implementation. This module implements the
+	  basic probe and control for devices which are based on modern
+	  PCI device with possible vendor specific extensions. Any
+	  module that selects this module must depend on PCI.
+
 menuconfig VIRTIO_MENU
 	bool "Virtio drivers"
 	default y
@@ -21,6 +29,7 @@ if VIRTIO_MENU
 config VIRTIO_PCI
 	tristate "PCI driver for virtio devices"
 	depends on PCI
+	select VIRTIO_PCI_LIB
 	select VIRTIO
 	help
 	  This driver provides support for virtio based paravirtual device
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 591e6f72aa54..699bbea0465f 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
+obj-$(CONFIG_VIRTIO_PCI_LIB) += virtio_pci_modern_dev.o
 obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
 obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
 virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index f35ff5b6b467..beec047a8f8d 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -25,6 +25,7 @@
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/virtio_pci.h>
+#include <linux/virtio_pci_modern.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 
@@ -39,32 +40,6 @@ struct virtio_pci_vq_info {
 	unsigned msix_vector;
 };
 
-struct virtio_pci_modern_device {
-	struct pci_dev *pci_dev;
-
-	struct virtio_pci_common_cfg __iomem *common;
-	/* Device-specific data (non-legacy mode)  */
-	void __iomem *device;
-	/* Base of vq notifications (non-legacy mode). */
-	void __iomem *notify_base;
-	/* Where to read and clear interrupt */
-	u8 __iomem *isr;
-
-	/* So we can sanity-check accesses. */
-	size_t notify_len;
-	size_t device_len;
-
-	/* Capability for when we need to map notifications per-vq. */
-	int notify_map_cap;
-
-	/* Multiply queue_notify_off by this value. (non-legacy mode). */
-	u32 notify_offset_multiplier;
-
-	int modern_bars;
-
-	struct virtio_device_id id;
-};
-
 /* Our device structure */
 struct virtio_pci_device {
 	struct virtio_device vdev;
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index a5e3a5e40323..fbd4ebc00eb6 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -19,158 +19,6 @@
 #define VIRTIO_RING_NO_LEGACY
 #include "virtio_pci_common.h"
 
-/*
- * Type-safe wrappers for io accesses.
- * Use these to enforce at compile time the following spec requirement:
- *
- * The driver MUST access each field using the “natural” access
- * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses
- * for 16-bit fields and 8-bit accesses for 8-bit fields.
- */
-static inline u8 vp_ioread8(const u8 __iomem *addr)
-{
-	return ioread8(addr);
-}
-static inline u16 vp_ioread16 (const __le16 __iomem *addr)
-{
-	return ioread16(addr);
-}
-
-static inline u32 vp_ioread32(const __le32 __iomem *addr)
-{
-	return ioread32(addr);
-}
-
-static inline void vp_iowrite8(u8 value, u8 __iomem *addr)
-{
-	iowrite8(value, addr);
-}
-
-static inline void vp_iowrite16(u16 value, __le16 __iomem *addr)
-{
-	iowrite16(value, addr);
-}
-
-static inline void vp_iowrite32(u32 value, __le32 __iomem *addr)
-{
-	iowrite32(value, addr);
-}
-
-static void vp_iowrite64_twopart(u64 val,
-				 __le32 __iomem *lo, __le32 __iomem *hi)
-{
-	vp_iowrite32((u32)val, lo);
-	vp_iowrite32(val >> 32, hi);
-}
-
-/*
- * vp_modern_map_capability - map a part of virtio pci capability
- * @mdev: the modern virtio-pci device
- * @off: offset of the capability
- * @minlen: minimal length of the capability
- * @align: align requirement
- * @start: start from the capability
- * @size: map size
- * @len: the length that is actually mapped
- *
- * Returns the io address of for the part of the capability
- */
-void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
-				       size_t minlen,
-				       u32 align,
-				       u32 start, u32 size,
-				       size_t *len)
-{
-	struct pci_dev *dev = mdev->pci_dev;
-	u8 bar;
-	u32 offset, length;
-	void __iomem *p;
-
-	pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap,
-						 bar),
-			     &bar);
-	pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset),
-			     &offset);
-	pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length),
-			      &length);
-
-	if (length <= start) {
-		dev_err(&dev->dev,
-			"virtio_pci: bad capability len %u (>%u expected)\n",
-			length, start);
-		return NULL;
-	}
-
-	if (length - start < minlen) {
-		dev_err(&dev->dev,
-			"virtio_pci: bad capability len %u (>=%zu expected)\n",
-			length, minlen);
-		return NULL;
-	}
-
-	length -= start;
-
-	if (start + offset < offset) {
-		dev_err(&dev->dev,
-			"virtio_pci: map wrap-around %u+%u\n",
-			start, offset);
-		return NULL;
-	}
-
-	offset += start;
-
-	if (offset & (align - 1)) {
-		dev_err(&dev->dev,
-			"virtio_pci: offset %u not aligned to %u\n",
-			offset, align);
-		return NULL;
-	}
-
-	if (length > size)
-		length = size;
-
-	if (len)
-		*len = length;
-
-	if (minlen + offset < minlen ||
-	    minlen + offset > pci_resource_len(dev, bar)) {
-		dev_err(&dev->dev,
-			"virtio_pci: map virtio %zu@%u "
-			"out of range on bar %i length %lu\n",
-			minlen, offset,
-			bar, (unsigned long)pci_resource_len(dev, bar));
-		return NULL;
-	}
-
-	p = pci_iomap_range(dev, bar, offset, length);
-	if (!p)
-		dev_err(&dev->dev,
-			"virtio_pci: unable to map virtio %u@%u on bar %i\n",
-			length, offset, bar);
-	return p;
-}
-
-/*
- * vp_modern_get_features - get features from device
- * @mdev: the modern virtio-pci device
- *
- * Returns the features read from the device
- */
-static u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	u64 features;
-
-	vp_iowrite32(0, &cfg->device_feature_select);
-	features = vp_ioread32(&cfg->device_feature);
-	vp_iowrite32(1, &cfg->device_feature_select);
-	features |= ((u64)vp_ioread32(&cfg->device_feature) << 32);
-
-	return features;
-}
-
-/* virtio config->get_features() implementation */
 static u64 vp_get_features(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -188,149 +36,6 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features)
 		__virtio_set_bit(vdev, VIRTIO_F_SR_IOV);
 }
 
-/*
- * vp_modern_set_features - set features to device
- * @mdev: the modern virtio-pci device
- * @features: the features set to device
- */
-static void vp_modern_set_features(struct virtio_pci_modern_device *mdev,
-				   u64 features)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	vp_iowrite32(0, &cfg->guest_feature_select);
-	vp_iowrite32((u32)features, &cfg->guest_feature);
-	vp_iowrite32(1, &cfg->guest_feature_select);
-	vp_iowrite32(features >> 32, &cfg->guest_feature);
-}
-
-/*
- * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue
- * @mdev: the modern virtio-pci device
- * @index: queue index
- * @vector: the config vector
- *
- * Returns the config vector read from the device
- */
-static u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev,
-				  u16 index, u16 vector)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	vp_iowrite16(index, &cfg->queue_select);
-	vp_iowrite16(vector, &cfg->queue_msix_vector);
-	/* Flush the write out to device */
-	return vp_ioread16(&cfg->queue_msix_vector);
-}
-
-/*
- * vp_modern_queue_address - set the virtqueue address
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- * @desc_addr: address of the descriptor area
- * @driver_addr: address of the driver area
- * @device_addr: address of the device area
- */
-static void vp_modern_queue_address(struct virtio_pci_modern_device *mdev,
-				    u16 index, u64 desc_addr, u64 driver_addr,
-				    u64 device_addr)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	vp_iowrite16(index, &cfg->queue_select);
-
-	vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo,
-			     &cfg->queue_desc_hi);
-	vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo,
-			     &cfg->queue_avail_hi);
-	vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo,
-			     &cfg->queue_used_hi);
-}
-
-/*
- * vp_modern_set_queue_enable - enable a virtqueue
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- * @enable: whether the virtqueue is enable or not
- */
-static void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev,
-				       u16 index, bool enable)
-{
-	vp_iowrite16(index, &mdev->common->queue_select);
-	vp_iowrite16(enable, &mdev->common->queue_enable);
-}
-
-/*
- * vp_modern_get_queue_enable - enable a virtqueue
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- *
- * Returns whether a virtqueue is enabled or not
- */
-static bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev,
-				       u16 index)
-{
-	vp_iowrite16(index, &mdev->common->queue_select);
-
-	return vp_ioread16(&mdev->common->queue_enable);
-}
-
-/*
- * vp_modern_set_queue_size - set size for a virtqueue
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- * @size: the size of the virtqueue
- */
-static void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
-				     u16 index, u16 size)
-{
-	vp_iowrite16(index, &mdev->common->queue_select);
-	vp_iowrite16(size, &mdev->common->queue_size);
-
-}
-
-/*
- * vp_modern_get_queue_size - get size for a virtqueue
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- *
- * Returns the size of the virtqueue
- */
-static u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
-				    u16 index)
-{
-	vp_iowrite16(index, &mdev->common->queue_select);
-
-	return vp_ioread16(&mdev->common->queue_size);
-
-}
-
-/*
- * vp_modern_get_num_queues - get the number of virtqueues
- * @mdev: the modern virtio-pci device
- *
- * Returns the number of virtqueues
- */
-static u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev)
-{
-	return vp_ioread16(&mdev->common->num_queues);
-}
-
-/*
- * vp_modern_get_queue_notify_off - get notification offset for a virtqueue
- * @mdev: the modern virtio-pci device
- * @index: the queue index
- *
- * Returns the notification offset for a virtqueue
- */
-static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
-					  u16 index)
-{
-	vp_iowrite16(index, &mdev->common->queue_select);
-
-	return vp_ioread16(&mdev->common->queue_notify_off);
-}
-
 /* virtio config->finalize_features() implementation */
 static int vp_finalize_features(struct virtio_device *vdev)
 {
@@ -429,19 +134,6 @@ static void vp_set(struct virtio_device *vdev, unsigned offset,
 	}
 }
 
-/*
- * vp_modern_generation - get the device genreation
- * @mdev: the modern virtio-pci device
- *
- * Returns the genreation read from device
- */
-static u32 vp_modern_generation(struct virtio_pci_modern_device *mdev)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	return vp_ioread8(&cfg->config_generation);
-}
-
 static u32 vp_generation(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -449,19 +141,6 @@ static u32 vp_generation(struct virtio_device *vdev)
 	return vp_modern_generation(&vp_dev->mdev);
 }
 
-/*
- * vp_modern_get_status - get the device status
- * @mdev: the modern virtio-pci device
- *
- * Returns the status read from device
- */
-static u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	return vp_ioread8(&cfg->device_status);
-}
-
 /* config->{get,set}_status() implementations */
 static u8 vp_get_status(struct virtio_device *vdev)
 {
@@ -470,19 +149,6 @@ static u8 vp_get_status(struct virtio_device *vdev)
 	return vp_modern_get_status(&vp_dev->mdev);
 }
 
-/*
- * vp_modern_set_status - set status to device
- * @mdev: the modern virtio-pci device
- * @status: the status set to device
- */
-static void vp_modern_set_status(struct virtio_pci_modern_device *mdev,
-				 u8 status)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	vp_iowrite8(status, &cfg->device_status);
-}
-
 static void vp_set_status(struct virtio_device *vdev, u8 status)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -510,25 +176,6 @@ static void vp_reset(struct virtio_device *vdev)
 	vp_synchronize_vectors(vdev);
 }
 
-/*
- * vp_modern_config_vector - set the vector for config interrupt
- * @mdev: the modern virtio-pci device
- * @vector: the config vector
- *
- * Returns the config vector read from the device
- */
-static u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev,
-				   u16 vector)
-{
-	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
-
-	/* Setup the vector used for configuration events */
-	vp_iowrite16(vector, &cfg->msix_config);
-	/* Verify we had enough resources to assign the vector */
-	/* Will also flush the write out to device */
-	return vp_ioread16(&cfg->msix_config);
-}
-
 static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
 {
 	return vp_modern_config_vector(&vp_dev->mdev, vector);
@@ -789,253 +436,6 @@ static const struct virtio_config_ops virtio_pci_config_ops = {
 	.get_shm_region  = vp_get_shm_region,
 };
 
-/**
- * virtio_pci_find_capability - walk capabilities to find device info.
- * @dev: the pci device
- * @cfg_type: the VIRTIO_PCI_CAP_* value we seek
- * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO.
- * @bars: the bitmask of BARs
- *
- * Returns offset of the capability, or 0.
- */
-static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
-					     u32 ioresource_types, int *bars)
-{
-	int pos;
-
-	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
-	     pos > 0;
-	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
-		u8 type, bar;
-		pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
-							 cfg_type),
-				     &type);
-		pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
-							 bar),
-				     &bar);
-
-		/* Ignore structures with reserved BAR values */
-		if (bar > 0x5)
-			continue;
-
-		if (type == cfg_type) {
-			if (pci_resource_len(dev, bar) &&
-			    pci_resource_flags(dev, bar) & ioresource_types) {
-				*bars |= (1 << bar);
-				return pos;
-			}
-		}
-	}
-	return 0;
-}
-
-/* This is part of the ABI.  Don't screw with it. */
-static inline void check_offsets(void)
-{
-	/* Note: disk space was harmed in compilation of this function. */
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR !=
-		     offsetof(struct virtio_pci_cap, cap_vndr));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT !=
-		     offsetof(struct virtio_pci_cap, cap_next));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN !=
-		     offsetof(struct virtio_pci_cap, cap_len));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE !=
-		     offsetof(struct virtio_pci_cap, cfg_type));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR !=
-		     offsetof(struct virtio_pci_cap, bar));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET !=
-		     offsetof(struct virtio_pci_cap, offset));
-	BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH !=
-		     offsetof(struct virtio_pci_cap, length));
-	BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT !=
-		     offsetof(struct virtio_pci_notify_cap,
-			      notify_off_multiplier));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT !=
-		     offsetof(struct virtio_pci_common_cfg,
-			      device_feature_select));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF !=
-		     offsetof(struct virtio_pci_common_cfg, device_feature));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT !=
-		     offsetof(struct virtio_pci_common_cfg,
-			      guest_feature_select));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF !=
-		     offsetof(struct virtio_pci_common_cfg, guest_feature));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX !=
-		     offsetof(struct virtio_pci_common_cfg, msix_config));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ !=
-		     offsetof(struct virtio_pci_common_cfg, num_queues));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS !=
-		     offsetof(struct virtio_pci_common_cfg, device_status));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION !=
-		     offsetof(struct virtio_pci_common_cfg, config_generation));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT !=
-		     offsetof(struct virtio_pci_common_cfg, queue_select));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE !=
-		     offsetof(struct virtio_pci_common_cfg, queue_size));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX !=
-		     offsetof(struct virtio_pci_common_cfg, queue_msix_vector));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE !=
-		     offsetof(struct virtio_pci_common_cfg, queue_enable));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF !=
-		     offsetof(struct virtio_pci_common_cfg, queue_notify_off));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO !=
-		     offsetof(struct virtio_pci_common_cfg, queue_desc_lo));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI !=
-		     offsetof(struct virtio_pci_common_cfg, queue_desc_hi));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO !=
-		     offsetof(struct virtio_pci_common_cfg, queue_avail_lo));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI !=
-		     offsetof(struct virtio_pci_common_cfg, queue_avail_hi));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO !=
-		     offsetof(struct virtio_pci_common_cfg, queue_used_lo));
-	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI !=
-		     offsetof(struct virtio_pci_common_cfg, queue_used_hi));
-}
-
-/*
- * vp_modern_probe: probe the modern virtio pci device, note that the
- * caller is required to enable PCI device before calling this function.
- * @mdev: the modern virtio-pci device
- *
- * Return 0 on succeed otherwise fail
- */
-static int vp_modern_probe(struct virtio_pci_modern_device *mdev)
-{
-	struct pci_dev *pci_dev = mdev->pci_dev;
-	int err, common, isr, notify, device;
-	u32 notify_length;
-	u32 notify_offset;
-
-	check_offsets();
-
-	mdev->pci_dev = pci_dev;
-
-	/* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */
-	if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f)
-		return -ENODEV;
-
-	if (pci_dev->device < 0x1040) {
-		/* Transitional devices: use the PCI subsystem device id as
-		 * virtio device id, same as legacy driver always did.
-		 */
-		mdev->id.device = pci_dev->subsystem_device;
-	} else {
-		/* Modern devices: simply use PCI device id, but start from 0x1040. */
-		mdev->id.device = pci_dev->device - 0x1040;
-	}
-	mdev->id.vendor = pci_dev->subsystem_vendor;
-
-	/* check for a common config: if not, use legacy mode (bar 0). */
-	common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG,
-					    IORESOURCE_IO | IORESOURCE_MEM,
-					    &mdev->modern_bars);
-	if (!common) {
-		dev_info(&pci_dev->dev,
-			 "virtio_pci: leaving for legacy driver\n");
-		return -ENODEV;
-	}
-
-	/* If common is there, these should be too... */
-	isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG,
-					 IORESOURCE_IO | IORESOURCE_MEM,
-					 &mdev->modern_bars);
-	notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG,
-					    IORESOURCE_IO | IORESOURCE_MEM,
-					    &mdev->modern_bars);
-	if (!isr || !notify) {
-		dev_err(&pci_dev->dev,
-			"virtio_pci: missing capabilities %i/%i/%i\n",
-			common, isr, notify);
-		return -EINVAL;
-	}
-
-	err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
-	if (err)
-		err = dma_set_mask_and_coherent(&pci_dev->dev,
-						DMA_BIT_MASK(32));
-	if (err)
-		dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");
-
-	/* Device capability is only mandatory for devices that have
-	 * device-specific configuration.
-	 */
-	device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG,
-					    IORESOURCE_IO | IORESOURCE_MEM,
-					    &mdev->modern_bars);
-
-	err = pci_request_selected_regions(pci_dev, mdev->modern_bars,
-					   "virtio-pci-modern");
-	if (err)
-		return err;
-
-	err = -EINVAL;
-	mdev->common = vp_modern_map_capability(mdev, common,
-				      sizeof(struct virtio_pci_common_cfg), 4,
-				      0, sizeof(struct virtio_pci_common_cfg),
-				      NULL);
-	if (!mdev->common)
-		goto err_map_common;
-	mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1,
-					     0, 1,
-					     NULL);
-	if (!mdev->isr)
-		goto err_map_isr;
-
-	/* Read notify_off_multiplier from config space. */
-	pci_read_config_dword(pci_dev,
-			      notify + offsetof(struct virtio_pci_notify_cap,
-						notify_off_multiplier),
-			      &mdev->notify_offset_multiplier);
-	/* Read notify length and offset from config space. */
-	pci_read_config_dword(pci_dev,
-			      notify + offsetof(struct virtio_pci_notify_cap,
-						cap.length),
-			      &notify_length);
-
-	pci_read_config_dword(pci_dev,
-			      notify + offsetof(struct virtio_pci_notify_cap,
-						cap.offset),
-			      &notify_offset);
-
-	/* We don't know how many VQs we'll map, ahead of the time.
-	 * If notify length is small, map it all now.
-	 * Otherwise, map each VQ individually later.
-	 */
-	if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) {
-		mdev->notify_base = vp_modern_map_capability(mdev, notify,
-							     2, 2,
-							     0, notify_length,
-							     &mdev->notify_len);
-		if (!mdev->notify_base)
-			goto err_map_notify;
-	} else {
-		mdev->notify_map_cap = notify;
-	}
-
-	/* Again, we don't know how much we should map, but PAGE_SIZE
-	 * is more than enough for all existing devices.
-	 */
-	if (device) {
-		mdev->device = vp_modern_map_capability(mdev, device, 0, 4,
-							0, PAGE_SIZE,
-							&mdev->device_len);
-		if (!mdev->device)
-			goto err_map_device;
-	}
-
-	return 0;
-
-err_map_device:
-	if (mdev->notify_base)
-		pci_iounmap(pci_dev, mdev->notify_base);
-err_map_notify:
-	pci_iounmap(pci_dev, mdev->isr);
-err_map_isr:
-	pci_iounmap(pci_dev, mdev->common);
-err_map_common:
-	return err;
-}
-
 /* the PCI probing function */
 int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
 {
@@ -1063,23 +463,6 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev)
 	return 0;
 }
 
-/*
- * vp_modern_probe: remove and cleanup the modern virtio pci device
- * @mdev: the modern virtio-pci device
- */
-static void vp_modern_remove(struct virtio_pci_modern_device *mdev)
-{
-	struct pci_dev *pci_dev = mdev->pci_dev;
-
-	if (mdev->device)
-		pci_iounmap(pci_dev, mdev->device);
-	if (mdev->notify_base)
-		pci_iounmap(pci_dev, mdev->notify_base);
-	pci_iounmap(pci_dev, mdev->isr);
-	pci_iounmap(pci_dev, mdev->common);
-	pci_release_selected_regions(pci_dev, mdev->modern_bars);
-}
-
 void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev)
 {
 	struct virtio_pci_modern_device *mdev = &vp_dev->mdev;
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
new file mode 100644
index 000000000000..cbd667496bb1
--- /dev/null
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -0,0 +1,599 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/virtio_pci_modern.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+/*
+ * vp_modern_map_capability - map a part of virtio pci capability
+ * @mdev: the modern virtio-pci device
+ * @off: offset of the capability
+ * @minlen: minimal length of the capability
+ * @align: align requirement
+ * @start: start from the capability
+ * @size: map size
+ * @len: the length that is actually mapped
+ *
+ * Returns the io address of for the part of the capability
+ */
+void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
+				       size_t minlen,
+				       u32 align,
+				       u32 start, u32 size,
+				       size_t *len)
+{
+	struct pci_dev *dev = mdev->pci_dev;
+	u8 bar;
+	u32 offset, length;
+	void __iomem *p;
+
+	pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap,
+						 bar),
+			     &bar);
+	pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset),
+			     &offset);
+	pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length),
+			      &length);
+
+	if (length <= start) {
+		dev_err(&dev->dev,
+			"virtio_pci: bad capability len %u (>%u expected)\n",
+			length, start);
+		return NULL;
+	}
+
+	if (length - start < minlen) {
+		dev_err(&dev->dev,
+			"virtio_pci: bad capability len %u (>=%zu expected)\n",
+			length, minlen);
+		return NULL;
+	}
+
+	length -= start;
+
+	if (start + offset < offset) {
+		dev_err(&dev->dev,
+			"virtio_pci: map wrap-around %u+%u\n",
+			start, offset);
+		return NULL;
+	}
+
+	offset += start;
+
+	if (offset & (align - 1)) {
+		dev_err(&dev->dev,
+			"virtio_pci: offset %u not aligned to %u\n",
+			offset, align);
+		return NULL;
+	}
+
+	if (length > size)
+		length = size;
+
+	if (len)
+		*len = length;
+
+	if (minlen + offset < minlen ||
+	    minlen + offset > pci_resource_len(dev, bar)) {
+		dev_err(&dev->dev,
+			"virtio_pci: map virtio %zu@%u "
+			"out of range on bar %i length %lu\n",
+			minlen, offset,
+			bar, (unsigned long)pci_resource_len(dev, bar));
+		return NULL;
+	}
+
+	p = pci_iomap_range(dev, bar, offset, length);
+	if (!p)
+		dev_err(&dev->dev,
+			"virtio_pci: unable to map virtio %u@%u on bar %i\n",
+			length, offset, bar);
+	return p;
+}
+EXPORT_SYMBOL_GPL(vp_modern_map_capability);
+
+/**
+ * virtio_pci_find_capability - walk capabilities to find device info.
+ * @dev: the pci device
+ * @cfg_type: the VIRTIO_PCI_CAP_* value we seek
+ * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO.
+ * @bars: the bitmask of BARs
+ *
+ * Returns offset of the capability, or 0.
+ */
+static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type,
+					     u32 ioresource_types, int *bars)
+{
+	int pos;
+
+	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
+		u8 type, bar;
+		pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
+							 cfg_type),
+				     &type);
+		pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap,
+							 bar),
+				     &bar);
+
+		/* Ignore structures with reserved BAR values */
+		if (bar > 0x5)
+			continue;
+
+		if (type == cfg_type) {
+			if (pci_resource_len(dev, bar) &&
+			    pci_resource_flags(dev, bar) & ioresource_types) {
+				*bars |= (1 << bar);
+				return pos;
+			}
+		}
+	}
+	return 0;
+}
+
+/* This is part of the ABI.  Don't screw with it. */
+static inline void check_offsets(void)
+{
+	/* Note: disk space was harmed in compilation of this function. */
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR !=
+		     offsetof(struct virtio_pci_cap, cap_vndr));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT !=
+		     offsetof(struct virtio_pci_cap, cap_next));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN !=
+		     offsetof(struct virtio_pci_cap, cap_len));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE !=
+		     offsetof(struct virtio_pci_cap, cfg_type));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR !=
+		     offsetof(struct virtio_pci_cap, bar));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET !=
+		     offsetof(struct virtio_pci_cap, offset));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH !=
+		     offsetof(struct virtio_pci_cap, length));
+	BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT !=
+		     offsetof(struct virtio_pci_notify_cap,
+			      notify_off_multiplier));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT !=
+		     offsetof(struct virtio_pci_common_cfg,
+			      device_feature_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF !=
+		     offsetof(struct virtio_pci_common_cfg, device_feature));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT !=
+		     offsetof(struct virtio_pci_common_cfg,
+			      guest_feature_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF !=
+		     offsetof(struct virtio_pci_common_cfg, guest_feature));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX !=
+		     offsetof(struct virtio_pci_common_cfg, msix_config));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ !=
+		     offsetof(struct virtio_pci_common_cfg, num_queues));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS !=
+		     offsetof(struct virtio_pci_common_cfg, device_status));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION !=
+		     offsetof(struct virtio_pci_common_cfg, config_generation));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT !=
+		     offsetof(struct virtio_pci_common_cfg, queue_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE !=
+		     offsetof(struct virtio_pci_common_cfg, queue_size));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX !=
+		     offsetof(struct virtio_pci_common_cfg, queue_msix_vector));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE !=
+		     offsetof(struct virtio_pci_common_cfg, queue_enable));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF !=
+		     offsetof(struct virtio_pci_common_cfg, queue_notify_off));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_desc_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_desc_hi));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_avail_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_avail_hi));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_used_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_used_hi));
+}
+
+/*
+ * vp_modern_probe: probe the modern virtio pci device, note that the
+ * caller is required to enable PCI device before calling this function.
+ * @mdev: the modern virtio-pci device
+ *
+ * Return 0 on succeed otherwise fail
+ */
+int vp_modern_probe(struct virtio_pci_modern_device *mdev)
+{
+	struct pci_dev *pci_dev = mdev->pci_dev;
+	int err, common, isr, notify, device;
+	u32 notify_length;
+	u32 notify_offset;
+
+	check_offsets();
+
+	mdev->pci_dev = pci_dev;
+
+	/* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */
+	if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f)
+		return -ENODEV;
+
+	if (pci_dev->device < 0x1040) {
+		/* Transitional devices: use the PCI subsystem device id as
+		 * virtio device id, same as legacy driver always did.
+		 */
+		mdev->id.device = pci_dev->subsystem_device;
+	} else {
+		/* Modern devices: simply use PCI device id, but start from 0x1040. */
+		mdev->id.device = pci_dev->device - 0x1040;
+	}
+	mdev->id.vendor = pci_dev->subsystem_vendor;
+
+	/* check for a common config: if not, use legacy mode (bar 0). */
+	common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG,
+					    IORESOURCE_IO | IORESOURCE_MEM,
+					    &mdev->modern_bars);
+	if (!common) {
+		dev_info(&pci_dev->dev,
+			 "virtio_pci: leaving for legacy driver\n");
+		return -ENODEV;
+	}
+
+	/* If common is there, these should be too... */
+	isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG,
+					 IORESOURCE_IO | IORESOURCE_MEM,
+					 &mdev->modern_bars);
+	notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG,
+					    IORESOURCE_IO | IORESOURCE_MEM,
+					    &mdev->modern_bars);
+	if (!isr || !notify) {
+		dev_err(&pci_dev->dev,
+			"virtio_pci: missing capabilities %i/%i/%i\n",
+			common, isr, notify);
+		return -EINVAL;
+	}
+
+	err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
+	if (err)
+		err = dma_set_mask_and_coherent(&pci_dev->dev,
+						DMA_BIT_MASK(32));
+	if (err)
+		dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");
+
+	/* Device capability is only mandatory for devices that have
+	 * device-specific configuration.
+	 */
+	device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG,
+					    IORESOURCE_IO | IORESOURCE_MEM,
+					    &mdev->modern_bars);
+
+	err = pci_request_selected_regions(pci_dev, mdev->modern_bars,
+					   "virtio-pci-modern");
+	if (err)
+		return err;
+
+	err = -EINVAL;
+	mdev->common = vp_modern_map_capability(mdev, common,
+				      sizeof(struct virtio_pci_common_cfg), 4,
+				      0, sizeof(struct virtio_pci_common_cfg),
+				      NULL);
+	if (!mdev->common)
+		goto err_map_common;
+	mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1,
+					     0, 1,
+					     NULL);
+	if (!mdev->isr)
+		goto err_map_isr;
+
+	/* Read notify_off_multiplier from config space. */
+	pci_read_config_dword(pci_dev,
+			      notify + offsetof(struct virtio_pci_notify_cap,
+						notify_off_multiplier),
+			      &mdev->notify_offset_multiplier);
+	/* Read notify length and offset from config space. */
+	pci_read_config_dword(pci_dev,
+			      notify + offsetof(struct virtio_pci_notify_cap,
+						cap.length),
+			      &notify_length);
+
+	pci_read_config_dword(pci_dev,
+			      notify + offsetof(struct virtio_pci_notify_cap,
+						cap.offset),
+			      &notify_offset);
+
+	/* We don't know how many VQs we'll map, ahead of the time.
+	 * If notify length is small, map it all now.
+	 * Otherwise, map each VQ individually later.
+	 */
+	if ((u64)notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) {
+		mdev->notify_base = vp_modern_map_capability(mdev, notify,
+							     2, 2,
+							     0, notify_length,
+							     &mdev->notify_len);
+		if (!mdev->notify_base)
+			goto err_map_notify;
+	} else {
+		mdev->notify_map_cap = notify;
+	}
+
+	/* Again, we don't know how much we should map, but PAGE_SIZE
+	 * is more than enough for all existing devices.
+	 */
+	if (device) {
+		mdev->device = vp_modern_map_capability(mdev, device, 0, 4,
+							0, PAGE_SIZE,
+							&mdev->device_len);
+		if (!mdev->device)
+			goto err_map_device;
+	}
+
+	return 0;
+
+err_map_device:
+	if (mdev->notify_base)
+		pci_iounmap(pci_dev, mdev->notify_base);
+err_map_notify:
+	pci_iounmap(pci_dev, mdev->isr);
+err_map_isr:
+	pci_iounmap(pci_dev, mdev->common);
+err_map_common:
+	return err;
+}
+EXPORT_SYMBOL_GPL(vp_modern_probe);
+
+/*
+ * vp_modern_probe: remove and cleanup the modern virtio pci device
+ * @mdev: the modern virtio-pci device
+ */
+void vp_modern_remove(struct virtio_pci_modern_device *mdev)
+{
+	struct pci_dev *pci_dev = mdev->pci_dev;
+
+	if (mdev->device)
+		pci_iounmap(pci_dev, mdev->device);
+	if (mdev->notify_base)
+		pci_iounmap(pci_dev, mdev->notify_base);
+	pci_iounmap(pci_dev, mdev->isr);
+	pci_iounmap(pci_dev, mdev->common);
+	pci_release_selected_regions(pci_dev, mdev->modern_bars);
+}
+EXPORT_SYMBOL_GPL(vp_modern_remove);
+
+/*
+ * vp_modern_get_features - get features from device
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the features read from the device
+ */
+u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	u64 features;
+
+	vp_iowrite32(0, &cfg->device_feature_select);
+	features = vp_ioread32(&cfg->device_feature);
+	vp_iowrite32(1, &cfg->device_feature_select);
+	features |= ((u64)vp_ioread32(&cfg->device_feature) << 32);
+
+	return features;
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_features);
+
+/*
+ * vp_modern_set_features - set features to device
+ * @mdev: the modern virtio-pci device
+ * @features: the features set to device
+ */
+void vp_modern_set_features(struct virtio_pci_modern_device *mdev,
+			    u64 features)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	vp_iowrite32(0, &cfg->guest_feature_select);
+	vp_iowrite32((u32)features, &cfg->guest_feature);
+	vp_iowrite32(1, &cfg->guest_feature_select);
+	vp_iowrite32(features >> 32, &cfg->guest_feature);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_features);
+
+/*
+ * vp_modern_generation - get the device genreation
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the genreation read from device
+ */
+u32 vp_modern_generation(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	return vp_ioread8(&cfg->config_generation);
+}
+EXPORT_SYMBOL_GPL(vp_modern_generation);
+
+/*
+ * vp_modern_get_status - get the device status
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the status read from device
+ */
+u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	return vp_ioread8(&cfg->device_status);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_status);
+
+/*
+ * vp_modern_set_status - set status to device
+ * @mdev: the modern virtio-pci device
+ * @status: the status set to device
+ */
+void vp_modern_set_status(struct virtio_pci_modern_device *mdev,
+				 u8 status)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	vp_iowrite8(status, &cfg->device_status);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_status);
+
+/*
+ * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: queue index
+ * @vector: the config vector
+ *
+ * Returns the config vector read from the device
+ */
+u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev,
+			   u16 index, u16 vector)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	vp_iowrite16(index, &cfg->queue_select);
+	vp_iowrite16(vector, &cfg->queue_msix_vector);
+	/* Flush the write out to device */
+	return vp_ioread16(&cfg->queue_msix_vector);
+}
+EXPORT_SYMBOL_GPL(vp_modern_queue_vector);
+
+/*
+ * vp_modern_config_vector - set the vector for config interrupt
+ * @mdev: the modern virtio-pci device
+ * @vector: the config vector
+ *
+ * Returns the config vector read from the device
+ */
+u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev,
+			    u16 vector)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	/* Setup the vector used for configuration events */
+	vp_iowrite16(vector, &cfg->msix_config);
+	/* Verify we had enough resources to assign the vector */
+	/* Will also flush the write out to device */
+	return vp_ioread16(&cfg->msix_config);
+}
+EXPORT_SYMBOL_GPL(vp_modern_config_vector);
+
+/*
+ * vp_modern_queue_address - set the virtqueue address
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @desc_addr: address of the descriptor area
+ * @driver_addr: address of the driver area
+ * @device_addr: address of the device area
+ */
+void vp_modern_queue_address(struct virtio_pci_modern_device *mdev,
+			     u16 index, u64 desc_addr, u64 driver_addr,
+			     u64 device_addr)
+{
+	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+
+	vp_iowrite16(index, &cfg->queue_select);
+
+	vp_iowrite64_twopart(desc_addr, &cfg->queue_desc_lo,
+			     &cfg->queue_desc_hi);
+	vp_iowrite64_twopart(driver_addr, &cfg->queue_avail_lo,
+			     &cfg->queue_avail_hi);
+	vp_iowrite64_twopart(device_addr, &cfg->queue_used_lo,
+			     &cfg->queue_used_hi);
+}
+EXPORT_SYMBOL_GPL(vp_modern_queue_address);
+
+/*
+ * vp_modern_set_queue_enable - enable a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @enable: whether the virtqueue is enable or not
+ */
+void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev,
+				u16 index, bool enable)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+	vp_iowrite16(enable, &mdev->common->queue_enable);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_queue_enable);
+
+/*
+ * vp_modern_get_queue_enable - enable a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns whether a virtqueue is enabled or not
+ */
+bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev,
+				u16 index)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+
+	return vp_ioread16(&mdev->common->queue_enable);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_enable);
+
+/*
+ * vp_modern_set_queue_size - set size for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ * @size: the size of the virtqueue
+ */
+void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
+			      u16 index, u16 size)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+	vp_iowrite16(size, &mdev->common->queue_size);
+
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_queue_size);
+
+/*
+ * vp_modern_get_queue_size - get size for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the size of the virtqueue
+ */
+u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
+			     u16 index)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+
+	return vp_ioread16(&mdev->common->queue_size);
+
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_size);
+
+/*
+ * vp_modern_get_num_queues - get the number of virtqueues
+ * @mdev: the modern virtio-pci device
+ *
+ * Returns the number of virtqueues
+ */
+u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev)
+{
+	return vp_ioread16(&mdev->common->num_queues);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_num_queues);
+
+/*
+ * vp_modern_get_queue_notify_off - get notification offset for a virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the notification offset for a virtqueue
+ */
+u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
+				   u16 index)
+{
+	vp_iowrite16(index, &mdev->common->queue_select);
+
+	return vp_ioread16(&mdev->common->queue_notify_off);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_notify_off);
+
+MODULE_VERSION("0.1");
+MODULE_DESCRIPTION("Modern Virtio PCI Device");
+MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
new file mode 100644
index 000000000000..f26acbeec965
--- /dev/null
+++ b/include/linux/virtio_pci_modern.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VIRTIO_PCI_MODERN_H
+#define _LINUX_VIRTIO_PCI_MODERN_H
+
+#include <linux/pci.h>
+#include <linux/virtio_pci.h>
+
+struct virtio_pci_modern_device {
+	struct pci_dev *pci_dev;
+
+	struct virtio_pci_common_cfg __iomem *common;
+	/* Device-specific data (non-legacy mode)  */
+	void __iomem *device;
+	/* Base of vq notifications (non-legacy mode). */
+	void __iomem *notify_base;
+	/* Where to read and clear interrupt */
+	u8 __iomem *isr;
+
+	/* So we can sanity-check accesses. */
+	size_t notify_len;
+	size_t device_len;
+
+	/* Capability for when we need to map notifications per-vq. */
+	int notify_map_cap;
+
+	/* Multiply queue_notify_off by this value. (non-legacy mode). */
+	u32 notify_offset_multiplier;
+
+	int modern_bars;
+
+	struct virtio_device_id id;
+};
+
+/*
+ * Type-safe wrappers for io accesses.
+ * Use these to enforce at compile time the following spec requirement:
+ *
+ * The driver MUST access each field using the “natural” access
+ * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses
+ * for 16-bit fields and 8-bit accesses for 8-bit fields.
+ */
+static inline u8 vp_ioread8(const u8 __iomem *addr)
+{
+	return ioread8(addr);
+}
+static inline u16 vp_ioread16 (const __le16 __iomem *addr)
+{
+	return ioread16(addr);
+}
+
+static inline u32 vp_ioread32(const __le32 __iomem *addr)
+{
+	return ioread32(addr);
+}
+
+static inline void vp_iowrite8(u8 value, u8 __iomem *addr)
+{
+	iowrite8(value, addr);
+}
+
+static inline void vp_iowrite16(u16 value, __le16 __iomem *addr)
+{
+	iowrite16(value, addr);
+}
+
+static inline void vp_iowrite32(u32 value, __le32 __iomem *addr)
+{
+	iowrite32(value, addr);
+}
+
+static inline void vp_iowrite64_twopart(u64 val,
+					__le32 __iomem *lo,
+					__le32 __iomem *hi)
+{
+	vp_iowrite32((u32)val, lo);
+	vp_iowrite32(val >> 32, hi);
+}
+
+u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev);
+void vp_modern_set_features(struct virtio_pci_modern_device *mdev,
+		     u64 features);
+u32 vp_modern_generation(struct virtio_pci_modern_device *mdev);
+u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev);
+void vp_modern_set_status(struct virtio_pci_modern_device *mdev,
+		   u8 status);
+u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev,
+			   u16 idx, u16 vector);
+u16 vp_modern_config_vector(struct virtio_pci_modern_device *mdev,
+		     u16 vector);
+void vp_modern_queue_address(struct virtio_pci_modern_device *mdev,
+			     u16 index, u64 desc_addr, u64 driver_addr,
+			     u64 device_addr);
+void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev,
+				u16 idx, bool enable);
+bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev,
+				u16 idx);
+void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
+			      u16 idx, u16 size);
+u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
+			     u16 idx);
+u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev);
+u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
+				   u16 idx);
+void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
+				       size_t minlen,
+				       u32 align,
+				       u32 start, u32 size,
+				       size_t *len);
+int vp_modern_probe(struct virtio_pci_modern_device *mdev);
+void vp_modern_remove(struct virtio_pci_modern_device *mdev);
+#endif
-- 
cgit v1.2.3


From c0ea57608b691d6cde8aff23e11f9858a86b5918 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 16 Feb 2021 16:52:47 +0100
Subject: blktrace: remove debugfs file dentries from struct blk_trace

These debugfs dentries do not need to be saved for anything as the whole
directory and everything in it is properly cleaned up when the parent
directory is removed.  So remove them from struct blk_trace and don't
save them when created as it's not necessary.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-block@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blktrace_api.h | 2 --
 kernel/trace/blktrace.c      | 8 ++------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index e17d04abf6a3..a083e15df608 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -23,8 +23,6 @@ struct blk_trace {
 	u32 pid;
 	u32 dev;
 	struct dentry *dir;
-	struct dentry *dropped_file;
-	struct dentry *msg_file;
 	struct list_head running_list;
 	atomic_t dropped;
 };
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d7ebef83771c..0c8690d9f6cd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -311,8 +311,6 @@ record_it:
 
 static void blk_trace_free(struct blk_trace *bt)
 {
-	debugfs_remove(bt->msg_file);
-	debugfs_remove(bt->dropped_file);
 	relay_close(bt->rchan);
 	debugfs_remove(bt->dir);
 	free_percpu(bt->sequence);
@@ -544,10 +542,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	INIT_LIST_HEAD(&bt->running_list);
 
 	ret = -EIO;
-	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
-					       &blk_dropped_fops);
-
-	bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+	debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+	debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
 
 	bt->rchan = relay_open("trace", dir, buts->buf_size,
 				buts->buf_nr, &blk_relay_callbacks, bt);
-- 
cgit v1.2.3


From ee576c47db60432c37e54b1e2b43a8ca6d3a8dca Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 23 Feb 2021 14:18:58 +0100
Subject: net: icmp: pass zeroed opts from icmp{,v6}_ndo_send before sending

The icmp{,v6}_send functions make all sorts of use of skb->cb, casting
it with IPCB or IP6CB, assuming the skb to have come directly from the
inet layer. But when the packet comes from the ndo layer, especially
when forwarded, there's no telling what might be in skb->cb at that
point. As a result, the icmp sending code risks reading bogus memory
contents, which can result in nasty stack overflows such as this one
reported by a user:

    panic+0x108/0x2ea
    __stack_chk_fail+0x14/0x20
    __icmp_send+0x5bd/0x5c0
    icmp_ndo_send+0x148/0x160

In icmp_send, skb->cb is cast with IPCB and an ip_options struct is read
from it. The optlen parameter there is of particular note, as it can
induce writes beyond bounds. There are quite a few ways that can happen
in __ip_options_echo. For example:

    // sptr/skb are attacker-controlled skb bytes
    sptr = skb_network_header(skb);
    // dptr/dopt points to stack memory allocated by __icmp_send
    dptr = dopt->__data;
    // sopt is the corrupt skb->cb in question
    if (sopt->rr) {
        optlen  = sptr[sopt->rr+1]; // corrupt skb->cb + skb->data
        soffset = sptr[sopt->rr+2]; // corrupt skb->cb + skb->data
	// this now writes potentially attacker-controlled data, over
	// flowing the stack:
        memcpy(dptr, sptr+sopt->rr, optlen);
    }

In the icmpv6_send case, the story is similar, but not as dire, as only
IP6CB(skb)->iif and IP6CB(skb)->dsthao are used. The dsthao case is
worse than the iif case, but it is passed to ipv6_find_tlv, which does
a bit of bounds checking on the value.

This is easy to simulate by doing a `memset(skb->cb, 0x41,
sizeof(skb->cb));` before calling icmp{,v6}_ndo_send, and it's only by
good fortune and the rarity of icmp sending from that context that we've
avoided reports like this until now. For example, in KASAN:

    BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0xa0e/0x12b0
    Write of size 38 at addr ffff888006f1f80e by task ping/89
    CPU: 2 PID: 89 Comm: ping Not tainted 5.10.0-rc7-debug+ #5
    Call Trace:
     dump_stack+0x9a/0xcc
     print_address_description.constprop.0+0x1a/0x160
     __kasan_report.cold+0x20/0x38
     kasan_report+0x32/0x40
     check_memory_region+0x145/0x1a0
     memcpy+0x39/0x60
     __ip_options_echo+0xa0e/0x12b0
     __icmp_send+0x744/0x1700

Actually, out of the 4 drivers that do this, only gtp zeroed the cb for
the v4 case, while the rest did not. So this commit actually removes the
gtp-specific zeroing, while putting the code where it belongs in the
shared infrastructure of icmp{,v6}_ndo_send.

This commit fixes the issue by passing an empty IPCB or IP6CB along to
the functions that actually do the work. For the icmp_send, this was
already trivial, thanks to __icmp_send providing the plumbing function.
For icmpv6_send, this required a tiny bit of refactoring to make it
behave like the v4 case, after which it was straight forward.

Fixes: a2b78e9b2cac ("sunvnet: generate ICMP PTMUD messages for smaller port MTUs")
Reported-by: SinYu <liuxyon@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/netdev/CAF=yD-LOF116aHub6RMe8vB8ZpnrrnoTdqhobEx+bvoA8AsP0w@mail.gmail.com/T/
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20210223131858.72082-1-Jason@zx2c4.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/gtp.c      |  1 -
 include/linux/icmpv6.h | 26 ++++++++++++++++++++------
 include/linux/ipv6.h   |  1 -
 include/net/icmp.h     |  6 +++++-
 net/ipv4/icmp.c        |  5 +++--
 net/ipv6/icmp.c        | 18 +++++++++---------
 net/ipv6/ip6_icmp.c    | 12 +++++++-----
 7 files changed, 44 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 9a70f05baf6e..39c00f050fbd 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -543,7 +543,6 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 	if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) &&
 	    mtu < ntohs(iph->tot_len)) {
 		netdev_dbg(dev, "packet too big, fragmentation needed\n");
-		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 		icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 			      htonl(mtu));
 		goto err_rt;
diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index 452d8978ffc7..9055cb380ee2 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -3,6 +3,7 @@
 #define _LINUX_ICMPV6_H
 
 #include <linux/skbuff.h>
+#include <linux/ipv6.h>
 #include <uapi/linux/icmpv6.h>
 
 static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb)
@@ -15,13 +16,16 @@ static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb)
 #if IS_ENABLED(CONFIG_IPV6)
 
 typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info,
-			     const struct in6_addr *force_saddr);
+			     const struct in6_addr *force_saddr,
+			     const struct inet6_skb_parm *parm);
 void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
-		const struct in6_addr *force_saddr);
+		const struct in6_addr *force_saddr,
+		const struct inet6_skb_parm *parm);
 #if IS_BUILTIN(CONFIG_IPV6)
-static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+				 const struct inet6_skb_parm *parm)
 {
-	icmp6_send(skb, type, code, info, NULL);
+	icmp6_send(skb, type, code, info, NULL, parm);
 }
 static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn)
 {
@@ -34,18 +38,28 @@ static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn)
 	return 0;
 }
 #else
-extern void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info);
+extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+			  const struct inet6_skb_parm *parm);
 extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn);
 extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn);
 #endif
 
+static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+{
+	__icmpv6_send(skb, type, code, info, IP6CB(skb));
+}
+
 int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
 			       unsigned int data_len);
 
 #if IS_ENABLED(CONFIG_NF_NAT)
 void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
 #else
-#define icmpv6_ndo_send icmpv6_send
+static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
+{
+	struct inet6_skb_parm parm = { 0 };
+	__icmpv6_send(skb_in, type, code, info, &parm);
+}
 #endif
 
 #else
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 9d1f29f0c512..70b2ad3b9884 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -85,7 +85,6 @@ struct ipv6_params {
 	__s32 autoconf;
 };
 extern struct ipv6_params ipv6_defaults;
-#include <linux/icmpv6.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 
diff --git a/include/net/icmp.h b/include/net/icmp.h
index 9ac2d2672a93..fd84adc47963 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -46,7 +46,11 @@ static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32
 #if IS_ENABLED(CONFIG_NF_NAT)
 void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info);
 #else
-#define icmp_ndo_send icmp_send
+static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+	struct ip_options opts = { 0 };
+	__icmp_send(skb_in, type, code, info, &opts);
+}
 #endif
 
 int icmp_rcv(struct sk_buff *skb);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 396b492c804f..616e2dc1c8fa 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -775,13 +775,14 @@ EXPORT_SYMBOL(__icmp_send);
 void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 {
 	struct sk_buff *cloned_skb = NULL;
+	struct ip_options opts = { 0 };
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 	__be32 orig_ip;
 
 	ct = nf_ct_get(skb_in, &ctinfo);
 	if (!ct || !(ct->status & IPS_SRC_NAT)) {
-		icmp_send(skb_in, type, code, info);
+		__icmp_send(skb_in, type, code, info, &opts);
 		return;
 	}
 
@@ -796,7 +797,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 
 	orig_ip = ip_hdr(skb_in)->saddr;
 	ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
-	icmp_send(skb_in, type, code, info);
+	__icmp_send(skb_in, type, code, info, &opts);
 	ip_hdr(skb_in)->saddr = orig_ip;
 out:
 	consume_skb(cloned_skb);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index f3d05866692e..fd1f896115c1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -331,10 +331,9 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st
 }
 
 #if IS_ENABLED(CONFIG_IPV6_MIP6)
-static void mip6_addr_swap(struct sk_buff *skb)
+static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt)
 {
 	struct ipv6hdr *iph = ipv6_hdr(skb);
-	struct inet6_skb_parm *opt = IP6CB(skb);
 	struct ipv6_destopt_hao *hao;
 	struct in6_addr tmp;
 	int off;
@@ -351,7 +350,7 @@ static void mip6_addr_swap(struct sk_buff *skb)
 	}
 }
 #else
-static inline void mip6_addr_swap(struct sk_buff *skb) {}
+static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {}
 #endif
 
 static struct dst_entry *icmpv6_route_lookup(struct net *net,
@@ -446,7 +445,8 @@ static int icmp6_iif(const struct sk_buff *skb)
  *	Send an ICMP message in response to a packet in error
  */
 void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
-		const struct in6_addr *force_saddr)
+		const struct in6_addr *force_saddr,
+		const struct inet6_skb_parm *parm)
 {
 	struct inet6_dev *idev = NULL;
 	struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -542,7 +542,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type))
 		goto out_bh_enable;
 
-	mip6_addr_swap(skb);
+	mip6_addr_swap(skb, parm);
 
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
@@ -559,7 +559,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 		/* select a more meaningful saddr from input if */
 		struct net_device *in_netdev;
 
-		in_netdev = dev_get_by_index(net, IP6CB(skb)->iif);
+		in_netdev = dev_get_by_index(net, parm->iif);
 		if (in_netdev) {
 			ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr,
 					   inet6_sk(sk)->srcprefs,
@@ -640,7 +640,7 @@ EXPORT_SYMBOL(icmp6_send);
  */
 void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
 {
-	icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL);
+	icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb));
 	kfree_skb(skb);
 }
 
@@ -697,10 +697,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
 	}
 	if (type == ICMP_TIME_EXCEEDED)
 		icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
-			   info, &temp_saddr);
+			   info, &temp_saddr, IP6CB(skb2));
 	else
 		icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
-			   info, &temp_saddr);
+			   info, &temp_saddr, IP6CB(skb2));
 	if (rt)
 		ip6_rt_put(rt);
 
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
index 70c8c2f36c98..9e3574880cb0 100644
--- a/net/ipv6/ip6_icmp.c
+++ b/net/ipv6/ip6_icmp.c
@@ -33,23 +33,25 @@ int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn)
 }
 EXPORT_SYMBOL(inet6_unregister_icmp_sender);
 
-void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+		   const struct inet6_skb_parm *parm)
 {
 	ip6_icmp_send_t *send;
 
 	rcu_read_lock();
 	send = rcu_dereference(ip6_icmp_send);
 	if (send)
-		send(skb, type, code, info, NULL);
+		send(skb, type, code, info, NULL, parm);
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(icmpv6_send);
+EXPORT_SYMBOL(__icmpv6_send);
 #endif
 
 #if IS_ENABLED(CONFIG_NF_NAT)
 #include <net/netfilter/nf_conntrack.h>
 void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
 {
+	struct inet6_skb_parm parm = { 0 };
 	struct sk_buff *cloned_skb = NULL;
 	enum ip_conntrack_info ctinfo;
 	struct in6_addr orig_ip;
@@ -57,7 +59,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
 
 	ct = nf_ct_get(skb_in, &ctinfo);
 	if (!ct || !(ct->status & IPS_SRC_NAT)) {
-		icmpv6_send(skb_in, type, code, info);
+		__icmpv6_send(skb_in, type, code, info, &parm);
 		return;
 	}
 
@@ -72,7 +74,7 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
 
 	orig_ip = ipv6_hdr(skb_in)->saddr;
 	ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6;
-	icmpv6_send(skb_in, type, code, info);
+	__icmpv6_send(skb_in, type, code, info, &parm);
 	ipv6_hdr(skb_in)->saddr = orig_ip;
 out:
 	consume_skb(cloned_skb);
-- 
cgit v1.2.3


From fa8fef0e104a23efe568b835d9e7e188d1d97610 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:27:55 +0530
Subject: PCI: endpoint: Add helper API to get the 'next' unreserved BAR

Add an API to get the next unreserved BAR starting from a given BAR number
that can be used by the endpoint function.

Link: https://lore.kernel.org/r/20210201195809.7342-4-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/pci-epc-core.c | 26 ++++++++++++++++++++++----
 include/linux/pci-epc.h             |  2 ++
 2 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 25e57672e1a1..1afe5d9afb0d 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -87,17 +87,36 @@ EXPORT_SYMBOL_GPL(pci_epc_get);
  * pci_epc_get_first_free_bar() - helper to get first unreserved BAR
  * @epc_features: pci_epc_features structure that holds the reserved bar bitmap
  *
- * Invoke to get the first unreserved BAR that can be used for endpoint
+ * Invoke to get the first unreserved BAR that can be used by the endpoint
  * function. For any incorrect value in reserved_bar return '0'.
  */
 unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
 					*epc_features)
+{
+	return pci_epc_get_next_free_bar(epc_features, BAR_0);
+}
+EXPORT_SYMBOL_GPL(pci_epc_get_first_free_bar);
+
+/**
+ * pci_epc_get_next_free_bar() - helper to get unreserved BAR starting from @bar
+ * @epc_features: pci_epc_features structure that holds the reserved bar bitmap
+ * @bar: the starting BAR number from where unreserved BAR should be searched
+ *
+ * Invoke to get the next unreserved BAR starting from @bar that can be used
+ * for endpoint function. For any incorrect value in reserved_bar return '0'.
+ */
+unsigned int pci_epc_get_next_free_bar(const struct pci_epc_features
+				       *epc_features, enum pci_barno bar)
 {
 	unsigned long free_bar;
 
 	if (!epc_features)
 		return 0;
 
+	/* If 'bar - 1' is a 64-bit BAR, move to the next BAR */
+	if ((epc_features->bar_fixed_64bit << 1) & 1 << bar)
+		bar++;
+
 	/* Find if the reserved BAR is also a 64-bit BAR */
 	free_bar = epc_features->reserved_bar & epc_features->bar_fixed_64bit;
 
@@ -105,14 +124,13 @@ unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
 	free_bar <<= 1;
 	free_bar |= epc_features->reserved_bar;
 
-	/* Now find the free BAR */
-	free_bar = ffz(free_bar);
+	free_bar = find_next_zero_bit(&free_bar, 6, bar);
 	if (free_bar > 5)
 		return 0;
 
 	return free_bar;
 }
-EXPORT_SYMBOL_GPL(pci_epc_get_first_free_bar);
+EXPORT_SYMBOL_GPL(pci_epc_get_next_free_bar);
 
 /**
  * pci_epc_get_features() - get the features supported by EPC
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index cc66bec8be90..cfe9b427e6b7 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -203,6 +203,8 @@ const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc,
 						    u8 func_no);
 unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
 					*epc_features);
+unsigned int pci_epc_get_next_free_bar(const struct pci_epc_features
+				       *epc_features, enum pci_barno bar);
 struct pci_epc *pci_epc_get(const char *epc_name);
 void pci_epc_put(struct pci_epc *epc);
 
-- 
cgit v1.2.3


From 0e27aeccfa3d1bab7c6a29fb8e6fcedbad7b09a8 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:27:56 +0530
Subject: PCI: endpoint: Make *_free_bar() to return error codes on failure

Modify pci_epc_get_next_free_bar() and pci_epc_get_first_free_bar() to
return error values if there are no free BARs available.

Link: https://lore.kernel.org/r/20210201195809.7342-5-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/functions/pci-epf-test.c |  2 ++
 drivers/pci/endpoint/pci-epc-core.c           | 12 ++++++------
 include/linux/pci-epc.h                       |  8 ++++----
 include/linux/pci-epf.h                       |  1 +
 4 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index e4e51d884553..7a1f3abfde48 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -834,6 +834,8 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 		linkup_notifier = epc_features->linkup_notifier;
 		core_init_notifier = epc_features->core_init_notifier;
 		test_reg_bar = pci_epc_get_first_free_bar(epc_features);
+		if (test_reg_bar < 0)
+			return -EINVAL;
 		pci_epf_configure_bar(epf, epc_features);
 	}
 
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 1afe5d9afb0d..ea7e7465ce7a 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -90,8 +90,8 @@ EXPORT_SYMBOL_GPL(pci_epc_get);
  * Invoke to get the first unreserved BAR that can be used by the endpoint
  * function. For any incorrect value in reserved_bar return '0'.
  */
-unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
-					*epc_features)
+enum pci_barno
+pci_epc_get_first_free_bar(const struct pci_epc_features *epc_features)
 {
 	return pci_epc_get_next_free_bar(epc_features, BAR_0);
 }
@@ -105,13 +105,13 @@ EXPORT_SYMBOL_GPL(pci_epc_get_first_free_bar);
  * Invoke to get the next unreserved BAR starting from @bar that can be used
  * for endpoint function. For any incorrect value in reserved_bar return '0'.
  */
-unsigned int pci_epc_get_next_free_bar(const struct pci_epc_features
-				       *epc_features, enum pci_barno bar)
+enum pci_barno pci_epc_get_next_free_bar(const struct pci_epc_features
+					 *epc_features, enum pci_barno bar)
 {
 	unsigned long free_bar;
 
 	if (!epc_features)
-		return 0;
+		return BAR_0;
 
 	/* If 'bar - 1' is a 64-bit BAR, move to the next BAR */
 	if ((epc_features->bar_fixed_64bit << 1) & 1 << bar)
@@ -126,7 +126,7 @@ unsigned int pci_epc_get_next_free_bar(const struct pci_epc_features
 
 	free_bar = find_next_zero_bit(&free_bar, 6, bar);
 	if (free_bar > 5)
-		return 0;
+		return NO_BAR;
 
 	return free_bar;
 }
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index cfe9b427e6b7..88d311bad984 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -201,10 +201,10 @@ int pci_epc_start(struct pci_epc *epc);
 void pci_epc_stop(struct pci_epc *epc);
 const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc,
 						    u8 func_no);
-unsigned int pci_epc_get_first_free_bar(const struct pci_epc_features
-					*epc_features);
-unsigned int pci_epc_get_next_free_bar(const struct pci_epc_features
-				       *epc_features, enum pci_barno bar);
+enum pci_barno
+pci_epc_get_first_free_bar(const struct pci_epc_features *epc_features);
+enum pci_barno pci_epc_get_next_free_bar(const struct pci_epc_features
+					 *epc_features, enum pci_barno bar);
 struct pci_epc *pci_epc_get(const char *epc_name);
 void pci_epc_put(struct pci_epc *epc);
 
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 6644ff3b0702..fa3aca43eb19 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -21,6 +21,7 @@ enum pci_notify_event {
 };
 
 enum pci_barno {
+	NO_BAR = -1,
 	BAR_0,
 	BAR_1,
 	BAR_2,
-- 
cgit v1.2.3


From 7e5a51ebb321537c4209cdd0c54c4c19b3ef960d Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:27:57 +0530
Subject: PCI: endpoint: Remove unused pci_epf_match_device()

Remove unused pci_epf_match_device() function added in pci-epf-core.c

Link: https://lore.kernel.org/r/20210201195809.7342-6-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/pci-epf-core.c | 16 ----------------
 include/linux/pci-epf.h             |  2 --
 2 files changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index c977cf9dce56..e44a317a2a2a 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -282,22 +282,6 @@ struct pci_epf *pci_epf_create(const char *name)
 }
 EXPORT_SYMBOL_GPL(pci_epf_create);
 
-const struct pci_epf_device_id *
-pci_epf_match_device(const struct pci_epf_device_id *id, struct pci_epf *epf)
-{
-	if (!id || !epf)
-		return NULL;
-
-	while (*id->name) {
-		if (strcmp(epf->name, id->name) == 0)
-			return id;
-		id++;
-	}
-
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(pci_epf_match_device);
-
 static void pci_epf_dev_release(struct device *dev)
 {
 	struct pci_epf *epf = to_pci_epf(dev);
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index fa3aca43eb19..f373a134ac04 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -165,8 +165,6 @@ static inline void *epf_get_drvdata(struct pci_epf *epf)
 	return dev_get_drvdata(&epf->dev);
 }
 
-const struct pci_epf_device_id *
-pci_epf_match_device(const struct pci_epf_device_id *id, struct pci_epf *epf);
 struct pci_epf *pci_epf_create(const char *name);
 void pci_epf_destroy(struct pci_epf *epf);
 int __pci_epf_register_driver(struct pci_epf_driver *driver,
-- 
cgit v1.2.3


From 63840ff5322373d665b2b9c59cd64233d5f0691e Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:27:58 +0530
Subject: PCI: endpoint: Add support to associate secondary EPC with EPF

In the case of standard endpoint functions, only one endpoint controller
(EPC) will be associated with an endpoint function (EPF). However for
providing NTB (non transparent bridge) functionality, two EPCs should be
associated with a single EPF.  Add support to associate secondary EPC with
EPF. This is in preparation for adding NTB endpoint function driver.

Link: https://lore.kernel.org/r/20210201195809.7342-7-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/functions/pci-epf-test.c | 11 ++++--
 drivers/pci/endpoint/pci-ep-cfs.c             |  6 +--
 drivers/pci/endpoint/pci-epc-core.c           | 47 ++++++++++++++++------
 drivers/pci/endpoint/pci-epf-core.c           | 57 +++++++++++++++++++--------
 include/linux/pci-epc.h                       | 25 +++++++++++-
 include/linux/pci-epf.h                       | 17 +++++++-
 6 files changed, 125 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index 7a1f3abfde48..c0ac4e9cbe72 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -619,7 +619,8 @@ static void pci_epf_test_unbind(struct pci_epf *epf)
 
 		if (epf_test->reg[bar]) {
 			pci_epc_clear_bar(epc, epf->func_no, epf_bar);
-			pci_epf_free_space(epf, epf_test->reg[bar], bar);
+			pci_epf_free_space(epf, epf_test->reg[bar], bar,
+					   PRIMARY_INTERFACE);
 		}
 	}
 }
@@ -651,7 +652,8 @@ static int pci_epf_test_set_bar(struct pci_epf *epf)
 
 		ret = pci_epc_set_bar(epc, epf->func_no, epf_bar);
 		if (ret) {
-			pci_epf_free_space(epf, epf_test->reg[bar], bar);
+			pci_epf_free_space(epf, epf_test->reg[bar], bar,
+					   PRIMARY_INTERFACE);
 			dev_err(dev, "Failed to set BAR%d\n", bar);
 			if (bar == test_reg_bar)
 				return ret;
@@ -771,7 +773,7 @@ static int pci_epf_test_alloc_space(struct pci_epf *epf)
 	}
 
 	base = pci_epf_alloc_space(epf, test_reg_size, test_reg_bar,
-				   epc_features->align);
+				   epc_features->align, PRIMARY_INTERFACE);
 	if (!base) {
 		dev_err(dev, "Failed to allocated register space\n");
 		return -ENOMEM;
@@ -789,7 +791,8 @@ static int pci_epf_test_alloc_space(struct pci_epf *epf)
 			continue;
 
 		base = pci_epf_alloc_space(epf, bar_size[bar], bar,
-					   epc_features->align);
+					   epc_features->align,
+					   PRIMARY_INTERFACE);
 		if (!base)
 			dev_err(dev, "Failed to allocate space for BAR%d\n",
 				bar);
diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c
index 3710adf51912..6ca9e2f92460 100644
--- a/drivers/pci/endpoint/pci-ep-cfs.c
+++ b/drivers/pci/endpoint/pci-ep-cfs.c
@@ -94,13 +94,13 @@ static int pci_epc_epf_link(struct config_item *epc_item,
 	struct pci_epc *epc = epc_group->epc;
 	struct pci_epf *epf = epf_group->epf;
 
-	ret = pci_epc_add_epf(epc, epf);
+	ret = pci_epc_add_epf(epc, epf, PRIMARY_INTERFACE);
 	if (ret)
 		return ret;
 
 	ret = pci_epf_bind(epf);
 	if (ret) {
-		pci_epc_remove_epf(epc, epf);
+		pci_epc_remove_epf(epc, epf, PRIMARY_INTERFACE);
 		return ret;
 	}
 
@@ -120,7 +120,7 @@ static void pci_epc_epf_unlink(struct config_item *epc_item,
 	epc = epc_group->epc;
 	epf = epf_group->epf;
 	pci_epf_unbind(epf);
-	pci_epc_remove_epf(epc, epf);
+	pci_epc_remove_epf(epc, epf, PRIMARY_INTERFACE);
 }
 
 static struct configfs_item_operations pci_epc_item_ops = {
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index ea7e7465ce7a..3693eca5b030 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -493,21 +493,28 @@ EXPORT_SYMBOL_GPL(pci_epc_write_header);
  * pci_epc_add_epf() - bind PCI endpoint function to an endpoint controller
  * @epc: the EPC device to which the endpoint function should be added
  * @epf: the endpoint function to be added
+ * @type: Identifies if the EPC is connected to the primary or secondary
+ *        interface of EPF
  *
  * A PCI endpoint device can have one or more functions. In the case of PCIe,
  * the specification allows up to 8 PCIe endpoint functions. Invoke
  * pci_epc_add_epf() to add a PCI endpoint function to an endpoint controller.
  */
-int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf)
+int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf,
+		    enum pci_epc_interface_type type)
 {
+	struct list_head *list;
 	u32 func_no;
 	int ret = 0;
 
-	if (epf->epc)
+	if (IS_ERR_OR_NULL(epc))
+		return -EINVAL;
+
+	if (type == PRIMARY_INTERFACE && epf->epc)
 		return -EBUSY;
 
-	if (IS_ERR(epc))
-		return -EINVAL;
+	if (type == SECONDARY_INTERFACE && epf->sec_epc)
+		return -EBUSY;
 
 	mutex_lock(&epc->lock);
 	func_no = find_first_zero_bit(&epc->function_num_map,
@@ -524,11 +531,17 @@ int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf)
 	}
 
 	set_bit(func_no, &epc->function_num_map);
-	epf->func_no = func_no;
-	epf->epc = epc;
-
-	list_add_tail(&epf->list, &epc->pci_epf);
+	if (type == PRIMARY_INTERFACE) {
+		epf->func_no = func_no;
+		epf->epc = epc;
+		list = &epf->list;
+	} else {
+		epf->sec_epc_func_no = func_no;
+		epf->sec_epc = epc;
+		list = &epf->sec_epc_list;
+	}
 
+	list_add_tail(list, &epc->pci_epf);
 ret:
 	mutex_unlock(&epc->lock);
 
@@ -543,14 +556,26 @@ EXPORT_SYMBOL_GPL(pci_epc_add_epf);
  *
  * Invoke to remove PCI endpoint function from the endpoint controller.
  */
-void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf)
+void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf,
+			enum pci_epc_interface_type type)
 {
+	struct list_head *list;
+	u32 func_no = 0;
+
 	if (!epc || IS_ERR(epc) || !epf)
 		return;
 
+	if (type == PRIMARY_INTERFACE) {
+		func_no = epf->func_no;
+		list = &epf->list;
+	} else {
+		func_no = epf->sec_epc_func_no;
+		list = &epf->sec_epc_list;
+	}
+
 	mutex_lock(&epc->lock);
-	clear_bit(epf->func_no, &epc->function_num_map);
-	list_del(&epf->list);
+	clear_bit(func_no, &epc->function_num_map);
+	list_del(list);
 	epf->epc = NULL;
 	mutex_unlock(&epc->lock);
 }
diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index e44a317a2a2a..79329ec6373c 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -74,24 +74,37 @@ EXPORT_SYMBOL_GPL(pci_epf_bind);
  * @epf: the EPF device from whom to free the memory
  * @addr: the virtual address of the PCI EPF register space
  * @bar: the BAR number corresponding to the register space
+ * @type: Identifies if the allocated space is for primary EPC or secondary EPC
  *
  * Invoke to free the allocated PCI EPF register space.
  */
-void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar)
+void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
+			enum pci_epc_interface_type type)
 {
 	struct device *dev = epf->epc->dev.parent;
+	struct pci_epf_bar *epf_bar;
+	struct pci_epc *epc;
 
 	if (!addr)
 		return;
 
-	dma_free_coherent(dev, epf->bar[bar].size, addr,
-			  epf->bar[bar].phys_addr);
+	if (type == PRIMARY_INTERFACE) {
+		epc = epf->epc;
+		epf_bar = epf->bar;
+	} else {
+		epc = epf->sec_epc;
+		epf_bar = epf->sec_epc_bar;
+	}
+
+	dev = epc->dev.parent;
+	dma_free_coherent(dev, epf_bar[bar].size, addr,
+			  epf_bar[bar].phys_addr);
 
-	epf->bar[bar].phys_addr = 0;
-	epf->bar[bar].addr = NULL;
-	epf->bar[bar].size = 0;
-	epf->bar[bar].barno = 0;
-	epf->bar[bar].flags = 0;
+	epf_bar[bar].phys_addr = 0;
+	epf_bar[bar].addr = NULL;
+	epf_bar[bar].size = 0;
+	epf_bar[bar].barno = 0;
+	epf_bar[bar].flags = 0;
 }
 EXPORT_SYMBOL_GPL(pci_epf_free_space);
 
@@ -101,15 +114,18 @@ EXPORT_SYMBOL_GPL(pci_epf_free_space);
  * @size: the size of the memory that has to be allocated
  * @bar: the BAR number corresponding to the allocated register space
  * @align: alignment size for the allocation region
+ * @type: Identifies if the allocation is for primary EPC or secondary EPC
  *
  * Invoke to allocate memory for the PCI EPF register space.
  */
 void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
-			  size_t align)
+			  size_t align, enum pci_epc_interface_type type)
 {
-	void *space;
-	struct device *dev = epf->epc->dev.parent;
+	struct pci_epf_bar *epf_bar;
 	dma_addr_t phys_addr;
+	struct pci_epc *epc;
+	struct device *dev;
+	void *space;
 
 	if (size < 128)
 		size = 128;
@@ -119,17 +135,26 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
 	else
 		size = roundup_pow_of_two(size);
 
+	if (type == PRIMARY_INTERFACE) {
+		epc = epf->epc;
+		epf_bar = epf->bar;
+	} else {
+		epc = epf->sec_epc;
+		epf_bar = epf->sec_epc_bar;
+	}
+
+	dev = epc->dev.parent;
 	space = dma_alloc_coherent(dev, size, &phys_addr, GFP_KERNEL);
 	if (!space) {
 		dev_err(dev, "failed to allocate mem space\n");
 		return NULL;
 	}
 
-	epf->bar[bar].phys_addr = phys_addr;
-	epf->bar[bar].addr = space;
-	epf->bar[bar].size = size;
-	epf->bar[bar].barno = bar;
-	epf->bar[bar].flags |= upper_32_bits(size) ?
+	epf_bar[bar].phys_addr = phys_addr;
+	epf_bar[bar].addr = space;
+	epf_bar[bar].size = size;
+	epf_bar[bar].barno = bar;
+	epf_bar[bar].flags |= upper_32_bits(size) ?
 				PCI_BASE_ADDRESS_MEM_TYPE_64 :
 				PCI_BASE_ADDRESS_MEM_TYPE_32;
 
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 88d311bad984..d9cb3944fb87 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -13,6 +13,12 @@
 
 struct pci_epc;
 
+enum pci_epc_interface_type {
+	UNKNOWN_INTERFACE = -1,
+	PRIMARY_INTERFACE,
+	SECONDARY_INTERFACE,
+};
+
 enum pci_epc_irq_type {
 	PCI_EPC_IRQ_UNKNOWN,
 	PCI_EPC_IRQ_LEGACY,
@@ -20,6 +26,19 @@ enum pci_epc_irq_type {
 	PCI_EPC_IRQ_MSIX,
 };
 
+static inline const char *
+pci_epc_interface_string(enum pci_epc_interface_type type)
+{
+	switch (type) {
+	case PRIMARY_INTERFACE:
+		return "primary";
+	case SECONDARY_INTERFACE:
+		return "secondary";
+	default:
+		return "UNKNOWN interface";
+	}
+}
+
 /**
  * struct pci_epc_ops - set of function pointers for performing EPC operations
  * @write_header: ops to populate configuration space header
@@ -175,10 +194,12 @@ __pci_epc_create(struct device *dev, const struct pci_epc_ops *ops,
 		 struct module *owner);
 void devm_pci_epc_destroy(struct device *dev, struct pci_epc *epc);
 void pci_epc_destroy(struct pci_epc *epc);
-int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf);
+int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf,
+		    enum pci_epc_interface_type type);
 void pci_epc_linkup(struct pci_epc *epc);
 void pci_epc_init_notify(struct pci_epc *epc);
-void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf);
+void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf,
+			enum pci_epc_interface_type type);
 int pci_epc_write_header(struct pci_epc *epc, u8 func_no,
 			 struct pci_epf_header *hdr);
 int pci_epc_set_bar(struct pci_epc *epc, u8 func_no,
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index f373a134ac04..1dc66824f5a8 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -14,6 +14,7 @@
 #include <linux/pci.h>
 
 struct pci_epf;
+enum pci_epc_interface_type;
 
 enum pci_notify_event {
 	CORE_INIT,
@@ -119,6 +120,11 @@ struct pci_epf_bar {
  * @list: to add pci_epf as a list of PCI endpoint functions to pci_epc
  * @nb: notifier block to notify EPF of any EPC events (like linkup)
  * @lock: mutex to protect pci_epf_ops
+ * @sec_epc: the secondary EPC device to which this EPF device is bound
+ * @sec_epc_list: to add pci_epf as list of PCI endpoint functions to secondary
+ *   EPC device
+ * @sec_epc_bar: represents the BAR of EPF device associated with secondary EPC
+ * @sec_epc_func_no: unique (physical) function number within the secondary EPC
  */
 struct pci_epf {
 	struct device		dev;
@@ -135,6 +141,12 @@ struct pci_epf {
 	struct notifier_block   nb;
 	/* mutex to protect against concurrent access of pci_epf_ops */
 	struct mutex		lock;
+
+	/* Below members are to attach secondary EPC to an endpoint function */
+	struct pci_epc		*sec_epc;
+	struct list_head	sec_epc_list;
+	struct pci_epf_bar	sec_epc_bar[6];
+	u8			sec_epc_func_no;
 };
 
 /**
@@ -171,8 +183,9 @@ int __pci_epf_register_driver(struct pci_epf_driver *driver,
 			      struct module *owner);
 void pci_epf_unregister_driver(struct pci_epf_driver *driver);
 void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
-			  size_t align);
-void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar);
+			  size_t align, enum pci_epc_interface_type type);
+void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
+			enum pci_epc_interface_type type);
 int pci_epf_bind(struct pci_epf *epf);
 void pci_epf_unbind(struct pci_epf *epf);
 #endif /* __LINUX_PCI_EPF_H */
-- 
cgit v1.2.3


From 87d5972e476f6c4e98a0abce713c54c6f40661b0 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:28:00 +0530
Subject: PCI: endpoint: Add pci_epc_ops to map MSI IRQ

Add pci_epc_ops to map physical address to MSI address and return MSI data.
The physical address is an address in the outbound region. This is required
to implement doorbell functionality of NTB (non-transparent bridge) wherein
EPC on either side of the interface (primary and secondary) can directly
write to the physical address (in outbound region) of the other interface
to ring doorbell using MSI.

Link: https://lore.kernel.org/r/20210201195809.7342-9-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/pci-epc-core.c | 41 +++++++++++++++++++++++++++++++++++++
 include/linux/pci-epc.h             |  8 ++++++++
 2 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 3693eca5b030..cc8f9eb2b177 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -230,6 +230,47 @@ int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no,
 }
 EXPORT_SYMBOL_GPL(pci_epc_raise_irq);
 
+/**
+ * pci_epc_map_msi_irq() - Map physical address to MSI address and return
+ *                         MSI data
+ * @epc: the EPC device which has the MSI capability
+ * @func_no: the physical endpoint function number in the EPC device
+ * @phys_addr: the physical address of the outbound region
+ * @interrupt_num: the MSI interrupt number
+ * @entry_size: Size of Outbound address region for each interrupt
+ * @msi_data: the data that should be written in order to raise MSI interrupt
+ *            with interrupt number as 'interrupt num'
+ * @msi_addr_offset: Offset of MSI address from the aligned outbound address
+ *                   to which the MSI address is mapped
+ *
+ * Invoke to map physical address to MSI address and return MSI data. The
+ * physical address should be an address in the outbound region. This is
+ * required to implement doorbell functionality of NTB wherein EPC on either
+ * side of the interface (primary and secondary) can directly write to the
+ * physical address (in outbound region) of the other interface to ring
+ * doorbell.
+ */
+int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, phys_addr_t phys_addr,
+			u8 interrupt_num, u32 entry_size, u32 *msi_data,
+			u32 *msi_addr_offset)
+{
+	int ret;
+
+	if (IS_ERR_OR_NULL(epc))
+		return -EINVAL;
+
+	if (!epc->ops->map_msi_irq)
+		return -EINVAL;
+
+	mutex_lock(&epc->lock);
+	ret = epc->ops->map_msi_irq(epc, func_no, phys_addr, interrupt_num,
+				    entry_size, msi_data, msi_addr_offset);
+	mutex_unlock(&epc->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_epc_map_msi_irq);
+
 /**
  * pci_epc_get_msi() - get the number of MSI interrupt numbers allocated
  * @epc: the EPC device to which MSI interrupts was requested
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index d9cb3944fb87..b82c9b100e97 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -55,6 +55,7 @@ pci_epc_interface_string(enum pci_epc_interface_type type)
  * @get_msix: ops to get the number of MSI-X interrupts allocated by the RC
  *	     from the MSI-X capability register
  * @raise_irq: ops to raise a legacy, MSI or MSI-X interrupt
+ * @map_msi_irq: ops to map physical address to MSI address and return MSI data
  * @start: ops to start the PCI link
  * @stop: ops to stop the PCI link
  * @owner: the module owner containing the ops
@@ -77,6 +78,10 @@ struct pci_epc_ops {
 	int	(*get_msix)(struct pci_epc *epc, u8 func_no);
 	int	(*raise_irq)(struct pci_epc *epc, u8 func_no,
 			     enum pci_epc_irq_type type, u16 interrupt_num);
+	int	(*map_msi_irq)(struct pci_epc *epc, u8 func_no,
+			       phys_addr_t phys_addr, u8 interrupt_num,
+			       u32 entry_size, u32 *msi_data,
+			       u32 *msi_addr_offset);
 	int	(*start)(struct pci_epc *epc);
 	void	(*stop)(struct pci_epc *epc);
 	const struct pci_epc_features* (*get_features)(struct pci_epc *epc,
@@ -216,6 +221,9 @@ int pci_epc_get_msi(struct pci_epc *epc, u8 func_no);
 int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts,
 		     enum pci_barno, u32 offset);
 int pci_epc_get_msix(struct pci_epc *epc, u8 func_no);
+int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no,
+			phys_addr_t phys_addr, u8 interrupt_num,
+			u32 entry_size, u32 *msi_data, u32 *msi_addr_offset);
 int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no,
 		      enum pci_epc_irq_type type, u16 interrupt_num);
 int pci_epc_start(struct pci_epc *epc);
-- 
cgit v1.2.3


From 256ae475201b16fd69e00dd6c2d14035e4ea5745 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:28:01 +0530
Subject: PCI: endpoint: Add pci_epf_ops to expose function-specific attrs

In addition to the attributes that are generic across function drivers
documented in Documentation/PCI/endpoint/pci-endpoint-cfs.rst, there could
be function-specific attributes that has to be exposed by the function
driver to be configured by the user. Add ->add_cfs() in pci_epf_ops to be
populated by the function driver if it has to expose any function-specific
attributes and pci_epf_type_add_cfs() to be invoked by pci-ep-cfs.c when
sub-directory to main function directory is created.

Link: https://lore.kernel.org/r/20210201195809.7342-10-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/pci-epf-core.c | 32 ++++++++++++++++++++++++++++++++
 include/linux/pci-epf.h             |  5 +++++
 2 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index 79329ec6373c..7646c8660d42 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -20,6 +20,38 @@ static DEFINE_MUTEX(pci_epf_mutex);
 static struct bus_type pci_epf_bus_type;
 static const struct device_type pci_epf_type;
 
+/**
+ * pci_epf_type_add_cfs() - Help function drivers to expose function specific
+ *                          attributes in configfs
+ * @epf: the EPF device that has to be configured using configfs
+ * @group: the parent configfs group (corresponding to entries in
+ *         pci_epf_device_id)
+ *
+ * Invoke to expose function specific attributes in configfs. If the function
+ * driver does not have anything to expose (attributes configured by user),
+ * return NULL.
+ */
+struct config_group *pci_epf_type_add_cfs(struct pci_epf *epf,
+					  struct config_group *group)
+{
+	struct config_group *epf_type_group;
+
+	if (!epf->driver) {
+		dev_err(&epf->dev, "epf device not bound to driver\n");
+		return NULL;
+	}
+
+	if (!epf->driver->ops->add_cfs)
+		return NULL;
+
+	mutex_lock(&epf->lock);
+	epf_type_group = epf->driver->ops->add_cfs(epf, group);
+	mutex_unlock(&epf->lock);
+
+	return epf_type_group;
+}
+EXPORT_SYMBOL_GPL(pci_epf_type_add_cfs);
+
 /**
  * pci_epf_unbind() - Notify the function driver that the binding between the
  *		      EPF device and EPC device has been lost
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 1dc66824f5a8..b241e7dd171f 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -62,10 +62,13 @@ struct pci_epf_header {
  * @bind: ops to perform when a EPC device has been bound to EPF device
  * @unbind: ops to perform when a binding has been lost between a EPC device
  *	    and EPF device
+ * @add_cfs: ops to initialize function specific configfs attributes
  */
 struct pci_epf_ops {
 	int	(*bind)(struct pci_epf *epf);
 	void	(*unbind)(struct pci_epf *epf);
+	struct config_group *(*add_cfs)(struct pci_epf *epf,
+					struct config_group *group);
 };
 
 /**
@@ -188,4 +191,6 @@ void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
 			enum pci_epc_interface_type type);
 int pci_epf_bind(struct pci_epf *epf);
 void pci_epf_unbind(struct pci_epf *epf);
+struct config_group *pci_epf_type_add_cfs(struct pci_epf *epf,
+					  struct config_group *group);
 #endif /* __LINUX_PCI_EPF_H */
-- 
cgit v1.2.3


From 38ad827e3bc0f0e94628ee1d8dc31e778d9be40f Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:28:02 +0530
Subject: PCI: endpoint: Allow user to create sub-directory of 'EPF Device'
 directory

Documentation/PCI/endpoint/pci-endpoint-cfs.rst explains how a user has to
create a directory in-order to create a 'EPF Device' that can be
configured/probed by 'EPF Driver'.

Allow user to create a sub-directory of 'EPF Device' directory for any
function specific attributes that has to be exposed to the user.

Link: https://lore.kernel.org/r/20210201195809.7342-11-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/endpoint/pci-ep-cfs.c | 23 +++++++++++++++++++++++
 include/linux/pci-epf.h           |  3 +++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c
index 8f750961d6ab..f3a8b833b479 100644
--- a/drivers/pci/endpoint/pci-ep-cfs.c
+++ b/drivers/pci/endpoint/pci-ep-cfs.c
@@ -490,7 +490,29 @@ static struct configfs_item_operations pci_epf_ops = {
 	.release		= pci_epf_release,
 };
 
+static struct config_group *pci_epf_type_make(struct config_group *group,
+					      const char *name)
+{
+	struct pci_epf_group *epf_group = to_pci_epf_group(&group->cg_item);
+	struct config_group *epf_type_group;
+
+	epf_type_group = pci_epf_type_add_cfs(epf_group->epf, group);
+	return epf_type_group;
+}
+
+static void pci_epf_type_drop(struct config_group *group,
+			      struct config_item *item)
+{
+	config_item_put(item);
+}
+
+static struct configfs_group_operations pci_epf_type_group_ops = {
+	.make_group     = &pci_epf_type_make,
+	.drop_item      = &pci_epf_type_drop,
+};
+
 static const struct config_item_type pci_epf_type = {
+	.ct_group_ops	= &pci_epf_type_group_ops,
 	.ct_item_ops	= &pci_epf_ops,
 	.ct_attrs	= pci_epf_attrs,
 	.ct_owner	= THIS_MODULE,
@@ -553,6 +575,7 @@ static struct config_group *pci_epf_make(struct config_group *group,
 		goto free_name;
 	}
 
+	epf->group = &epf_group->group;
 	epf_group->epf = epf;
 
 	kfree(epf_name);
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index b241e7dd171f..6833e2160ef1 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -9,6 +9,7 @@
 #ifndef __LINUX_PCI_EPF_H
 #define __LINUX_PCI_EPF_H
 
+#include <linux/configfs.h>
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/pci.h>
@@ -128,6 +129,7 @@ struct pci_epf_bar {
  *   EPC device
  * @sec_epc_bar: represents the BAR of EPF device associated with secondary EPC
  * @sec_epc_func_no: unique (physical) function number within the secondary EPC
+ * @group: configfs group associated with the EPF device
  */
 struct pci_epf {
 	struct device		dev;
@@ -150,6 +152,7 @@ struct pci_epf {
 	struct list_head	sec_epc_list;
 	struct pci_epf_bar	sec_epc_bar[6];
 	u8			sec_epc_func_no;
+	struct config_group	*group;
 };
 
 /**
-- 
cgit v1.2.3


From 599f86872f9ce8a0a0bd111a23442b18e8ee7059 Mon Sep 17 00:00:00 2001
From: Kishon Vijay Abraham I <kishon@ti.com>
Date: Tue, 2 Feb 2021 01:28:06 +0530
Subject: PCI: Add TI J721E device to PCI IDs

Add TI J721E device to the PCI ID database. Since this device has a
configurable PCIe endpoint, it could be used with different drivers.

Link: https://lore.kernel.org/r/20210201195809.7342-15-kishon@ti.com
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/misc/pci_endpoint_test.c | 1 -
 include/linux/pci_ids.h          | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index eff481ce08ee..1b2868ca4f2a 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -68,7 +68,6 @@
 #define PCI_ENDPOINT_TEST_FLAGS			0x2c
 #define FLAG_USE_DMA				BIT(0)
 
-#define PCI_DEVICE_ID_TI_J721E			0xb00d
 #define PCI_DEVICE_ID_TI_AM654			0xb00c
 #define PCI_DEVICE_ID_LS1088A			0x80c0
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d8156a5dbee8..f968fcda338e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -881,6 +881,7 @@
 #define PCI_DEVICE_ID_TI_X620		0xac8d
 #define PCI_DEVICE_ID_TI_X420		0xac8e
 #define PCI_DEVICE_ID_TI_XX20_FM	0xac8f
+#define PCI_DEVICE_ID_TI_J721E		0xb00d
 #define PCI_DEVICE_ID_TI_DRA74x		0xb500
 #define PCI_DEVICE_ID_TI_DRA72x		0xb501
 
-- 
cgit v1.2.3


From cf6acb8bdb1d829b85a4daa2944bf9e71c93f4b9 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Mon, 22 Feb 2021 18:01:54 +0100
Subject: s390/cpumf: Add support for complete counter set extraction

Add support to the CPU Measurement counter facility device driver
to extract complete counter sets per CPU and per counter set from user
space. This includes a new device named /dev/hwctr and support
for the device driver functions open, close and ioctl. Other
functions are not supported.

The ioctl command supports 3 subcommands:
S390_HWCTR_START: enables counter sets on a list of CPUs.
S390_HWCTR_STOP: disables counter sets on a list of CPUs.
S390_HWCTR_READ: reads counter sets on a list of CPUs.

The ioctl(..., S390_HWCTR_READ, ...) is the only subcommand which
returns data.  It requires member data_bytes to be positive and
indicates the maximum amount of data available to store counter set
data. The other ioctl() subcommands do not use this member and it
should be set to zero.
The S390_HWCTR_READ subcommand returns the following data:

The cpuset data is flattened using the following scheme, stored in member
data:

 0x0       0x8   0xc       0x10  0x10      0x18  0x20  0x28         0xU-1
 +---------+-----+---------+-----+---------+-----+-----+------+------+
 | no_cpus | cpu | no_sets | set | no_cnts | cv1 | cv2 | .... | cv_n |
 +---------+-----+---------+-----+---------+-----+-----+------+------+

                           0xU   0xU+4     0xU+8 0xU+10             0xV-1
                           +-----+---------+-----+-----+------+------+
                           | set | no_cnts | cv1 | cv2 | .... | cv_n |
                           +-----+---------+-----+-----+------+------+

           0xV   0xV+4     0xV+8 0xV+c
           +-----+---------+-----+---------+-----+-----+------+------+
           | cpu | no_sets | set | no_cnts | cv1 | cv2 | .... | cv_n |
           +-----+---------+-----+---------+-----+-----+------+------+

U and V denote arbitrary hexadezimal addresses.
The first integer represents the number of CPUs data was extracted
from. This is followed by CPU number and number of counter sets extracted.
Both are two integer values. This is followed by the set identifer
and number of counters extracted. Both are two integer values. This is
followed by the counter values, each element is eight bytes in size.

The S390_HWCTR_READ ioctl subcommand is also limited to one call per
minute. This ensures that an application does not read out the
counter sets too often and reduces the overall CPU performance.
The complete counter set extraction is an expensive operation.

Reviewed-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/uapi/asm/perf_cpum_cf_diag.h |  51 +++
 arch/s390/kernel/perf_cpum_cf_diag.c           | 548 +++++++++++++++++++++++--
 include/linux/cpuhotplug.h                     |   1 +
 3 files changed, 577 insertions(+), 23 deletions(-)
 create mode 100644 arch/s390/include/uapi/asm/perf_cpum_cf_diag.h

(limited to 'include/linux')

diff --git a/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h b/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h
new file mode 100644
index 000000000000..3d8284b95f87
--- /dev/null
+++ b/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright IBM Corp. 2021
+ * Interface implementation for communication with the CPU Measurement
+ * counter facility device driver.
+ *
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ *
+ * Define for ioctl() commands to communicate with the CPU Measurement
+ * counter facility device driver.
+ */
+
+#ifndef _PERF_CPUM_CF_DIAG_H
+#define _PERF_CPUM_CF_DIAG_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define S390_HWCTR_DEVICE		"hwctr"
+#define S390_HWCTR_START_VERSION	1
+
+struct s390_ctrset_start {		/* Set CPUs to operate on */
+	__u64 version;			/* Version of interface */
+	__u64 data_bytes;		/* # of bytes required */
+	__u64 cpumask_len;		/* Length of CPU mask in bytes */
+	__u64 *cpumask;			/* Pointer to CPU mask */
+	__u64 counter_sets;		/* Bit mask of counter sets to get */
+};
+
+struct s390_ctrset_setdata {		/* Counter set data */
+	__u32 set;			/* Counter set number */
+	__u32 no_cnts;			/* # of counters stored in cv[] */
+	__u64 cv[0];			/* Counter values (variable length) */
+};
+
+struct s390_ctrset_cpudata {		/* Counter set data per CPU */
+	__u32 cpu_nr;			/* CPU number */
+	__u32 no_sets;			/* # of counters sets in data[] */
+	struct s390_ctrset_setdata data[0];
+};
+
+struct s390_ctrset_read {		/* Structure to get all ctr sets */
+	__u64 no_cpus;			/* Total # of CPUs data taken from */
+	struct s390_ctrset_cpudata data[0];
+};
+
+#define S390_HWCTR_MAGIC	'C'	/* Random magic # for ioctls */
+#define	S390_HWCTR_START	_IOWR(S390_HWCTR_MAGIC, 1, struct s390_ctrset_start)
+#define	S390_HWCTR_STOP		_IO(S390_HWCTR_MAGIC, 2)
+#define	S390_HWCTR_READ		_IOWR(S390_HWCTR_MAGIC, 3, struct s390_ctrset_read)
+#endif
diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c
index b5c86fb70d63..db4877bbb9aa 100644
--- a/arch/s390/kernel/perf_cpum_cf_diag.c
+++ b/arch/s390/kernel/perf_cpum_cf_diag.c
@@ -2,7 +2,7 @@
 /*
  * Performance event support for s390x - CPU-measurement Counter Sets
  *
- *  Copyright IBM Corp. 2019
+ *  Copyright IBM Corp. 2019, 2021
  *  Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
  *	       Thomas Richer <tmricht@linux.ibm.com>
  */
@@ -17,6 +17,8 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/processor.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
 
 #include <asm/ctl_reg.h>
 #include <asm/irq.h>
@@ -24,15 +26,20 @@
 #include <asm/timex.h>
 #include <asm/debug.h>
 
-#define	CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
+#include <asm/perf_cpum_cf_diag.h>
 
+#define	CF_DIAG_CTRSET_DEF		0xfeef	/* Counter set header mark */
+#define CF_DIAG_MIN_INTERVAL		60	/* Minimum counter set read */
+						/* interval in seconds */
+static unsigned long cf_diag_interval = CF_DIAG_MIN_INTERVAL;
 static unsigned int cf_diag_cpu_speed;
 static debug_info_t *cf_diag_dbg;
 
-struct cf_diag_csd {		/* Counter set data per CPU */
+struct cf_diag_csd {			/* Counter set data per CPU */
 	size_t used;			/* Bytes used in data/start */
 	unsigned char start[PAGE_SIZE];	/* Counter set at event start */
 	unsigned char data[PAGE_SIZE];	/* Counter set at event delete */
+	unsigned int sets;		/* # Counter set saved in data */
 };
 static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd);
 
@@ -178,18 +185,35 @@ static void cf_diag_disable(struct pmu *pmu)
 
 /* Number of perf events counting hardware events */
 static atomic_t cf_diag_events = ATOMIC_INIT(0);
+/* Used to avoid races in calling reserve/release_cpumf_hardware */
+static DEFINE_MUTEX(cf_diag_reserve_mutex);
 
 /* Release the PMU if event is the last perf event */
 static void cf_diag_perf_event_destroy(struct perf_event *event)
 {
 	debug_sprintf_event(cf_diag_dbg, 5,
 			    "%s event %p cpu %d cf_diag_events %d\n",
-			    __func__, event, event->cpu,
+			    __func__, event, smp_processor_id(),
 			    atomic_read(&cf_diag_events));
 	if (atomic_dec_return(&cf_diag_events) == 0)
 		__kernel_cpumcf_end();
 }
 
+static int get_authctrsets(void)
+{
+	struct cpu_cf_events *cpuhw;
+	unsigned long auth = 0;
+	enum cpumf_ctr_set i;
+
+	cpuhw = &get_cpu_var(cpu_cf_events);
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
+			auth |= cpumf_ctr_ctl[i];
+	}
+	put_cpu_var(cpu_cf_events);
+	return auth;
+}
+
 /* Setup the event. Test for authorized counter sets and only include counter
  * sets which are authorized at the time of the setup. Including unauthorized
  * counter sets result in specification exception (and panic).
@@ -197,15 +221,12 @@ static void cf_diag_perf_event_destroy(struct perf_event *event)
 static int __hw_perf_event_init(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
-	struct cpu_cf_events *cpuhw;
-	enum cpumf_ctr_set i;
 	int err = 0;
 
 	debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__,
 			    event, event->cpu);
 
 	event->hw.config = attr->config;
-	event->hw.config_base = 0;
 
 	/* Add all authorized counter sets to config_base. The
 	 * the hardware init function is either called per-cpu or just once
@@ -215,11 +236,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	 * Checking the authorization on any CPU is fine as the hardware
 	 * applies the same authorization settings to all CPUs.
 	 */
-	cpuhw = &get_cpu_var(cpu_cf_events);
-	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
-		if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
-			event->hw.config_base |= cpumf_ctr_ctl[i];
-	put_cpu_var(cpu_cf_events);
+	event->hw.config_base = get_authctrsets();
 
 	/* No authorized counter sets, nothing to count/sample */
 	if (!event->hw.config_base) {
@@ -237,6 +254,25 @@ out:
 	return err;
 }
 
+/* Return 0 if the CPU-measurement counter facility is currently free
+ * and an error otherwise.
+ */
+static int cf_diag_perf_event_inuse(void)
+{
+	int err = 0;
+
+	if (!atomic_inc_not_zero(&cf_diag_events)) {
+		mutex_lock(&cf_diag_reserve_mutex);
+		if (atomic_read(&cf_diag_events) == 0 &&
+		    __kernel_cpumcf_begin())
+			err = -EBUSY;
+		else
+			err = atomic_inc_return(&cf_diag_events);
+		mutex_unlock(&cf_diag_reserve_mutex);
+	}
+	return err;
+}
+
 static int cf_diag_event_init(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
@@ -264,13 +300,9 @@ static int cf_diag_event_init(struct perf_event *event)
 	}
 
 	/* Initialize for using the CPU-measurement counter facility */
-	if (atomic_inc_return(&cf_diag_events) == 1) {
-		if (__kernel_cpumcf_begin()) {
-			atomic_dec(&cf_diag_events);
-			err = -EBUSY;
-			goto out;
-		}
-	}
+	err = cf_diag_perf_event_inuse();
+	if (err < 0)
+		goto out;
 	event->destroy = cf_diag_perf_event_destroy;
 
 	err = __hw_perf_event_init(event);
@@ -599,6 +631,8 @@ static void cf_diag_del(struct perf_event *event, int flags)
 	cpuhw->flags &= ~PMU_F_IN_USE;
 }
 
+/* Default counter set events and format attribute groups */
+
 CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
 
 static struct attribute *cf_diag_events_attr[] = {
@@ -663,6 +697,452 @@ static void cf_diag_get_cpu_speed(void)
 	}
 }
 
+/* Code to create device and file I/O operations */
+static atomic_t ctrset_opencnt = ATOMIC_INIT(0);	/* Excl. access */
+
+static int cf_diag_open(struct inode *inode, struct file *file)
+{
+	int err = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (atomic_xchg(&ctrset_opencnt, 1))
+		return -EBUSY;
+
+	/* Avoid concurrent access with perf_event_open() system call */
+	mutex_lock(&cf_diag_reserve_mutex);
+	if (atomic_read(&cf_diag_events) || __kernel_cpumcf_begin())
+		err = -EBUSY;
+	mutex_unlock(&cf_diag_reserve_mutex);
+	if (err) {
+		atomic_set(&ctrset_opencnt, 0);
+		return err;
+	}
+	file->private_data = NULL;
+	debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
+	/* nonseekable_open() never fails */
+	return nonseekable_open(inode, file);
+}
+
+/* Variables for ioctl() interface support */
+static DEFINE_MUTEX(cf_diag_ctrset_mutex);
+static struct cf_diag_ctrset {
+	unsigned long ctrset;		/* Bit mask of counter set to read */
+	cpumask_t mask;			/* CPU mask to read from */
+	time64_t lastread;		/* Epoch counter set last read */
+} cf_diag_ctrset;
+
+static void cf_diag_ctrset_clear(void)
+{
+	cpumask_clear(&cf_diag_ctrset.mask);
+	cf_diag_ctrset.ctrset = 0;
+}
+
+static void cf_diag_release_cpu(void *p)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+
+	debug_sprintf_event(cf_diag_dbg, 3, "%s cpu %d\n", __func__,
+			    smp_processor_id());
+	lcctl(0);		/* Reset counter sets */
+	cpuhw->state = 0;	/* Save state in CPU hardware state */
+}
+
+/* Release function is also called when application gets terminated without
+ * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
+ * Since only one application is allowed to open the device, simple stop all
+ * CPU counter sets.
+ */
+static int cf_diag_release(struct inode *inode, struct file *file)
+{
+	on_each_cpu(cf_diag_release_cpu, NULL, 1);
+	cf_diag_ctrset_clear();
+	atomic_set(&ctrset_opencnt, 0);
+	__kernel_cpumcf_end();
+	debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__);
+	return 0;
+}
+
+struct cf_diag_call_on_cpu_parm {	/* Parm struct for smp_call_on_cpu */
+	unsigned int sets;		/* Counter set bit mask */
+	atomic_t cpus_ack;		/* # CPUs successfully executed func */
+};
+
+static int cf_diag_all_copy(unsigned long arg, cpumask_t *mask)
+{
+	struct s390_ctrset_read __user *ctrset_read;
+	unsigned int cpu, cpus, rc;
+	void __user *uptr;
+
+	ctrset_read = (struct s390_ctrset_read __user *)arg;
+	uptr = ctrset_read->data;
+	for_each_cpu(cpu, mask) {
+		struct cf_diag_csd *csd = per_cpu_ptr(&cf_diag_csd, cpu);
+		struct s390_ctrset_cpudata __user *ctrset_cpudata;
+
+		ctrset_cpudata = uptr;
+		debug_sprintf_event(cf_diag_dbg, 5, "%s cpu %d used %zd\n",
+				    __func__, cpu, csd->used);
+		rc  = put_user(cpu, &ctrset_cpudata->cpu_nr);
+		rc |= put_user(csd->sets, &ctrset_cpudata->no_sets);
+		rc |= copy_to_user(ctrset_cpudata->data, csd->data, csd->used);
+		if (rc)
+			return -EFAULT;
+		uptr += sizeof(struct s390_ctrset_cpudata) + csd->used;
+		cond_resched();
+	}
+	cpus = cpumask_weight(mask);
+	if (put_user(cpus, &ctrset_read->no_cpus))
+		return -EFAULT;
+	debug_sprintf_event(cf_diag_dbg, 5, "%s copied %ld\n",
+			    __func__, uptr - (void __user *)ctrset_read->data);
+	return 0;
+}
+
+static size_t cf_diag_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
+				  int ctrset_size, size_t room)
+{
+	size_t need = 0;
+	int rc = -1;
+
+	need = sizeof(*p) + sizeof(u64) * ctrset_size;
+	debug_sprintf_event(cf_diag_dbg, 5,
+			    "%s room %zd need %zd set %#x set_size %d\n",
+			    __func__, room, need, ctrset, ctrset_size);
+	if (need <= room) {
+		p->set = cpumf_ctr_ctl[ctrset];
+		p->no_cnts = ctrset_size;
+		rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
+		if (rc == 3)		/* Nothing stored */
+			need = 0;
+	}
+	debug_sprintf_event(cf_diag_dbg, 5, "%s need %zd rc %d\n", __func__,
+			    need, rc);
+	return need;
+}
+
+/* Read all counter sets. Since the perf_event_open() system call with
+ * event cpum_cf_diag/.../ is blocked when this interface is active, reuse
+ * the perf_event_open() data buffer to store the counter sets.
+ */
+static void cf_diag_cpu_read(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
+	struct cf_diag_call_on_cpu_parm *p = parm;
+	int set, set_size;
+	size_t space;
+
+	debug_sprintf_event(cf_diag_dbg, 5,
+			    "%s new %#x flags %#x state %#llx\n",
+			    __func__, p->sets, cpuhw->flags,
+			    cpuhw->state);
+	/* No data saved yet */
+	csd->used = 0;
+	csd->sets = 0;
+	memset(csd->data, 0, sizeof(csd->data));
+
+	/* Scan the counter sets */
+	for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
+		struct s390_ctrset_setdata *sp = (void *)csd->data + csd->used;
+
+		if (!(p->sets & cpumf_ctr_ctl[set]))
+			continue;	/* Counter set not in list */
+		set_size = cf_diag_ctrset_size(set, &cpuhw->info);
+		space = sizeof(csd->data) - csd->used;
+		space = cf_diag_cpuset_read(sp, set, set_size, space);
+		if (space) {
+			csd->used += space;
+			csd->sets += 1;
+		}
+		debug_sprintf_event(cf_diag_dbg, 5, "%s sp %px space %zd\n",
+				    __func__, sp, space);
+	}
+	debug_sprintf_event(cf_diag_dbg, 5, "%s sets %d used %zd\n", __func__,
+			    csd->sets, csd->used);
+}
+
+static int cf_diag_all_read(unsigned long arg)
+{
+	struct cf_diag_call_on_cpu_parm p;
+	cpumask_var_t mask;
+	time64_t now;
+	int rc = 0;
+
+	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	now = ktime_get_seconds();
+	if (cf_diag_ctrset.lastread + cf_diag_interval > now) {
+		debug_sprintf_event(cf_diag_dbg, 5, "%s now %lld "
+				    " lastread %lld\n", __func__, now,
+				    cf_diag_ctrset.lastread);
+		rc = -EAGAIN;
+		goto out;
+	} else {
+		cf_diag_ctrset.lastread = now;
+	}
+	p.sets = cf_diag_ctrset.ctrset;
+	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1);
+	rc = cf_diag_all_copy(arg, mask);
+out:
+	free_cpumask_var(mask);
+	debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc);
+	return rc;
+}
+
+/* Stop all counter sets via ioctl interface */
+static void cf_diag_ioctl_off(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cf_diag_call_on_cpu_parm *p = parm;
+	int rc;
+
+	debug_sprintf_event(cf_diag_dbg, 5,
+			    "%s new %#x flags %#x state %#llx\n",
+			    __func__, p->sets, cpuhw->flags,
+			    cpuhw->state);
+
+	ctr_set_multiple_disable(&cpuhw->state, p->sets);
+	ctr_set_multiple_stop(&cpuhw->state, p->sets);
+	rc = lcctl(cpuhw->state);		/* Stop counter sets */
+	if (!cpuhw->state)
+		cpuhw->flags &= ~PMU_F_IN_USE;
+	debug_sprintf_event(cf_diag_dbg, 5,
+			    "%s rc %d flags %#x state %#llx\n", __func__,
+			     rc, cpuhw->flags, cpuhw->state);
+}
+
+/* Start counter sets on particular CPU */
+static void cf_diag_ioctl_on(void *parm)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	struct cf_diag_call_on_cpu_parm *p = parm;
+	int rc;
+
+	debug_sprintf_event(cf_diag_dbg, 5,
+			    "%s new %#x flags %#x state %#llx\n",
+			    __func__, p->sets, cpuhw->flags,
+			    cpuhw->state);
+
+	if (!(cpuhw->flags & PMU_F_IN_USE))
+		cpuhw->state = 0;
+	cpuhw->flags |= PMU_F_IN_USE;
+	rc = lcctl(cpuhw->state);		/* Reset unused counter sets */
+	ctr_set_multiple_enable(&cpuhw->state, p->sets);
+	ctr_set_multiple_start(&cpuhw->state, p->sets);
+	rc |= lcctl(cpuhw->state);		/* Start counter sets */
+	if (!rc)
+		atomic_inc(&p->cpus_ack);
+	debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d state %#llx\n",
+			    __func__, rc, cpuhw->state);
+}
+
+static int cf_diag_all_stop(void)
+{
+	struct cf_diag_call_on_cpu_parm p = {
+		.sets = cf_diag_ctrset.ctrset,
+	};
+	cpumask_var_t mask;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
+	free_cpumask_var(mask);
+	return 0;
+}
+
+static int cf_diag_all_start(void)
+{
+	struct cf_diag_call_on_cpu_parm p = {
+		.sets = cf_diag_ctrset.ctrset,
+		.cpus_ack = ATOMIC_INIT(0),
+	};
+	cpumask_var_t mask;
+	int rc = 0;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask);
+	on_each_cpu_mask(mask, cf_diag_ioctl_on, &p, 1);
+	if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
+		on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1);
+		rc = -EIO;
+	}
+	free_cpumask_var(mask);
+	return rc;
+}
+
+/* Return the maximum required space for all possible CPUs in case one
+ * CPU will be onlined during the START, READ, STOP cycles.
+ * To find out the size of the counter sets, any one CPU will do. They
+ * all have the same counter sets.
+ */
+static size_t cf_diag_needspace(unsigned int sets)
+{
+	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+	size_t bytes = 0;
+	int i;
+
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+		if (!(sets & cpumf_ctr_ctl[i]))
+			continue;
+		bytes += cf_diag_ctrset_size(i, &cpuhw->info) * sizeof(u64) +
+			 sizeof(((struct s390_ctrset_setdata *)0)->set) +
+			 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
+	}
+	bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
+		(bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
+		     sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
+	debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__,
+			    bytes);
+	return bytes;
+}
+
+static long cf_diag_ioctl_read(unsigned long arg)
+{
+	struct s390_ctrset_read read;
+	int ret = 0;
+
+	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
+	if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
+		return -EFAULT;
+	ret = cf_diag_all_read(arg);
+	debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+static long cf_diag_ioctl_stop(void)
+{
+	int ret;
+
+	debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__);
+	ret = cf_diag_all_stop();
+	cf_diag_ctrset_clear();
+	debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+static long cf_diag_ioctl_start(unsigned long arg)
+{
+	struct s390_ctrset_start __user *ustart;
+	struct s390_ctrset_start start;
+	void __user *umask;
+	unsigned int len;
+	int ret = 0;
+	size_t need;
+
+	if (cf_diag_ctrset.ctrset)
+		return -EBUSY;
+	ustart = (struct s390_ctrset_start __user *)arg;
+	if (copy_from_user(&start, ustart, sizeof(start)))
+		return -EFAULT;
+	if (start.version != S390_HWCTR_START_VERSION)
+		return -EINVAL;
+	if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
+				   cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
+		return -EINVAL;		/* Invalid counter set */
+	if (!start.counter_sets)
+		return -EINVAL;		/* No counter set at all? */
+	cpumask_clear(&cf_diag_ctrset.mask);
+	len = min_t(u64, start.cpumask_len, cpumask_size());
+	umask = (void __user *)start.cpumask;
+	if (copy_from_user(&cf_diag_ctrset.mask, umask, len))
+		return -EFAULT;
+	if (cpumask_empty(&cf_diag_ctrset.mask))
+		return -EINVAL;
+	need = cf_diag_needspace(start.counter_sets);
+	if (put_user(need, &ustart->data_bytes))
+		ret = -EFAULT;
+	if (ret)
+		goto out;
+	cf_diag_ctrset.ctrset = start.counter_sets;
+	ret = cf_diag_all_start();
+out:
+	if (ret)
+		cf_diag_ctrset_clear();
+	debug_sprintf_event(cf_diag_dbg, 2, "%s sets %#lx need %ld ret %d\n",
+			    __func__, cf_diag_ctrset.ctrset, need, ret);
+	return ret;
+}
+
+static long cf_diag_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	int ret;
+
+	debug_sprintf_event(cf_diag_dbg, 2, "%s cmd %#x arg %lx\n", __func__,
+			    cmd, arg);
+	get_online_cpus();
+	mutex_lock(&cf_diag_ctrset_mutex);
+	switch (cmd) {
+	case S390_HWCTR_START:
+		ret = cf_diag_ioctl_start(arg);
+		break;
+	case S390_HWCTR_STOP:
+		ret = cf_diag_ioctl_stop();
+		break;
+	case S390_HWCTR_READ:
+		ret = cf_diag_ioctl_read(arg);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+	mutex_unlock(&cf_diag_ctrset_mutex);
+	put_online_cpus();
+	debug_sprintf_event(cf_diag_dbg, 2, "%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+static const struct file_operations cf_diag_fops = {
+	.owner = THIS_MODULE,
+	.open = cf_diag_open,
+	.release = cf_diag_release,
+	.unlocked_ioctl	= cf_diag_ioctl,
+	.compat_ioctl = cf_diag_ioctl,
+	.llseek = no_llseek
+};
+
+static struct miscdevice cf_diag_dev = {
+	.name	= S390_HWCTR_DEVICE,
+	.minor	= MISC_DYNAMIC_MINOR,
+	.fops	= &cf_diag_fops,
+};
+
+static int cf_diag_online_cpu(unsigned int cpu)
+{
+	struct cf_diag_call_on_cpu_parm p;
+
+	mutex_lock(&cf_diag_ctrset_mutex);
+	if (!cf_diag_ctrset.ctrset)
+		goto out;
+	p.sets = cf_diag_ctrset.ctrset;
+	cf_diag_ioctl_on(&p);
+out:
+	mutex_unlock(&cf_diag_ctrset_mutex);
+	return 0;
+}
+
+static int cf_diag_offline_cpu(unsigned int cpu)
+{
+	struct cf_diag_call_on_cpu_parm p;
+
+	mutex_lock(&cf_diag_ctrset_mutex);
+	if (!cf_diag_ctrset.ctrset)
+		goto out;
+	p.sets = cf_diag_ctrset.ctrset;
+	cf_diag_ioctl_off(&p);
+out:
+	mutex_unlock(&cf_diag_ctrset_mutex);
+	return 0;
+}
+
 /* Initialize the counter set PMU to generate complete counter set data as
  * event raw data. This relies on the CPU Measurement Counter Facility device
  * already being loaded and initialized.
@@ -685,21 +1165,43 @@ static int __init cf_diag_init(void)
 		return -ENOMEM;
 	}
 
+	rc = misc_register(&cf_diag_dev);
+	if (rc) {
+		pr_err("Registration of /dev/" S390_HWCTR_DEVICE
+		       "failed rc=%d\n", rc);
+		goto out;
+	}
+
 	/* Setup s390dbf facility */
 	cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
 	if (!cf_diag_dbg) {
 		pr_err("Registration of s390dbf(cpum_cf_diag) failed\n");
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto out_dbf;
 	}
 	debug_register_view(cf_diag_dbg, &debug_sprintf_view);
 
 	rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
 	if (rc) {
-		debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
-		debug_unregister(cf_diag_dbg);
 		pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
 		       rc);
+		goto out_perf;
 	}
+	rc = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_S390_CFD_ONLINE,
+				       "perf/s390/cfd:online",
+				       cf_diag_online_cpu, cf_diag_offline_cpu);
+	if (!rc)
+		goto out;
+
+	pr_err("Registration of CPUHP_AP_PERF_S390_CFD_ONLINE failed rc=%i\n",
+	       rc);
+	perf_pmu_unregister(&cf_diag);
+out_perf:
+	debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
+	debug_unregister(cf_diag_dbg);
+out_dbf:
+	misc_deregister(&cf_diag_dev);
+out:
 	return rc;
 }
-arch_initcall(cf_diag_init);
+device_initcall(cf_diag_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index ee09a39627d6..9129ba231423 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -168,6 +168,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_CQM_ONLINE,
 	CPUHP_AP_PERF_X86_CSTATE_ONLINE,
 	CPUHP_AP_PERF_S390_CF_ONLINE,
+	CPUHP_AP_PERF_S390_CFD_ONLINE,
 	CPUHP_AP_PERF_S390_SF_ONLINE,
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
 	CPUHP_AP_PERF_ARM_CCN_ONLINE,
-- 
cgit v1.2.3


From e54937963fa249595824439dc839c948188dea83 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 17 Feb 2021 10:14:21 -0700
Subject: net: remove cmsg restriction from io_uring based send/recvmsg calls

No need to restrict these anymore, as the worker threads are direct
clones of the original task. Hence we know for a fact that we can
support anything that the regular task can.

Since the only user of proto_ops->flags was to flag PROTO_CMSG_DATA_ONLY,
kill the member and the flag definition too.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/net.h |  3 ---
 net/ipv4/af_inet.c  |  1 -
 net/ipv6/af_inet6.c |  1 -
 net/socket.c        | 10 ----------
 4 files changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 9e2324efc26a..ba736b457a06 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -42,8 +42,6 @@ struct net;
 #define SOCK_PASSCRED		3
 #define SOCK_PASSSEC		4
 
-#define PROTO_CMSG_DATA_ONLY	0x0001
-
 #ifndef ARCH_HAS_SOCKET_TYPES
 /**
  * enum sock_type - Socket types
@@ -138,7 +136,6 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
 
 struct proto_ops {
 	int		family;
-	unsigned int	flags;
 	struct module	*owner;
 	int		(*release)   (struct socket *sock);
 	int		(*bind)	     (struct socket *sock,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a02ce89b56b5..1355e6c0d567 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1021,7 +1021,6 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
 
 const struct proto_ops inet_stream_ops = {
 	.family		   = PF_INET,
-	.flags		   = PROTO_CMSG_DATA_ONLY,
 	.owner		   = THIS_MODULE,
 	.release	   = inet_release,
 	.bind		   = inet_bind,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 1fb75f01756c..802f5111805a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -665,7 +665,6 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 
 const struct proto_ops inet6_stream_ops = {
 	.family		   = PF_INET6,
-	.flags		   = PROTO_CMSG_DATA_ONLY,
 	.owner		   = THIS_MODULE,
 	.release	   = inet6_release,
 	.bind		   = inet6_bind,
diff --git a/net/socket.c b/net/socket.c
index 7f0617ab5437..90a60899aae5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2411,10 +2411,6 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
 			unsigned int flags)
 {
-	/* disallow ancillary data requests from this path */
-	if (msg->msg_control || msg->msg_controllen)
-		return -EINVAL;
-
 	return ____sys_sendmsg(sock, msg, flags, NULL, 0);
 }
 
@@ -2623,12 +2619,6 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
 			struct user_msghdr __user *umsg,
 			struct sockaddr __user *uaddr, unsigned int flags)
 {
-	if (msg->msg_control || msg->msg_controllen) {
-		/* disallow ancillary data reqs unless cmsg is plain data */
-		if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY))
-			return -EINVAL;
-	}
-
 	return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
 }
 
-- 
cgit v1.2.3


From 8a378fb096a7f02943c72a428bbfd0029260efb6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 23 Feb 2021 12:27:49 -0700
Subject: io_uring: ensure io-wq context is always destroyed for tasks

If the task ends up doing no IO, the context list is empty and we don't
call into __io_uring_files_cancel() when the task exits. This can cause
a leak of the io-wq structures.

Ensure we always call __io_uring_files_cancel(), even if the task
context list is empty.

Fixes: 5aa75ed5b93f ("io_uring: tie async worker side to the task context")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c            | 7 ++++---
 include/linux/io_uring.h | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e62ad6bde569..0a435a6f265a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8800,9 +8800,10 @@ void __io_uring_files_cancel(struct files_struct *files)
 
 	if (files) {
 		io_uring_remove_task_files(tctx);
-	} else if (tctx->io_wq && current->flags & PF_EXITING) {
-		io_wq_destroy(tctx->io_wq);
-		tctx->io_wq = NULL;
+		if (tctx->io_wq) {
+			io_wq_destroy(tctx->io_wq);
+			tctx->io_wq = NULL;
+		}
 	}
 }
 
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index c48fcbdc2ea8..51ede771cd99 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -43,7 +43,7 @@ static inline void io_uring_task_cancel(void)
 }
 static inline void io_uring_files_cancel(struct files_struct *files)
 {
-	if (current->io_uring && !xa_empty(&current->io_uring->xa))
+	if (current->io_uring)
 		__io_uring_files_cancel(files);
 }
 static inline void io_uring_free(struct task_struct *tsk)
-- 
cgit v1.2.3


From 6120484ef2bd4ffea7d2f11d2f06167b8f848349 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Thu, 11 Feb 2021 21:17:01 +0100
Subject: ACPI: platform: Fix file references in comment

The referenced files are named slightly different. Replace '-' with '_'
and drop the .rst ending.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/platform_profile.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
index a26542d53058..b4794027e759 100644
--- a/include/linux/platform_profile.h
+++ b/include/linux/platform_profile.h
@@ -12,9 +12,8 @@
 #include <linux/bitops.h>
 
 /*
- * If more options are added please update profile_names
- * array in platform-profile.c and sysfs-platform-profile.rst
- * documentation.
+ * If more options are added please update profile_names array in
+ * platform_profile.c and sysfs-platform_profile documentation.
  */
 
 enum platform_profile_option {
-- 
cgit v1.2.3


From 6c0b5e3fc6b536b125a66dfee103f3bc26d386f6 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Thu, 11 Feb 2021 21:17:02 +0100
Subject: ACPI: platform: Add balanced-performance platform profile

Some devices, including most Microsoft Surface devices, have a platform
profile somewhere inbetween balanced and performance. More specifically,
adding this profile allows the following mapping on Surface devices:

  Vendor Name           Platform Profile
  ------------------------------------------
  Battery Saver         low-power
  Recommended           balanced
  Better Performance    balanced-performance
  Best Performance      performance

Suggested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/ABI/testing/sysfs-platform_profile | 18 +++++++++++-------
 drivers/acpi/platform_profile.c                  |  1 +
 include/linux/platform_profile.h                 |  1 +
 3 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-platform_profile b/Documentation/ABI/testing/sysfs-platform_profile
index 9d6b89b66cca..dae9c8941905 100644
--- a/Documentation/ABI/testing/sysfs-platform_profile
+++ b/Documentation/ABI/testing/sysfs-platform_profile
@@ -5,13 +5,17 @@ Description:	This file contains a space-separated list of profiles supported for
 
 		Drivers must use the following standard profile-names:
 
-		============	============================================
-		low-power	Low power consumption
-		cool		Cooler operation
-		quiet		Quieter operation
-		balanced	Balance between low power consumption and performance
-		performance	High performance operation
-		============	============================================
+		====================	========================================
+		low-power		Low power consumption
+		cool			Cooler operation
+		quiet			Quieter operation
+		balanced		Balance between low power consumption
+					and performance
+		balanced-performance	Balance between performance and low
+					power consumption with a slight bias
+					towards performance
+		performance		High performance operation
+		====================	========================================
 
 		Userspace may expect drivers to offer more than one of these
 		standard profile names.
diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index 4a59c5993bde..dd2fbf38e414 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -17,6 +17,7 @@ static const char * const profile_names[] = {
 	[PLATFORM_PROFILE_COOL] = "cool",
 	[PLATFORM_PROFILE_QUIET] = "quiet",
 	[PLATFORM_PROFILE_BALANCED] = "balanced",
+	[PLATFORM_PROFILE_BALANCED_PERFORMANCE] = "balanced-performance",
 	[PLATFORM_PROFILE_PERFORMANCE] = "performance",
 };
 static_assert(ARRAY_SIZE(profile_names) == PLATFORM_PROFILE_LAST);
diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
index b4794027e759..a6329003aee7 100644
--- a/include/linux/platform_profile.h
+++ b/include/linux/platform_profile.h
@@ -21,6 +21,7 @@ enum platform_profile_option {
 	PLATFORM_PROFILE_COOL,
 	PLATFORM_PROFILE_QUIET,
 	PLATFORM_PROFILE_BALANCED,
+	PLATFORM_PROFILE_BALANCED_PERFORMANCE,
 	PLATFORM_PROFILE_PERFORMANCE,
 	PLATFORM_PROFILE_LAST, /*must always be last */
 };
-- 
cgit v1.2.3


From abf4451b340b09f797c87341b3010f95af9215c0 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 19 Jan 2021 20:45:08 +0000
Subject: dma-buf: heaps: Rework heap allocation hooks to return struct dma_buf
 instead of fd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every heap needs to create a dmabuf and then export it to a fd
via dma_buf_fd(), so to consolidate things a bit, have the heaps
just return a struct dmabuf * and let the top level
dma_heap_buffer_alloc() call handle creating the fd via
dma_buf_fd().

Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Liam Mark <lmark@codeaurora.org>
Cc: Laura Abbott <labbott@kernel.org>
Cc: Brian Starkey <Brian.Starkey@arm.com>
Cc: Hridya Valsaraju <hridya@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sandeep Patil <sspatil@google.com>
Cc: Daniel Mentz <danielmentz@google.com>
Cc: Chris Goldsworthy <cgoldswo@codeaurora.org>
Cc: Ørjan Eide <orjan.eide@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ezequiel Garcia <ezequiel@collabora.com>
Cc: Simon Ser <contact@emersion.fr>
Cc: James Jones <jajones@nvidia.com>
Cc: linux-media@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
 [sumits: minor reword of commit message]

Link: https://patchwork.freedesktop.org/patch/msgid/20210119204508.9256-3-john.stultz@linaro.org
(cherry picked from commit c7f59e3dd60313071a989227dcb69094f499d310)
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
---
 drivers/dma-buf/dma-heap.c          | 14 +++++++++++++-
 drivers/dma-buf/heaps/cma_heap.c    | 22 +++++++---------------
 drivers/dma-buf/heaps/system_heap.c | 21 +++++++--------------
 include/linux/dma-heap.h            | 12 ++++++------
 4 files changed, 33 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
index afd22c9dbdcf..6b5db954569f 100644
--- a/drivers/dma-buf/dma-heap.c
+++ b/drivers/dma-buf/dma-heap.c
@@ -52,6 +52,9 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
 				 unsigned int fd_flags,
 				 unsigned int heap_flags)
 {
+	struct dma_buf *dmabuf;
+	int fd;
+
 	/*
 	 * Allocations from all heaps have to begin
 	 * and end on page boundaries.
@@ -60,7 +63,16 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
 	if (!len)
 		return -EINVAL;
 
-	return heap->ops->allocate(heap, len, fd_flags, heap_flags);
+	dmabuf = heap->ops->allocate(heap, len, fd_flags, heap_flags);
+	if (IS_ERR(dmabuf))
+		return PTR_ERR(dmabuf);
+
+	fd = dma_buf_fd(dmabuf, fd_flags);
+	if (fd < 0) {
+		dma_buf_put(dmabuf);
+		/* just return, as put will call release and that will free */
+	}
+	return fd;
 }
 
 static int dma_heap_open(struct inode *inode, struct file *file)
diff --git a/drivers/dma-buf/heaps/cma_heap.c b/drivers/dma-buf/heaps/cma_heap.c
index 364fc2f3e499..5d64eccd21d6 100644
--- a/drivers/dma-buf/heaps/cma_heap.c
+++ b/drivers/dma-buf/heaps/cma_heap.c
@@ -271,10 +271,10 @@ static const struct dma_buf_ops cma_heap_buf_ops = {
 	.release = cma_heap_dma_buf_release,
 };
 
-static int cma_heap_allocate(struct dma_heap *heap,
-				  unsigned long len,
-				  unsigned long fd_flags,
-				  unsigned long heap_flags)
+static struct dma_buf *cma_heap_allocate(struct dma_heap *heap,
+					 unsigned long len,
+					 unsigned long fd_flags,
+					 unsigned long heap_flags)
 {
 	struct cma_heap *cma_heap = dma_heap_get_drvdata(heap);
 	struct cma_heap_buffer *buffer;
@@ -289,7 +289,7 @@ static int cma_heap_allocate(struct dma_heap *heap,
 
 	buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
 	if (!buffer)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&buffer->attachments);
 	mutex_init(&buffer->lock);
@@ -348,15 +348,7 @@ static int cma_heap_allocate(struct dma_heap *heap,
 		ret = PTR_ERR(dmabuf);
 		goto free_pages;
 	}
-
-	ret = dma_buf_fd(dmabuf, fd_flags);
-	if (ret < 0) {
-		dma_buf_put(dmabuf);
-		/* just return, as put will call release and that will free */
-		return ret;
-	}
-
-	return ret;
+	return dmabuf;
 
 free_pages:
 	kfree(buffer->pages);
@@ -365,7 +357,7 @@ free_cma:
 free_buffer:
 	kfree(buffer);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 
 static const struct dma_heap_ops cma_heap_ops = {
diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
index 405351aad2a8..29e49ac17251 100644
--- a/drivers/dma-buf/heaps/system_heap.c
+++ b/drivers/dma-buf/heaps/system_heap.c
@@ -331,10 +331,10 @@ static struct page *alloc_largest_available(unsigned long size,
 	return NULL;
 }
 
-static int system_heap_allocate(struct dma_heap *heap,
-				unsigned long len,
-				unsigned long fd_flags,
-				unsigned long heap_flags)
+static struct dma_buf *system_heap_allocate(struct dma_heap *heap,
+					    unsigned long len,
+					    unsigned long fd_flags,
+					    unsigned long heap_flags)
 {
 	struct system_heap_buffer *buffer;
 	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
@@ -349,7 +349,7 @@ static int system_heap_allocate(struct dma_heap *heap,
 
 	buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
 	if (!buffer)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&buffer->attachments);
 	mutex_init(&buffer->lock);
@@ -399,14 +399,7 @@ static int system_heap_allocate(struct dma_heap *heap,
 		ret = PTR_ERR(dmabuf);
 		goto free_pages;
 	}
-
-	ret = dma_buf_fd(dmabuf, fd_flags);
-	if (ret < 0) {
-		dma_buf_put(dmabuf);
-		/* just return, as put will call release and that will free */
-		return ret;
-	}
-	return ret;
+	return dmabuf;
 
 free_pages:
 	for_each_sgtable_sg(table, sg, i) {
@@ -420,7 +413,7 @@ free_buffer:
 		__free_pages(page, compound_order(page));
 	kfree(buffer);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 
 static const struct dma_heap_ops system_heap_ops = {
diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
index 454e354d1ffb..5bc5c946af58 100644
--- a/include/linux/dma-heap.h
+++ b/include/linux/dma-heap.h
@@ -16,15 +16,15 @@ struct dma_heap;
 
 /**
  * struct dma_heap_ops - ops to operate on a given heap
- * @allocate:		allocate dmabuf and return fd
+ * @allocate:		allocate dmabuf and return struct dma_buf ptr
  *
- * allocate returns dmabuf fd  on success, -errno on error.
+ * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
  */
 struct dma_heap_ops {
-	int (*allocate)(struct dma_heap *heap,
-			unsigned long len,
-			unsigned long fd_flags,
-			unsigned long heap_flags);
+	struct dma_buf *(*allocate)(struct dma_heap *heap,
+				    unsigned long len,
+				    unsigned long fd_flags,
+				    unsigned long heap_flags);
 };
 
 /**
-- 
cgit v1.2.3


From f588f0c69e0e645225e4ebc1aff8f9677583a056 Mon Sep 17 00:00:00 2001
From: Veera Sundaram Sankaran <veeras@codeaurora.org>
Date: Fri, 15 Jan 2021 16:31:46 -0800
Subject: dma-fence: allow signaling drivers to set fence timestamp

Some drivers have hardware capability to get the precise HW timestamp
of certain events based on which the fences are triggered. The delta
between the event HW timestamp & current HW reference timestamp can
be used to calculate the timestamp in kernel's CLOCK_MONOTONIC time
domain. This allows it to set accurate timestamp factoring out any
software and IRQ latencies. Add a timestamp variant of fence signal
function, dma_fence_signal_timestamp to allow drivers to update the
precise timestamp for fences.

Changes in v2:
- Add a new fence signal variant instead of modifying fence struct

Changes in v3:
- Add timestamp domain information to commit-text and
dma_fence_signal_timestamp documentation

Signed-off-by: Veera Sundaram Sankaran <veeras@codeaurora.org>
Reviewed-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
 [sumits: minor parenthesis alignment]
Link: https://patchwork.freedesktop.org/patch/msgid/1610757107-11892-1-git-send-email-veeras@codeaurora.org
(cherry picked from commit 5a164ac4dbd21b82bcdc03186d40e455ff467fdc)
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
---
 drivers/dma-buf/dma-fence.c | 70 ++++++++++++++++++++++++++++++++++++++++-----
 include/linux/dma-fence.h   |  3 ++
 2 files changed, 66 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index 7475e09b0680..d64fc03929be 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -312,22 +312,25 @@ void __dma_fence_might_wait(void)
 
 
 /**
- * dma_fence_signal_locked - signal completion of a fence
+ * dma_fence_signal_timestamp_locked - signal completion of a fence
  * @fence: the fence to signal
+ * @timestamp: fence signal timestamp in kernel's CLOCK_MONOTONIC time domain
  *
  * Signal completion for software callbacks on a fence, this will unblock
  * dma_fence_wait() calls and run all the callbacks added with
  * dma_fence_add_callback(). Can be called multiple times, but since a fence
  * can only go from the unsignaled to the signaled state and not back, it will
- * only be effective the first time.
+ * only be effective the first time. Set the timestamp provided as the fence
+ * signal timestamp.
  *
- * Unlike dma_fence_signal(), this function must be called with &dma_fence.lock
- * held.
+ * Unlike dma_fence_signal_timestamp(), this function must be called with
+ * &dma_fence.lock held.
  *
  * Returns 0 on success and a negative error value when @fence has been
  * signalled already.
  */
-int dma_fence_signal_locked(struct dma_fence *fence)
+int dma_fence_signal_timestamp_locked(struct dma_fence *fence,
+				      ktime_t timestamp)
 {
 	struct dma_fence_cb *cur, *tmp;
 	struct list_head cb_list;
@@ -341,7 +344,7 @@ int dma_fence_signal_locked(struct dma_fence *fence)
 	/* Stash the cb_list before replacing it with the timestamp */
 	list_replace(&fence->cb_list, &cb_list);
 
-	fence->timestamp = ktime_get();
+	fence->timestamp = timestamp;
 	set_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags);
 	trace_dma_fence_signaled(fence);
 
@@ -352,6 +355,59 @@ int dma_fence_signal_locked(struct dma_fence *fence)
 
 	return 0;
 }
+EXPORT_SYMBOL(dma_fence_signal_timestamp_locked);
+
+/**
+ * dma_fence_signal_timestamp - signal completion of a fence
+ * @fence: the fence to signal
+ * @timestamp: fence signal timestamp in kernel's CLOCK_MONOTONIC time domain
+ *
+ * Signal completion for software callbacks on a fence, this will unblock
+ * dma_fence_wait() calls and run all the callbacks added with
+ * dma_fence_add_callback(). Can be called multiple times, but since a fence
+ * can only go from the unsignaled to the signaled state and not back, it will
+ * only be effective the first time. Set the timestamp provided as the fence
+ * signal timestamp.
+ *
+ * Returns 0 on success and a negative error value when @fence has been
+ * signalled already.
+ */
+int dma_fence_signal_timestamp(struct dma_fence *fence, ktime_t timestamp)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!fence)
+		return -EINVAL;
+
+	spin_lock_irqsave(fence->lock, flags);
+	ret = dma_fence_signal_timestamp_locked(fence, timestamp);
+	spin_unlock_irqrestore(fence->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(dma_fence_signal_timestamp);
+
+/**
+ * dma_fence_signal_locked - signal completion of a fence
+ * @fence: the fence to signal
+ *
+ * Signal completion for software callbacks on a fence, this will unblock
+ * dma_fence_wait() calls and run all the callbacks added with
+ * dma_fence_add_callback(). Can be called multiple times, but since a fence
+ * can only go from the unsignaled to the signaled state and not back, it will
+ * only be effective the first time.
+ *
+ * Unlike dma_fence_signal(), this function must be called with &dma_fence.lock
+ * held.
+ *
+ * Returns 0 on success and a negative error value when @fence has been
+ * signalled already.
+ */
+int dma_fence_signal_locked(struct dma_fence *fence)
+{
+	return dma_fence_signal_timestamp_locked(fence, ktime_get());
+}
 EXPORT_SYMBOL(dma_fence_signal_locked);
 
 /**
@@ -379,7 +435,7 @@ int dma_fence_signal(struct dma_fence *fence)
 	tmp = dma_fence_begin_signalling();
 
 	spin_lock_irqsave(fence->lock, flags);
-	ret = dma_fence_signal_locked(fence);
+	ret = dma_fence_signal_timestamp_locked(fence, ktime_get());
 	spin_unlock_irqrestore(fence->lock, flags);
 
 	dma_fence_end_signalling(tmp);
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 09e23adb351d..9f12efaaa93a 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -372,6 +372,9 @@ static inline void __dma_fence_might_wait(void) {}
 
 int dma_fence_signal(struct dma_fence *fence);
 int dma_fence_signal_locked(struct dma_fence *fence);
+int dma_fence_signal_timestamp(struct dma_fence *fence, ktime_t timestamp);
+int dma_fence_signal_timestamp_locked(struct dma_fence *fence,
+				      ktime_t timestamp);
 signed long dma_fence_default_wait(struct dma_fence *fence,
 				   bool intr, signed long timeout);
 int dma_fence_add_callback(struct dma_fence *fence,
-- 
cgit v1.2.3


From 1f7ef657740344541645349a8bece90cbff898f5 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Wed, 24 Feb 2021 12:01:42 -0800
Subject: mm/filemap: remove unused parameter and change to void type for
 replace_page_cache_page()

Since commit 74d609585d8b ("page cache: Add and replace pages using the
XArray") was merged, the replace_page_cache_page() can not fail and always
return 0, we can remove the redundant return value and void it.  Moreover
remove the unused gfp_mask.

Link: https://lkml.kernel.org/r/609c30e5274ba15d8b90c872fd0d8ac437a9b2bb.1610071401.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dev.c           | 6 +-----
 include/linux/pagemap.h | 2 +-
 mm/filemap.c            | 7 +------
 3 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 588f8d1240aa..c6636b4c4ccf 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -844,11 +844,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 	if (WARN_ON(PageMlocked(oldpage)))
 		goto out_fallback_unlock;
 
-	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
-	if (err) {
-		unlock_page(newpage);
-		goto out_put_old;
-	}
+	replace_page_cache_page(oldpage, newpage);
 
 	get_page(newpage);
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d5570deff400..74e466e5a2ba 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -757,7 +757,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
 extern void __delete_from_page_cache(struct page *page, void *shadow);
-int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
+void replace_page_cache_page(struct page *old, struct page *new);
 void delete_from_page_cache_batch(struct address_space *mapping,
 				  struct pagevec *pvec);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 6ff2a3fb0dc7..7dfed3454a2e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -777,7 +777,6 @@ EXPORT_SYMBOL(file_write_and_wait_range);
  * replace_page_cache_page - replace a pagecache page with a new one
  * @old:	page to be replaced
  * @new:	page to replace with
- * @gfp_mask:	allocation mode
  *
  * This function replaces a page in the pagecache with a new one.  On
  * success it acquires the pagecache reference for the new page and
@@ -786,10 +785,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
  * caller must do that.
  *
  * The remove + add is atomic.  This function cannot fail.
- *
- * Return: %0
  */
-int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+void replace_page_cache_page(struct page *old, struct page *new)
 {
 	struct address_space *mapping = old->mapping;
 	void (*freepage)(struct page *) = mapping->a_ops->freepage;
@@ -824,8 +821,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 	if (freepage)
 		freepage(old);
 	put_page(old);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
-- 
cgit v1.2.3


From 4805462598113f350838d612d0895db2dbb3992b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Feb 2021 12:02:02 -0800
Subject: mm/filemap: pass a sleep state to put_and_wait_on_page_locked

This is prep work for the next patch, but I think at least one of the
current callers would prefer a killable sleep to an uninterruptible one.

Link: https://lkml.kernel.org/r/20210122160140.223228-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 3 +--
 mm/filemap.c            | 7 +++++--
 mm/huge_memory.c        | 4 ++--
 mm/migrate.c            | 4 ++--
 4 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 74e466e5a2ba..bd629d676a27 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -681,8 +681,7 @@ static inline int wait_on_page_locked_killable(struct page *page)
 	return wait_on_page_bit_killable(compound_head(page), PG_locked);
 }
 
-extern void put_and_wait_on_page_locked(struct page *page);
-
+int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
 extern void end_page_writeback(struct page *page);
 void wait_for_stable_page(struct page *page);
diff --git a/mm/filemap.c b/mm/filemap.c
index 4fce9735e172..389a20eec9b8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1384,20 +1384,23 @@ static int wait_on_page_locked_async(struct page *page,
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
  * @page: The page to wait for.
+ * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
  *
  * The caller should hold a reference on @page.  They expect the page to
  * become unlocked relatively soon, but do not wish to hold up migration
  * (for example) by holding the reference while waiting for the page to
  * come unlocked.  After this function returns, the caller should not
  * dereference @page.
+ *
+ * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal.
  */
-void put_and_wait_on_page_locked(struct page *page)
+int put_and_wait_on_page_locked(struct page *page, int state)
 {
 	wait_queue_head_t *q;
 
 	page = compound_head(page);
 	q = page_waitqueue(page);
-	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+	return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
 }
 
 /**
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 91ca9b103ee5..f655137536eb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1439,7 +1439,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 		if (!get_page_unless_zero(page))
 			goto out_unlock;
 		spin_unlock(vmf->ptl);
-		put_and_wait_on_page_locked(page);
+		put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 		goto out;
 	}
 
@@ -1475,7 +1475,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 		if (!get_page_unless_zero(page))
 			goto out_unlock;
 		spin_unlock(vmf->ptl);
-		put_and_wait_on_page_locked(page);
+		put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 		goto out;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 20ca887ea769..4ec727adfaa2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -331,7 +331,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 	if (!get_page_unless_zero(page))
 		goto out;
 	pte_unmap_unlock(ptep, ptl);
-	put_and_wait_on_page_locked(page);
+	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 	return;
 out:
 	pte_unmap_unlock(ptep, ptl);
@@ -365,7 +365,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 	if (!get_page_unless_zero(page))
 		goto unlock;
 	spin_unlock(ptl);
-	put_and_wait_on_page_locked(page);
+	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 	return;
 unlock:
 	spin_unlock(ptl);
-- 
cgit v1.2.3


From 87fa0f3eb267eed966ee194907bc15376c1b758f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 24 Feb 2021 12:02:42 -0800
Subject: mm/filemap: rename generic_file_buffered_read to filemap_read

Rename generic_file_buffered_read to match the naming of filemap_fault,
also update the written parameter to a more descriptive name and improve
the kerneldoc comment.

Link: https://lkml.kernel.org/r/20210122160140.223228-18-willy@infradead.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/file.c    |  2 +-
 include/linux/fs.h |  4 ++--
 mm/filemap.c       | 35 ++++++++++++++++-------------------
 3 files changed, 19 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index be9e3900cce8..bf2c51a9607a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3634,7 +3634,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			return ret;
 	}
 
-	return generic_file_buffered_read(iocb, to, ret);
+	return filemap_read(iocb, to, ret);
 }
 
 const struct file_operations btrfs_file_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 418b772d6eca..ec8f3ddf4a6a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3080,8 +3080,8 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 extern int generic_write_check_limits(struct file *file, loff_t pos,
 		loff_t *count);
 extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
-extern ssize_t generic_file_buffered_read(struct kiocb *iocb,
-		struct iov_iter *to, ssize_t already_read);
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to,
+		ssize_t already_read);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 1358cb061fd6..28c290da22c1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2394,23 +2394,20 @@ err:
 }
 
 /**
- * generic_file_buffered_read - generic file read routine
- * @iocb:	the iocb to read
- * @iter:	data destination
- * @written:	already copied
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
  *
- * This is a generic file read routine, and uses the
- * mapping->a_ops->readpage() function for the actual low-level stuff.
+ * Copies data from the page cache.  If the data is not currently present,
+ * uses the readahead and readpage address_space operations to fetch it.
  *
- * This is really ugly. But the goto's actually try to clarify some
- * of the logic when it comes to error handling etc.
- *
- * Return:
- * * total number of bytes copied, including those the were already @written
- * * negative error code if nothing was copied
+ * Return: Total number of bytes copied, including those already read by
+ * the caller.  If an error happens before any bytes are copied, returns
+ * a negative error number.
  */
-ssize_t generic_file_buffered_read(struct kiocb *iocb,
-		struct iov_iter *iter, ssize_t written)
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+		ssize_t already_read)
 {
 	struct file *filp = iocb->ki_filp;
 	struct file_ra_state *ra = &filp->f_ra;
@@ -2437,7 +2434,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		 * can no longer safely return -EIOCBQUEUED. Hence mark
 		 * an async read NOWAIT at that point.
 		 */
-		if ((iocb->ki_flags & IOCB_WAITQ) && written)
+		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
 			iocb->ki_flags |= IOCB_NOWAIT;
 
 		error = filemap_get_pages(iocb, iter, &pvec);
@@ -2497,7 +2494,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 			copied = copy_page_to_iter(page, offset, bytes, iter);
 
-			written += copied;
+			already_read += copied;
 			iocb->ki_pos += copied;
 			ra->prev_pos = iocb->ki_pos;
 
@@ -2514,9 +2511,9 @@ put_pages:
 
 	file_accessed(filp);
 
-	return written ? written : error;
+	return already_read ? already_read : error;
 }
-EXPORT_SYMBOL_GPL(generic_file_buffered_read);
+EXPORT_SYMBOL_GPL(filemap_read);
 
 /**
  * generic_file_read_iter - generic filesystem read routine
@@ -2591,7 +2588,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 			goto out;
 	}
 
-	retval = generic_file_buffered_read(iocb, iter, retval);
+	retval = filemap_read(iocb, iter, retval);
 out:
 	return retval;
 }
-- 
cgit v1.2.3


From 2e9bd483159939ed2c0704b914294653c8341d25 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 24 Feb 2021 12:03:11 -0800
Subject: mm: memcg/slab: pre-allocate obj_cgroups for slab caches with
 SLAB_ACCOUNT

In general it's unknown in advance if a slab page will contain accounted
objects or not.  In order to avoid memory waste, an obj_cgroup vector is
allocated dynamically when a need to account of a new object arises.  Such
approach is memory efficient, but requires an expensive cmpxchg() to set
up the memcg/objcgs pointer, because an allocation can race with a
different allocation on another cpu.

But in some common cases it's known for sure that a slab page will contain
accounted objects: if the page belongs to a slab cache with a SLAB_ACCOUNT
flag set.  It includes such popular objects like vm_area_struct, anon_vma,
task_struct, etc.

In such cases we can pre-allocate the objcgs vector and simple assign it
to the page without any atomic operations, because at this early stage the
page is not visible to anyone else.

A very simplistic benchmark (allocating 10000000 64-bytes objects in a
row) shows ~15% win.  In the real life it seems that most workloads are
not very sensitive to the speed of (accounted) slab allocations.

[guro@fb.com: open-code set_page_objcgs() and add some comments, by Johannes]
  Link: https://lkml.kernel.org/r/20201113001926.GA2934489@carbon.dhcp.thefacebook.com
[akpm@linux-foundation.org: fix it for mm-slub-call-account_slab_page-after-slab-page-initialization-fix.patch]

Link: https://lkml.kernel.org/r/20201110195753.530157-2-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 19 -------------------
 mm/memcontrol.c            | 23 +++++++++++++++++++----
 mm/slab.c                  |  2 +-
 mm/slab.h                  | 14 ++++++++++----
 mm/slub.c                  |  2 +-
 5 files changed, 31 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index eeb0b52203e9..7a4dd1cb19fe 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -475,19 +475,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 	return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
-/*
- * set_page_objcgs - associate a page with a object cgroups vector
- * @page: a pointer to the page struct
- * @objcgs: a pointer to the object cgroups vector
- *
- * Atomically associates a page with a vector of object cgroups.
- */
-static inline bool set_page_objcgs(struct page *page,
-					struct obj_cgroup **objcgs)
-{
-	return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs |
-			MEMCG_DATA_OBJCGS);
-}
 #else
 static inline struct obj_cgroup **page_objcgs(struct page *page)
 {
@@ -498,12 +485,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 {
 	return NULL;
 }
-
-static inline bool set_page_objcgs(struct page *page,
-					struct obj_cgroup **objcgs)
-{
-	return true;
-}
 #endif
 
 static __always_inline bool memcg_stat_item_in_bytes(int idx)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b9bd354e97e..60ce452e42e6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2935,9 +2935,10 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
 
 #ifdef CONFIG_MEMCG_KMEM
 int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
-				 gfp_t gfp)
+				 gfp_t gfp, bool new_page)
 {
 	unsigned int objects = objs_per_slab_page(s, page);
+	unsigned long memcg_data;
 	void *vec;
 
 	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
@@ -2945,11 +2946,25 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
 	if (!vec)
 		return -ENOMEM;
 
-	if (!set_page_objcgs(page, vec))
+	memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
+	if (new_page) {
+		/*
+		 * If the slab page is brand new and nobody can yet access
+		 * it's memcg_data, no synchronization is required and
+		 * memcg_data can be simply assigned.
+		 */
+		page->memcg_data = memcg_data;
+	} else if (cmpxchg(&page->memcg_data, 0, memcg_data)) {
+		/*
+		 * If the slab page is already in use, somebody can allocate
+		 * and assign obj_cgroups in parallel. In this case the existing
+		 * objcg vector should be reused.
+		 */
 		kfree(vec);
-	else
-		kmemleak_not_leak(vec);
+		return 0;
+	}
 
+	kmemleak_not_leak(vec);
 	return 0;
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index cdad64b2836d..9c9e0c255c4b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1379,7 +1379,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 		return NULL;
 	}
 
-	account_slab_page(page, cachep->gfporder, cachep);
+	account_slab_page(page, cachep->gfporder, cachep, flags);
 	__SetPageSlab(page);
 	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
 	if (sk_memalloc_socks() && page_is_pfmemalloc(page))
diff --git a/mm/slab.h b/mm/slab.h
index 48dc466a3791..076582f58f68 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -238,7 +238,7 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
 
 #ifdef CONFIG_MEMCG_KMEM
 int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
-				 gfp_t gfp);
+				 gfp_t gfp, bool new_page);
 
 static inline void memcg_free_page_obj_cgroups(struct page *page)
 {
@@ -315,7 +315,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
 			page = virt_to_head_page(p[i]);
 
 			if (!page_objcgs(page) &&
-			    memcg_alloc_page_obj_cgroups(page, s, flags)) {
+			    memcg_alloc_page_obj_cgroups(page, s, flags,
+							 false)) {
 				obj_cgroup_uncharge(objcg, obj_full_size(s));
 				continue;
 			}
@@ -379,7 +380,8 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
 }
 
 static inline int memcg_alloc_page_obj_cgroups(struct page *page,
-					       struct kmem_cache *s, gfp_t gfp)
+					       struct kmem_cache *s, gfp_t gfp,
+					       bool new_page)
 {
 	return 0;
 }
@@ -420,8 +422,12 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
 }
 
 static __always_inline void account_slab_page(struct page *page, int order,
-					      struct kmem_cache *s)
+					      struct kmem_cache *s,
+					      gfp_t gfp)
 {
+	if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT))
+		memcg_alloc_page_obj_cgroups(page, s, gfp, true);
+
 	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
 			    PAGE_SIZE << order);
 }
diff --git a/mm/slub.c b/mm/slub.c
index e0fef8f8d4bf..ca566361561e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1785,7 +1785,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 
 	page->objects = oo_objects(oo);
 
-	account_slab_page(page, oo_order(oo), s);
+	account_slab_page(page, oo_order(oo), s, flags);
 
 	page->slab_cache = s;
 	__SetPageSlab(page);
-- 
cgit v1.2.3


From f3344adf38bdb3107d40483dd9501215ad40edce Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:15 -0800
Subject: mm: memcontrol: optimize per-lruvec stats counter memory usage

The vmstat threshold is 32 (MEMCG_CHARGE_BATCH), Actually the threshold
can be as big as MEMCG_CHARGE_BATCH * PAGE_SIZE.  It still fits into s32.
So introduce struct batched_lruvec_stat to optimize memory usage.

The size of struct lruvec_stat is 304 bytes on 64 bit systems.  As it is a
per-cpu structure.  So with this patch, we can save 304 / 2 * ncpu bytes
per-memcg per-node where ncpu is the number of the possible CPU.  If there
are c memory cgroup (include dying cgroup) and n NUMA node in the system.
Finally, we can save (152 * ncpu * c * n) bytes.

[akpm@linux-foundation.org: fix typo in comment]

Link: https://lkml.kernel.org/r/20201210042121.39665-1-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Chris Down <chris@chrisdown.name>
Cc: Yafang Shao <laoar.shao@gmail.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 14 ++++++++++++--
 mm/memcontrol.c            | 10 +++++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7a4dd1cb19fe..41bbf71edd9f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -92,6 +92,10 @@ struct lruvec_stat {
 	long count[NR_VM_NODE_STAT_ITEMS];
 };
 
+struct batched_lruvec_stat {
+	s32 count[NR_VM_NODE_STAT_ITEMS];
+};
+
 /*
  * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
  * which have elements charged to this memcg.
@@ -107,11 +111,17 @@ struct memcg_shrinker_map {
 struct mem_cgroup_per_node {
 	struct lruvec		lruvec;
 
-	/* Legacy local VM stats */
+	/*
+	 * Legacy local VM stats. This should be struct lruvec_stat and
+	 * cannot be optimized to struct batched_lruvec_stat. Because
+	 * the threshold of the lruvec_stat_cpu can be as big as
+	 * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this
+	 * filed has no upper limit.
+	 */
 	struct lruvec_stat __percpu *lruvec_stat_local;
 
 	/* Subtree VM stats (batched updates) */
-	struct lruvec_stat __percpu *lruvec_stat_cpu;
+	struct batched_lruvec_stat __percpu *lruvec_stat_cpu;
 	atomic_long_t		lruvec_stat[NR_VM_NODE_STAT_ITEMS];
 
 	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 60ce452e42e6..b259e7d8ce41 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5208,7 +5208,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 		return 1;
 	}
 
-	pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+	pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
 					       GFP_KERNEL_ACCOUNT);
 	if (!pn->lruvec_stat_cpu) {
 		free_percpu(pn->lruvec_stat_local);
@@ -7093,6 +7093,14 @@ static int __init mem_cgroup_init(void)
 {
 	int cpu, node;
 
+	/*
+	 * Currently s32 type (can refer to struct batched_lruvec_stat) is
+	 * used for per-memcg-per-cpu caching of per-node statistics. In order
+	 * to work fine, we should make sure that the overfill threshold can't
+	 * exceed S32_MAX / PAGE_SIZE.
+	 */
+	BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
+
 	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
 				  memcg_hotplug_cpu_dead);
 
-- 
cgit v1.2.3


From 69473e5de87389be6c0fa4a5d574a50c8f904fb3 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:23 -0800
Subject: mm: memcontrol: convert NR_ANON_THPS account to pages

Currently we use struct per_cpu_nodestat to cache the vmstat counters,
which leads to inaccurate statistics especially THP vmstat counters.  In
the systems with hundreds of processors it can be GBs of memory.  For
example, for a 96 CPUs system, the threshold is the maximum number of 125.
And the per cpu counters can cache 23.4375 GB in total.

The THP page is already a form of batched addition (it will add 512 worth
of memory in one go) so skipping the batching seems like sensible.
Although every THP stats update overflows the per-cpu counter, resorting
to atomic global updates.  But it can make the statistics more accuracy
for the THP vmstat counters.

So we convert the NR_ANON_THPS account to pages.  This patch is consistent
with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival").
Doing this also can make the unit of vmstat counters more unified.
Finally, the unit of the vmstat counters are pages, kB and bytes.  The
B/KB suffix can tell us that the unit is bytes or kB.  The rest which is
without suffix are pages.

Link: https://lkml.kernel.org/r/20201228164110.2838-3-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Rafael. J. Wysocki <rafael@kernel.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    | 15 +++++++++------
 fs/proc/meminfo.c      |  2 +-
 include/linux/mmzone.h | 13 +++++++++++++
 mm/huge_memory.c       |  3 ++-
 mm/memcontrol.c        | 20 ++++++--------------
 mm/page_alloc.c        |  2 +-
 mm/rmap.c              |  6 +++---
 mm/vmstat.c            | 11 +++++++++--
 8 files changed, 44 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 04f71c7bc3f8..6da0c3508bc9 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -461,8 +461,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 			     nid, K(sunreclaimable)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			     ,
-			     nid, K(node_page_state(pgdat, NR_ANON_THPS) *
-				    HPAGE_PMD_NR),
+			     nid, K(node_page_state(pgdat, NR_ANON_THPS)),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
 				    HPAGE_PMD_NR),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
@@ -519,10 +518,14 @@ static ssize_t node_read_vmstat(struct device *dev,
 				     sum_zone_numa_state(nid, i));
 
 #endif
-	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-		len += sysfs_emit_at(buf, len, "%s %lu\n",
-				     node_stat_name(i),
-				     node_page_state_pages(pgdat, i));
+	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+		unsigned long pages = node_page_state_pages(pgdat, i);
+
+		if (vmstat_item_print_in_thp(i))
+			pages /= HPAGE_PMD_NR;
+		len += sysfs_emit_at(buf, len, "%s %lu\n", node_stat_name(i),
+				     pages);
+	}
 
 	return len;
 }
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d6fc74619625..a635c8a84ddf 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -129,7 +129,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	show_val_kb(m, "AnonHugePages:  ",
-		    global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+		    global_node_page_state(NR_ANON_THPS));
 	show_val_kb(m, "ShmemHugePages: ",
 		    global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
 	show_val_kb(m, "ShmemPmdMapped: ",
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316bff3d..67d50ef5dd20 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -209,6 +209,19 @@ enum node_stat_item {
 	NR_VM_NODE_STAT_ITEMS
 };
 
+/*
+ * Returns true if the item should be printed in THPs (/proc/vmstat
+ * currently prints number of anon, file and shmem THPs. But the item
+ * is charged in pages).
+ */
+static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
+{
+	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+		return false;
+
+	return item == NR_ANON_THPS;
+}
+
 /*
  * Returns true if the value is measured in bytes (most vmstat values are
  * measured in pages). This defines the API part, the internal representation
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f655137536eb..3691e863070a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2176,7 +2176,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		lock_page_memcg(page);
 		if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
 			/* Last compound_mapcount is gone. */
-			__dec_lruvec_page_state(page, NR_ANON_THPS);
+			__mod_lruvec_page_state(page, NR_ANON_THPS,
+						-HPAGE_PMD_NR);
 			if (TestClearPageDoubleMap(page)) {
 				/* No need in mapcount reference anymore */
 				for (i = 0; i < HPAGE_PMD_NR; i++)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e31e47e7bab2..b2405f049006 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1533,7 +1533,7 @@ static struct memory_stat memory_stats[] = {
 	 * on some architectures, the macro of HPAGE_PMD_SIZE is not
 	 * constant(e.g. powerpc).
 	 */
-	{ "anon_thp", 0, NR_ANON_THPS },
+	{ "anon_thp", PAGE_SIZE, NR_ANON_THPS },
 	{ "file_thp", 0, NR_FILE_THPS },
 	{ "shmem_thp", 0, NR_SHMEM_THPS },
 #endif
@@ -1566,8 +1566,7 @@ static int __init memory_stats_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		if (memory_stats[i].idx == NR_ANON_THPS ||
-		    memory_stats[i].idx == NR_FILE_THPS ||
+		if (memory_stats[i].idx == NR_FILE_THPS ||
 		    memory_stats[i].idx == NR_SHMEM_THPS)
 			memory_stats[i].ratio = HPAGE_PMD_SIZE;
 #endif
@@ -4087,10 +4086,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		nr = memcg_page_state_local(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		if (memcg1_stats[i] == NR_ANON_THPS)
-			nr *= HPAGE_PMD_NR;
-#endif
 		seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
 	}
 
@@ -4121,10 +4116,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		nr = memcg_page_state(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		if (memcg1_stats[i] == NR_ANON_THPS)
-			nr *= HPAGE_PMD_NR;
-#endif
 		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
 						(u64)nr * PAGE_SIZE);
 	}
@@ -5652,10 +5643,11 @@ static int mem_cgroup_move_account(struct page *page,
 			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
 			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
 			if (PageTransHuge(page)) {
-				__dec_lruvec_state(from_vec, NR_ANON_THPS);
-				__inc_lruvec_state(to_vec, NR_ANON_THPS);
+				__mod_lruvec_state(from_vec, NR_ANON_THPS,
+						   -nr_pages);
+				__mod_lruvec_state(to_vec, NR_ANON_THPS,
+						   nr_pages);
 			}
-
 		}
 	} else {
 		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef5070fed76b..2e2e47f8714b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5587,7 +5587,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
 			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
 					* HPAGE_PMD_NR),
-			K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+			K(node_page_state(pgdat, NR_ANON_THPS)),
 #endif
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
 			node_page_state(pgdat, NR_KERNEL_STACK_KB),
diff --git a/mm/rmap.c b/mm/rmap.c
index 08c56aaf72eb..c4d5c63cfd29 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1144,7 +1144,7 @@ void do_page_add_anon_rmap(struct page *page,
 		 * disabled.
 		 */
 		if (compound)
-			__inc_lruvec_page_state(page, NR_ANON_THPS);
+			__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
 		__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
 	}
 
@@ -1186,7 +1186,7 @@ void page_add_new_anon_rmap(struct page *page,
 		if (hpage_pincount_available(page))
 			atomic_set(compound_pincount_ptr(page), 0);
 
-		__inc_lruvec_page_state(page, NR_ANON_THPS);
+		__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
 	} else {
 		/* Anon THP always mapped first with PMD */
 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -1292,7 +1292,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 		return;
 
-	__dec_lruvec_page_state(page, NR_ANON_THPS);
+	__mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
 
 	if (TestClearPageDoubleMap(page)) {
 		/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f8942160fc95..07dc0af50cf0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1619,8 +1619,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 	if (is_zone_first_populated(pgdat, zone)) {
 		seq_printf(m, "\n  per-node stats");
 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+			unsigned long pages = node_page_state_pages(pgdat, i);
+
+			if (vmstat_item_print_in_thp(i))
+				pages /= HPAGE_PMD_NR;
 			seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
-				   node_page_state_pages(pgdat, i));
+				   pages);
 		}
 	}
 	seq_printf(m,
@@ -1740,8 +1744,11 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 	v += NR_VM_NUMA_STAT_ITEMS;
 #endif
 
-	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
 		v[i] = global_node_page_state_pages(i);
+		if (vmstat_item_print_in_thp(i))
+			v[i] /= HPAGE_PMD_NR;
+	}
 	v += NR_VM_NODE_STAT_ITEMS;
 
 	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
-- 
cgit v1.2.3


From bf9ecead53c89d3d2cf60acbc460174ebbcf0027 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:27 -0800
Subject: mm: memcontrol: convert NR_FILE_THPS account to pages

Currently we use struct per_cpu_nodestat to cache the vmstat counters,
which leads to inaccurate statistics especially THP vmstat counters.  In
the systems with if hundreds of processors it can be GBs of memory.  For
example, for a 96 CPUs system, the threshold is the maximum number of 125.
And the per cpu counters can cache 23.4375 GB in total.

The THP page is already a form of batched addition (it will add 512 worth
of memory in one go) so skipping the batching seems like sensible.
Although every THP stats update overflows the per-cpu counter, resorting
to atomic global updates.  But it can make the statistics more accuracy
for the THP vmstat counters.

So we convert the NR_FILE_THPS account to pages.  This patch is consistent
with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival").
Doing this also can make the unit of vmstat counters more unified.
Finally, the unit of the vmstat counters are pages, kB and bytes.  The
B/KB suffix can tell us that the unit is bytes or kB.  The rest which is
without suffix are pages.

Link: https://lkml.kernel.org/r/20201228164110.2838-4-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: Rafael. J. Wysocki <rafael@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    | 3 +--
 fs/proc/meminfo.c      | 2 +-
 include/linux/mmzone.h | 3 ++-
 mm/filemap.c           | 2 +-
 mm/huge_memory.c       | 5 ++++-
 mm/khugepaged.c        | 4 +++-
 mm/memcontrol.c        | 5 ++---
 7 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6da0c3508bc9..d5952f754911 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -466,8 +466,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 				    HPAGE_PMD_NR),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
 				    HPAGE_PMD_NR),
-			     nid, K(node_page_state(pgdat, NR_FILE_THPS) *
-				    HPAGE_PMD_NR),
+			     nid, K(node_page_state(pgdat, NR_FILE_THPS)),
 			     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
 				    HPAGE_PMD_NR)
 #endif
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a635c8a84ddf..7ea4679880c8 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -135,7 +135,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "ShmemPmdMapped: ",
 		    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
 	show_val_kb(m, "FileHugePages:  ",
-		    global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
+		    global_node_page_state(NR_FILE_THPS));
 	show_val_kb(m, "FilePmdMapped:  ",
 		    global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
 #endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 67d50ef5dd20..b751a9898bb6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -219,7 +219,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 		return false;
 
-	return item == NR_ANON_THPS;
+	return item == NR_ANON_THPS ||
+	       item == NR_FILE_THPS;
 }
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 820d81901eae..9efe059e2b58 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -208,7 +208,7 @@ static void unaccount_page_cache_page(struct address_space *mapping,
 		if (PageTransHuge(page))
 			__dec_lruvec_page_state(page, NR_SHMEM_THPS);
 	} else if (PageTransHuge(page)) {
-		__dec_lruvec_page_state(page, NR_FILE_THPS);
+		__mod_lruvec_page_state(page, NR_FILE_THPS, -nr);
 		filemap_nr_thps_dec(mapping);
 	}
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3691e863070a..77181faa2340 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2752,10 +2752,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		}
 		spin_unlock(&ds_queue->split_queue_lock);
 		if (mapping) {
+			int nr = thp_nr_pages(head);
+
 			if (PageSwapBacked(head))
 				__dec_lruvec_page_state(head, NR_SHMEM_THPS);
 			else
-				__dec_lruvec_page_state(head, NR_FILE_THPS);
+				__mod_lruvec_page_state(head, NR_FILE_THPS,
+							-nr);
 		}
 
 		__split_huge_page(page, list, end);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fb0fdaec34d5..c427fe2ca7ff 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1643,6 +1643,7 @@ static void collapse_file(struct mm_struct *mm,
 	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
 	int nr_none = 0, result = SCAN_SUCCEED;
 	bool is_shmem = shmem_file(file);
+	int nr;
 
 	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
@@ -1854,11 +1855,12 @@ out_unlock:
 		put_page(page);
 		goto xa_unlocked;
 	}
+	nr = thp_nr_pages(new_page);
 
 	if (is_shmem)
 		__inc_lruvec_page_state(new_page, NR_SHMEM_THPS);
 	else {
-		__inc_lruvec_page_state(new_page, NR_FILE_THPS);
+		__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
 		filemap_nr_thps_inc(mapping);
 	}
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b2405f049006..fc552f57d3fb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1534,7 +1534,7 @@ static struct memory_stat memory_stats[] = {
 	 * constant(e.g. powerpc).
 	 */
 	{ "anon_thp", PAGE_SIZE, NR_ANON_THPS },
-	{ "file_thp", 0, NR_FILE_THPS },
+	{ "file_thp", PAGE_SIZE, NR_FILE_THPS },
 	{ "shmem_thp", 0, NR_SHMEM_THPS },
 #endif
 	{ "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
@@ -1566,8 +1566,7 @@ static int __init memory_stats_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		if (memory_stats[i].idx == NR_FILE_THPS ||
-		    memory_stats[i].idx == NR_SHMEM_THPS)
+		if (memory_stats[i].idx == NR_SHMEM_THPS)
 			memory_stats[i].ratio = HPAGE_PMD_SIZE;
 #endif
 		VM_BUG_ON(!memory_stats[i].ratio);
-- 
cgit v1.2.3


From 57b2847d3c1dc154923578efb47a12302a57d700 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:31 -0800
Subject: mm: memcontrol: convert NR_SHMEM_THPS account to pages

Currently we use struct per_cpu_nodestat to cache the vmstat counters,
which leads to inaccurate statistics especially THP vmstat counters.  In
the systems with hundreds of processors it can be GBs of memory.  For
example, for a 96 CPUs system, the threshold is the maximum number of 125.
And the per cpu counters can cache 23.4375 GB in total.

The THP page is already a form of batched addition (it will add 512 worth
of memory in one go) so skipping the batching seems like sensible.
Although every THP stats update overflows the per-cpu counter, resorting
to atomic global updates.  But it can make the statistics more accuracy
for the THP vmstat counters.

So we convert the NR_SHMEM_THPS account to pages.  This patch is
consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page
arrival").  Doing this also can make the unit of vmstat counters more
unified.  Finally, the unit of the vmstat counters are pages, kB and
bytes.  The B/KB suffix can tell us that the unit is bytes or kB.  The
rest which is without suffix are pages.

Link: https://lkml.kernel.org/r/20201228164110.2838-5-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: Rafael. J. Wysocki <rafael@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    |  3 +--
 fs/proc/meminfo.c      |  2 +-
 include/linux/mmzone.h |  3 ++-
 mm/filemap.c           |  2 +-
 mm/huge_memory.c       |  3 ++-
 mm/khugepaged.c        |  2 +-
 mm/memcontrol.c        | 26 ++------------------------
 mm/page_alloc.c        |  2 +-
 mm/shmem.c             |  2 +-
 9 files changed, 12 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index d5952f754911..6d5ac6ffb6e1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -462,8 +462,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			     ,
 			     nid, K(node_page_state(pgdat, NR_ANON_THPS)),
-			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
-				    HPAGE_PMD_NR),
+			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
 				    HPAGE_PMD_NR),
 			     nid, K(node_page_state(pgdat, NR_FILE_THPS)),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 7ea4679880c8..cfb107eaa3e6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -131,7 +131,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "AnonHugePages:  ",
 		    global_node_page_state(NR_ANON_THPS));
 	show_val_kb(m, "ShmemHugePages: ",
-		    global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+		    global_node_page_state(NR_SHMEM_THPS));
 	show_val_kb(m, "ShmemPmdMapped: ",
 		    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
 	show_val_kb(m, "FileHugePages:  ",
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b751a9898bb6..788837f40b38 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -220,7 +220,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
 		return false;
 
 	return item == NR_ANON_THPS ||
-	       item == NR_FILE_THPS;
+	       item == NR_FILE_THPS ||
+	       item == NR_SHMEM_THPS;
 }
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 9efe059e2b58..46a8b9e82434 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -206,7 +206,7 @@ static void unaccount_page_cache_page(struct address_space *mapping,
 	if (PageSwapBacked(page)) {
 		__mod_lruvec_page_state(page, NR_SHMEM, -nr);
 		if (PageTransHuge(page))
-			__dec_lruvec_page_state(page, NR_SHMEM_THPS);
+			__mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr);
 	} else if (PageTransHuge(page)) {
 		__mod_lruvec_page_state(page, NR_FILE_THPS, -nr);
 		filemap_nr_thps_dec(mapping);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 77181faa2340..86a3015ba54f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2755,7 +2755,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			int nr = thp_nr_pages(head);
 
 			if (PageSwapBacked(head))
-				__dec_lruvec_page_state(head, NR_SHMEM_THPS);
+				__mod_lruvec_page_state(head, NR_SHMEM_THPS,
+							-nr);
 			else
 				__mod_lruvec_page_state(head, NR_FILE_THPS,
 							-nr);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c427fe2ca7ff..75e246f680f4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1858,7 +1858,7 @@ out_unlock:
 	nr = thp_nr_pages(new_page);
 
 	if (is_shmem)
-		__inc_lruvec_page_state(new_page, NR_SHMEM_THPS);
+		__mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
 	else {
 		__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
 		filemap_nr_thps_inc(mapping);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fc552f57d3fb..d3a0c59210e7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1516,7 +1516,7 @@ struct memory_stat {
 	unsigned int idx;
 };
 
-static struct memory_stat memory_stats[] = {
+static const struct memory_stat memory_stats[] = {
 	{ "anon", PAGE_SIZE, NR_ANON_MAPPED },
 	{ "file", PAGE_SIZE, NR_FILE_PAGES },
 	{ "kernel_stack", 1024, NR_KERNEL_STACK_KB },
@@ -1528,14 +1528,9 @@ static struct memory_stat memory_stats[] = {
 	{ "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
 	{ "file_writeback", PAGE_SIZE, NR_WRITEBACK },
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	/*
-	 * The ratio will be initialized in memory_stats_init(). Because
-	 * on some architectures, the macro of HPAGE_PMD_SIZE is not
-	 * constant(e.g. powerpc).
-	 */
 	{ "anon_thp", PAGE_SIZE, NR_ANON_THPS },
 	{ "file_thp", PAGE_SIZE, NR_FILE_THPS },
-	{ "shmem_thp", 0, NR_SHMEM_THPS },
+	{ "shmem_thp", PAGE_SIZE, NR_SHMEM_THPS },
 #endif
 	{ "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
 	{ "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
@@ -1560,23 +1555,6 @@ static struct memory_stat memory_stats[] = {
 	{ "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
 };
 
-static int __init memory_stats_init(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		if (memory_stats[i].idx == NR_SHMEM_THPS)
-			memory_stats[i].ratio = HPAGE_PMD_SIZE;
-#endif
-		VM_BUG_ON(!memory_stats[i].ratio);
-		VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
-	}
-
-	return 0;
-}
-pure_initcall(memory_stats_init);
-
 static char *memory_stat_format(struct mem_cgroup *memcg)
 {
 	struct seq_buf s;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2e2e47f8714b..df292d8e659b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5584,7 +5584,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(node_page_state(pgdat, NR_WRITEBACK)),
 			K(node_page_state(pgdat, NR_SHMEM)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-			K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
+			K(node_page_state(pgdat, NR_SHMEM_THPS)),
 			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
 					* HPAGE_PMD_NR),
 			K(node_page_state(pgdat, NR_ANON_THPS)),
diff --git a/mm/shmem.c b/mm/shmem.c
index 7924b3bf46fb..ff741d229701 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -713,7 +713,7 @@ next:
 		}
 		if (PageTransHuge(page)) {
 			count_vm_event(THP_FILE_ALLOC);
-			__inc_lruvec_page_state(page, NR_SHMEM_THPS);
+			__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
 		}
 		mapping->nrpages += nr;
 		__mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
-- 
cgit v1.2.3


From a1528e21f8915e16252cda1137fe29672c918361 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:35 -0800
Subject: mm: memcontrol: convert NR_SHMEM_PMDMAPPED account to pages

Currently we use struct per_cpu_nodestat to cache the vmstat counters,
which leads to inaccurate statistics especially THP vmstat counters.  In
the systems with hundreds of processors it can be GBs of memory.  For
example, for a 96 CPUs system, the threshold is the maximum number of 125.
And the per cpu counters can cache 23.4375 GB in total.

The THP page is already a form of batched addition (it will add 512 worth
of memory in one go) so skipping the batching seems like sensible.
Although every THP stats update overflows the per-cpu counter, resorting
to atomic global updates.  But it can make the statistics more accuracy
for the THP vmstat counters.

So we convert the NR_SHMEM_PMDMAPPED account to pages.  This patch is
consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page
arrival").  Doing this also can make the unit of vmstat counters more
unified.  Finally, the unit of the vmstat counters are pages, kB and
bytes.  The B/KB suffix can tell us that the unit is bytes or kB.  The
rest which is without suffix are pages.

Link: https://lkml.kernel.org/r/20201228164110.2838-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: Rafael. J. Wysocki <rafael@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    |  3 +--
 fs/proc/meminfo.c      |  2 +-
 include/linux/mmzone.h |  3 ++-
 mm/page_alloc.c        |  3 +--
 mm/rmap.c              | 14 ++++++++++----
 5 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6d5ac6ffb6e1..7a66aefe4e46 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -463,8 +463,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 			     ,
 			     nid, K(node_page_state(pgdat, NR_ANON_THPS)),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
-			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
-				    HPAGE_PMD_NR),
+			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
 			     nid, K(node_page_state(pgdat, NR_FILE_THPS)),
 			     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
 				    HPAGE_PMD_NR)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index cfb107eaa3e6..c61f440570f9 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -133,7 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "ShmemHugePages: ",
 		    global_node_page_state(NR_SHMEM_THPS));
 	show_val_kb(m, "ShmemPmdMapped: ",
-		    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+		    global_node_page_state(NR_SHMEM_PMDMAPPED));
 	show_val_kb(m, "FileHugePages:  ",
 		    global_node_page_state(NR_FILE_THPS));
 	show_val_kb(m, "FilePmdMapped:  ",
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 788837f40b38..7bdbfeeb5c8c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -221,7 +221,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
 
 	return item == NR_ANON_THPS ||
 	       item == NR_FILE_THPS ||
-	       item == NR_SHMEM_THPS;
+	       item == NR_SHMEM_THPS ||
+	       item == NR_SHMEM_PMDMAPPED;
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df292d8e659b..069561aadc7b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5585,8 +5585,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(node_page_state(pgdat, NR_SHMEM)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			K(node_page_state(pgdat, NR_SHMEM_THPS)),
-			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
-					* HPAGE_PMD_NR),
+			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
 			K(node_page_state(pgdat, NR_ANON_THPS)),
 #endif
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
diff --git a/mm/rmap.c b/mm/rmap.c
index c4d5c63cfd29..1c1b576c0627 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1211,14 +1211,17 @@ void page_add_file_rmap(struct page *page, bool compound)
 	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
 	lock_page_memcg(page);
 	if (compound && PageTransHuge(page)) {
-		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+		int nr_pages = thp_nr_pages(page);
+
+		for (i = 0, nr = 0; i < nr_pages; i++) {
 			if (atomic_inc_and_test(&page[i]._mapcount))
 				nr++;
 		}
 		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
 			goto out;
 		if (PageSwapBacked(page))
-			__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+			__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
+						nr_pages);
 		else
 			__inc_node_page_state(page, NR_FILE_PMDMAPPED);
 	} else {
@@ -1252,14 +1255,17 @@ static void page_remove_file_rmap(struct page *page, bool compound)
 
 	/* page still mapped by someone else? */
 	if (compound && PageTransHuge(page)) {
-		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+		int nr_pages = thp_nr_pages(page);
+
+		for (i = 0, nr = 0; i < nr_pages; i++) {
 			if (atomic_add_negative(-1, &page[i]._mapcount))
 				nr++;
 		}
 		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
 			return;
 		if (PageSwapBacked(page))
-			__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+			__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
+						-nr_pages);
 		else
 			__dec_node_page_state(page, NR_FILE_PMDMAPPED);
 	} else {
-- 
cgit v1.2.3


From 380780e71895ae301505ffcec8f954ab3666a4c7 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 24 Feb 2021 12:03:39 -0800
Subject: mm: memcontrol: convert NR_FILE_PMDMAPPED account to pages

Currently we use struct per_cpu_nodestat to cache the vmstat counters,
which leads to inaccurate statistics especially THP vmstat counters.  In
the systems with hundreds of processors it can be GBs of memory.  For
example, for a 96 CPUs system, the threshold is the maximum number of 125.
And the per cpu counters can cache 23.4375 GB in total.

The THP page is already a form of batched addition (it will add 512 worth
of memory in one go) so skipping the batching seems like sensible.
Although every THP stats update overflows the per-cpu counter, resorting
to atomic global updates.  But it can make the statistics more accuracy
for the THP vmstat counters.

So we convert the NR_FILE_PMDMAPPED account to pages.  This patch is
consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page
arrival").  Doing this also can make the unit of vmstat counters more
unified.  Finally, the unit of the vmstat counters are pages, kB and
bytes.  The B/KB suffix can tell us that the unit is bytes or kB.  The
rest which is without suffix are pages.

Link: https://lkml.kernel.org/r/20201228164110.2838-7-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: Rafael. J. Wysocki <rafael@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    | 3 +--
 fs/proc/meminfo.c      | 2 +-
 include/linux/mmzone.h | 3 ++-
 mm/rmap.c              | 6 ++++--
 4 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 7a66aefe4e46..d02d86aec19f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -465,8 +465,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 			     nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
 			     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
 			     nid, K(node_page_state(pgdat, NR_FILE_THPS)),
-			     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
-				    HPAGE_PMD_NR)
+			     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))
 #endif
 			    );
 	len += hugetlb_report_node_meminfo(buf, len, nid);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c61f440570f9..6fa761c9cc78 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -137,7 +137,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "FileHugePages:  ",
 		    global_node_page_state(NR_FILE_THPS));
 	show_val_kb(m, "FilePmdMapped:  ",
-		    global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
+		    global_node_page_state(NR_FILE_PMDMAPPED));
 #endif
 
 #ifdef CONFIG_CMA
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7bdbfeeb5c8c..66d68e5d5a0f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -222,7 +222,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
 	return item == NR_ANON_THPS ||
 	       item == NR_FILE_THPS ||
 	       item == NR_SHMEM_THPS ||
-	       item == NR_SHMEM_PMDMAPPED;
+	       item == NR_SHMEM_PMDMAPPED ||
+	       item == NR_FILE_PMDMAPPED;
 }
 
 /*
diff --git a/mm/rmap.c b/mm/rmap.c
index 1c1b576c0627..5ebf16fae4b9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1223,7 +1223,8 @@ void page_add_file_rmap(struct page *page, bool compound)
 			__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
 						nr_pages);
 		else
-			__inc_node_page_state(page, NR_FILE_PMDMAPPED);
+			__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
+						nr_pages);
 	} else {
 		if (PageTransCompound(page) && page_mapping(page)) {
 			VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1267,7 +1268,8 @@ static void page_remove_file_rmap(struct page *page, bool compound)
 			__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
 						-nr_pages);
 		else
-			__dec_node_page_state(page, NR_FILE_PMDMAPPED);
+			__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
+						-nr_pages);
 	} else {
 		if (!atomic_add_negative(-1, &page->_mapcount))
 			return;
-- 
cgit v1.2.3


From b6038942480e574c697ea1a80019bbe586c1d654 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Wed, 24 Feb 2021 12:03:55 -0800
Subject: mm: memcg: add swapcache stat for memcg v2

This patch adds swapcache stat for the cgroup v2.  The swapcache
represents the memory that is accounted against both the memory and the
swap limit of the cgroup.  The main motivation behind exposing the
swapcache stat is for enabling users to gracefully migrate from cgroup
v1's memsw counter to cgroup v2's memory and swap counters.

Cgroup v1's memsw limit allows users to limit the memory+swap usage of a
workload but without control on the exact proportion of memory and swap.
Cgroup v2 provides separate limits for memory and swap which enables more
control on the exact usage of memory and swap individually for the
workload.

With some little subtleties, the v1's memsw limit can be switched with the
sum of the v2's memory and swap limits.  However the alternative for memsw
usage is not yet available in cgroup v2.  Exposing per-cgroup swapcache
stat enables that alternative.  Adding the memory usage and swap usage and
subtracting the swapcache will approximate the memsw usage.  This will
help in the transparent migration of the workloads depending on memsw
usage and limit to v2' memory and swap counters.

The reasons these applications are still interested in this approximate
memsw usage are: (1) these applications are not really interested in two
separate memory and swap usage metrics.  A single usage metric is more
simple to use and reason about for them.

(2) The memsw usage metric hides the underlying system's swap setup from
the applications.  Applications with multiple instances running in a
datacenter with heterogeneous systems (some have swap and some don't) will
keep seeing a consistent view of their usage.

[akpm@linux-foundation.org: fix CONFIG_SWAP=n build]

Link: https://lkml.kernel.org/r/20210108155813.2914586-3-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst |  4 ++++
 drivers/base/node.c                     |  6 ++++++
 include/linux/mmzone.h                  |  3 +++
 include/linux/swap.h                    |  6 +++++-
 mm/memcontrol.c                         |  3 +++
 mm/migrate.c                            |  6 ++++++
 mm/swap_state.c                         | 28 ++--------------------------
 mm/vmstat.c                             |  3 +++
 8 files changed, 32 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index c513eafaddea..9acc57f68f31 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1299,6 +1299,10 @@ PAGE_SIZE multiple when read back.
 		Amount of cached filesystem data that was modified and
 		is currently being written back to disk
 
+	  swapcached
+		Amount of swap cached in memory. The swapcache is accounted
+		against both memory and swap usage.
+
 	  anon_thp
 		Amount of memory used in anonymous mappings backed by
 		transparent hugepages
diff --git a/drivers/base/node.c b/drivers/base/node.c
index d02d86aec19f..f449dbb2c746 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -372,14 +372,19 @@ static ssize_t node_read_meminfo(struct device *dev,
 	struct pglist_data *pgdat = NODE_DATA(nid);
 	struct sysinfo i;
 	unsigned long sreclaimable, sunreclaimable;
+	unsigned long swapcached = 0;
 
 	si_meminfo_node(&i, nid);
 	sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
 	sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
+#ifdef CONFIG_SWAP
+	swapcached = node_page_state_pages(pgdat, NR_SWAPCACHE);
+#endif
 	len = sysfs_emit_at(buf, len,
 			    "Node %d MemTotal:       %8lu kB\n"
 			    "Node %d MemFree:        %8lu kB\n"
 			    "Node %d MemUsed:        %8lu kB\n"
+			    "Node %d SwapCached:     %8lu kB\n"
 			    "Node %d Active:         %8lu kB\n"
 			    "Node %d Inactive:       %8lu kB\n"
 			    "Node %d Active(anon):   %8lu kB\n"
@@ -391,6 +396,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 			    nid, K(i.totalram),
 			    nid, K(i.freeram),
 			    nid, K(i.totalram - i.freeram),
+			    nid, K(swapcached),
 			    nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
 				   node_page_state(pgdat, NR_ACTIVE_FILE)),
 			    nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 66d68e5d5a0f..fc99e9241846 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -206,6 +206,9 @@ enum node_stat_item {
 	NR_KERNEL_SCS_KB,	/* measured in KiB */
 #endif
 	NR_PAGETABLE,		/* used for pagetables */
+#ifdef CONFIG_SWAP
+	NR_SWAPCACHE,
+#endif
 	NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3f1f7ae0fbe9..0d64a2bc7e00 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -408,7 +408,11 @@ extern struct address_space *swapper_spaces[];
 #define swap_address_space(entry)			    \
 	(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
 		>> SWAP_ADDRESS_SPACE_SHIFT])
-extern unsigned long total_swapcache_pages(void);
+static inline unsigned long total_swapcache_pages(void)
+{
+	return global_node_page_state(NR_SWAPCACHE);
+}
+
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *page);
 extern void *get_shadow_from_swap_cache(swp_entry_t entry);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5435b370c929..58edfb6614b8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1521,6 +1521,9 @@ static const struct memory_stat memory_stats[] = {
 	{ "file_mapped",		NR_FILE_MAPPED			},
 	{ "file_dirty",			NR_FILE_DIRTY			},
 	{ "file_writeback",		NR_WRITEBACK			},
+#ifdef CONFIG_SWAP
+	{ "swapcached",			NR_SWAPCACHE			},
+#endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	{ "anon_thp",			NR_ANON_THPS			},
 	{ "file_thp",			NR_FILE_THPS			},
diff --git a/mm/migrate.c b/mm/migrate.c
index 4ec727adfaa2..62b81d5257aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -500,6 +500,12 @@ int migrate_page_move_mapping(struct address_space *mapping,
 			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
 			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
 		}
+#ifdef CONFIG_SWAP
+		if (PageSwapCache(page)) {
+			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
+			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
+		}
+#endif
 		if (dirty && mapping_can_writeback(mapping)) {
 			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
 			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 807b00eae969..c1a648d9092b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -68,32 +68,6 @@ static struct {
 	unsigned long find_total;
 } swap_cache_info;
 
-unsigned long total_swapcache_pages(void)
-{
-	unsigned int i, j, nr;
-	unsigned long ret = 0;
-	struct address_space *spaces;
-	struct swap_info_struct *si;
-
-	for (i = 0; i < MAX_SWAPFILES; i++) {
-		swp_entry_t entry = swp_entry(i, 1);
-
-		/* Avoid get_swap_device() to warn for bad swap entry */
-		if (!swp_swap_info(entry))
-			continue;
-		/* Prevent swapoff to free swapper_spaces */
-		si = get_swap_device(entry);
-		if (!si)
-			continue;
-		nr = nr_swapper_spaces[i];
-		spaces = swapper_spaces[i];
-		for (j = 0; j < nr; j++)
-			ret += spaces[j].nrpages;
-		put_swap_device(si);
-	}
-	return ret;
-}
-
 static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
 
 void show_swap_cache_info(void)
@@ -163,6 +137,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
 		address_space->nrexceptional -= nr_shadows;
 		address_space->nrpages += nr;
 		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
+		__mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
 		ADD_CACHE_INFO(add_total, nr);
 unlock:
 		xas_unlock_irq(&xas);
@@ -203,6 +178,7 @@ void __delete_from_swap_cache(struct page *page,
 		address_space->nrexceptional += nr;
 	address_space->nrpages -= nr;
 	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+	__mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
 	ADD_CACHE_INFO(del_total, nr);
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 07dc0af50cf0..a0e949542204 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1215,6 +1215,9 @@ const char * const vmstat_text[] = {
 	"nr_shadow_call_stack",
 #endif
 	"nr_page_table_pages",
+#ifdef CONFIG_SWAP
+	"nr_swapcached",
+#endif
 
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
-- 
cgit v1.2.3


From c1a660dea3fa616420606f1e206e6d22f7e05c30 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 24 Feb 2021 12:03:58 -0800
Subject: mm: kmem: make __memcg_kmem_(un)charge static

I've noticed that __memcg_kmem_charge() and __memcg_kmem_uncharge() are
not used anywhere except memcontrol.c.  Yet they are not declared as
non-static and are declared in memcontrol.h.

This patch makes them static.

Link: https://lkml.kernel.org/r/20210108020332.4096911-1-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  3 ---
 mm/memcontrol.c            | 11 ++++++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 41bbf71edd9f..7a38a1517a05 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1592,9 +1592,6 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
 #endif
 
 #ifdef CONFIG_MEMCG_KMEM
-int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
-			unsigned int nr_pages);
-void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge_page(struct page *page, int order);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 58edfb6614b8..dce5291525a5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -255,6 +255,11 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 #ifdef CONFIG_MEMCG_KMEM
 extern spinlock_t css_set_lock;
 
+static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+			       unsigned int nr_pages);
+static void __memcg_kmem_uncharge(struct mem_cgroup *memcg,
+				  unsigned int nr_pages);
+
 static void obj_cgroup_release(struct percpu_ref *ref)
 {
 	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
@@ -3087,8 +3092,8 @@ static void memcg_free_cache_id(int id)
  *
  * Returns 0 on success, an error code on failure.
  */
-int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
-			unsigned int nr_pages)
+static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+			       unsigned int nr_pages)
 {
 	struct page_counter *counter;
 	int ret;
@@ -3120,7 +3125,7 @@ int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
  * @memcg: memcg to uncharge
  * @nr_pages: number of pages to uncharge
  */
-void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		page_counter_uncharge(&memcg->kmem, nr_pages);
-- 
cgit v1.2.3


From 802f1d522d5fdaefc2b935141bc8fe03d43a99ab Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Wed, 24 Feb 2021 12:04:01 -0800
Subject: mm: page_counter: re-layout structure to reduce false sharing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When checking a memory cgroup related performance regression [1], from the
perf c2c profiling data, we found high false sharing for accessing 'usage'
and 'parent'.

On 64 bit system, the 'usage' and 'parent' are close to each other, and
easy to be in one cacheline (for cacheline size == 64+ B).  'usage' is
usally written, while 'parent' is usually read as the cgroup's
hierarchical counting nature.

So move the 'parent' to the end of the structure to make sure they
are in different cache lines.

Following are some performance data with the patch, against v5.11-rc1.  [
In the data, A means a platform with 2 sockets 48C/96T, B is a platform of
4 sockests 72C/144T, and if a %stddev will be shown bigger than 2%,
P100/P50 means number of test tasks equals to 100%/50% of nr_cpu]

will-it-scale/malloc1
---------------------
	   v5.11-rc1			v5.11-rc1+patch

A-P100	     15782 ±  2%      -0.1%      15765 ±  3%  will-it-scale.per_process_ops
A-P50	     21511            +8.9%      23432        will-it-scale.per_process_ops
B-P100	      9155            +2.2%       9357        will-it-scale.per_process_ops
B-P50	     10967            +7.1%      11751 ±  2%  will-it-scale.per_process_ops

will-it-scale/pagefault2
------------------------
	   v5.11-rc1			v5.11-rc1+patch

A-P100	     79028            +3.0%      81411        will-it-scale.per_process_ops
A-P50	    183960 ±  2%      +4.4%     192078 ±  2%  will-it-scale.per_process_ops
B-P100	     85966            +9.9%      94467 ±  3%  will-it-scale.per_process_ops
B-P50	    198195            +9.8%     217526        will-it-scale.per_process_ops

fio (4k/1M is block size)
-------------------------
	   v5.11-rc1			v5.11-rc1+patch

A-P50-r-4k     16881 ±  2%    +1.2%      17081 ±  2%  fio.read_bw_MBps
A-P50-w-4k      3931          +4.5%       4111 ±  2%  fio.write_bw_MBps
A-P50-r-1M     15178          -0.2%      15154        fio.read_bw_MBps
A-P50-w-1M      3924          +0.1%       3929        fio.write_bw_MBps

[1].https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/

Link: https://lkml.kernel.org/r/1611040814-33449-1-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_counter.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 85bd413e784e..679591301994 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -12,7 +12,6 @@ struct page_counter {
 	unsigned long low;
 	unsigned long high;
 	unsigned long max;
-	struct page_counter *parent;
 
 	/* effective memory.min and memory.min usage tracking */
 	unsigned long emin;
@@ -27,6 +26,14 @@ struct page_counter {
 	/* legacy */
 	unsigned long watermark;
 	unsigned long failcnt;
+
+	/*
+	 * 'parent' is placed here to be far from 'usage' to reduce
+	 * cache false sharing, as 'usage' is written mostly while
+	 * parent is frequently read for cgroup's hierarchical
+	 * counting nature.
+	 */
+	struct page_counter *parent;
 };
 
 #if BITS_PER_LONG == 32
-- 
cgit v1.2.3


From 6eeb104e114cb6b7391c2d69ff873403858c1f35 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 24 Feb 2021 12:04:15 -0800
Subject: fs: buffer: use raw page_memcg() on locked page

alloc_page_buffers() currently uses get_mem_cgroup_from_page() for
charging the buffers to the page owner, which does an rcu-protected
page->memcg lookup and acquires a reference.  But buffer allocation has
the page lock held throughout, which pins the page to the memcg and
thereby the memcg - neither rcu nor holding an extra reference during the
allocation are necessary.  Use a raw page_memcg() instead.

This was the last user of get_mem_cgroup_from_page(), delete it.

Link: https://lkml.kernel.org/r/20210209190126.97842-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c                |  4 ++--
 include/linux/memcontrol.h |  7 -------
 mm/memcontrol.c            | 23 -----------------------
 3 files changed, 2 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index f1c3a5b27a90..0cb7ffd4977c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -847,7 +847,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 	if (retry)
 		gfp |= __GFP_NOFAIL;
 
-	memcg = get_mem_cgroup_from_page(page);
+	/* The page lock pins the memcg */
+	memcg = page_memcg(page);
 	old_memcg = set_active_memcg(memcg);
 
 	head = NULL;
@@ -868,7 +869,6 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 	}
 out:
 	set_active_memcg(old_memcg);
-	mem_cgroup_put(memcg);
 	return head;
 /*
  * In case anything failed, we just free everything we got.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7a38a1517a05..e6dc793d587d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -680,8 +680,6 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
-struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
-
 struct lruvec *lock_page_lruvec(struct page *page);
 struct lruvec *lock_page_lruvec_irq(struct page *page);
 struct lruvec *lock_page_lruvec_irqsave(struct page *page,
@@ -1191,11 +1189,6 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return NULL;
 }
 
-static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
-{
-	return NULL;
-}
-
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c035846c7a4..7fdc001ce15f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1047,29 +1047,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 
-/**
- * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
- * @page: page from which memcg should be extracted.
- *
- * Obtain a reference on page->memcg and returns it if successful. Otherwise
- * root_mem_cgroup is returned.
- */
-struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
-{
-	struct mem_cgroup *memcg = page_memcg(page);
-
-	if (mem_cgroup_disabled())
-		return NULL;
-
-	rcu_read_lock();
-	/* Page should not get uncharged and freed memcg under us. */
-	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
-		memcg = root_mem_cgroup;
-	rcu_read_unlock();
-	return memcg;
-}
-EXPORT_SYMBOL(get_mem_cgroup_from_page);
-
 static __always_inline struct mem_cgroup *active_memcg(void)
 {
 	if (in_interrupt())
-- 
cgit v1.2.3


From 027b37b552f326aa94ef06c7ea77088b16c41e6e Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 24 Feb 2021 12:05:46 -0800
Subject: kasan: move _RET_IP_ to inline wrappers

Generic mm functions that call KASAN annotations that might report a bug
pass _RET_IP_ to them as an argument. This allows KASAN to include the
name of the function that called the mm function in its report's header.

Now that KASAN has inline wrappers for all of its annotations, move
_RET_IP_ to those wrappers to simplify annotation call sites.

Link: https://linux-review.googlesource.com/id/I8fb3c06d49671305ee184175a39591bc26647a67
Link: https://lkml.kernel.org/r/5c1490eddf20b436b8c4eeea83fce47687d5e4a4.1610733117.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 20 +++++++++-----------
 mm/mempool.c          |  2 +-
 mm/slab.c             |  2 +-
 mm/slub.c             |  4 ++--
 4 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 0aea9e2a2a01..a7254186558a 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -185,19 +185,18 @@ static __always_inline void * __must_check kasan_init_slab_obj(
 }
 
 bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip);
-static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object,
-						unsigned long ip)
+static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object)
 {
 	if (kasan_enabled())
-		return __kasan_slab_free(s, object, ip);
+		return __kasan_slab_free(s, object, _RET_IP_);
 	return false;
 }
 
 void __kasan_slab_free_mempool(void *ptr, unsigned long ip);
-static __always_inline void kasan_slab_free_mempool(void *ptr, unsigned long ip)
+static __always_inline void kasan_slab_free_mempool(void *ptr)
 {
 	if (kasan_enabled())
-		__kasan_slab_free_mempool(ptr, ip);
+		__kasan_slab_free_mempool(ptr, _RET_IP_);
 }
 
 void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
@@ -241,10 +240,10 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 }
 
 void __kasan_kfree_large(void *ptr, unsigned long ip);
-static __always_inline void kasan_kfree_large(void *ptr, unsigned long ip)
+static __always_inline void kasan_kfree_large(void *ptr)
 {
 	if (kasan_enabled())
-		__kasan_kfree_large(ptr, ip);
+		__kasan_kfree_large(ptr, _RET_IP_);
 }
 
 bool kasan_save_enable_multi_shot(void);
@@ -277,12 +276,11 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
 {
 	return (void *)object;
 }
-static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
-				   unsigned long ip)
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
 {
 	return false;
 }
-static inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) {}
+static inline void kasan_slab_free_mempool(void *ptr) {}
 static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
 				   gfp_t flags)
 {
@@ -302,7 +300,7 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 {
 	return (void *)object;
 }
-static inline void kasan_kfree_large(void *ptr, unsigned long ip) {}
+static inline void kasan_kfree_large(void *ptr) {}
 
 #endif /* CONFIG_KASAN */
 
diff --git a/mm/mempool.c b/mm/mempool.c
index 624ed51b060f..79959fac27d7 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element)
 static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
 {
 	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
-		kasan_slab_free_mempool(element, _RET_IP_);
+		kasan_slab_free_mempool(element);
 	else if (pool->alloc == mempool_alloc_pages)
 		kasan_free_pages(element, (unsigned long)pool->pool_data);
 }
diff --git a/mm/slab.c b/mm/slab.c
index 9c9e0c255c4b..35c68d99d460 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3420,7 +3420,7 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
 		memset(objp, 0, cachep->object_size);
 
 	/* Put the object into the quarantine, don't touch it for now. */
-	if (kasan_slab_free(cachep, objp, _RET_IP_))
+	if (kasan_slab_free(cachep, objp))
 		return;
 
 	/* Use KCSAN to help debug racy use-after-free. */
diff --git a/mm/slub.c b/mm/slub.c
index 046d7194acaf..b2833ce85c92 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1528,7 +1528,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
 static __always_inline void kfree_hook(void *x)
 {
 	kmemleak_free(x);
-	kasan_kfree_large(x, _RET_IP_);
+	kasan_kfree_large(x);
 }
 
 static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
@@ -1558,7 +1558,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
 				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
 
 	/* KASAN might put x into memory quarantine, delaying its reuse */
-	return kasan_slab_free(s, x, _RET_IP_);
+	return kasan_slab_free(s, x);
 }
 
 static inline bool slab_free_freelist_hook(struct kmem_cache *s,
-- 
cgit v1.2.3


From 611806b4bf8dd97a4f3d73f5cf3c2c7730c51eb2 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 24 Feb 2021 12:05:50 -0800
Subject: kasan: fix bug detection via ksize for HW_TAGS mode

The currently existing kasan_check_read/write() annotations are intended
to be used for kernel modules that have KASAN compiler instrumentation
disabled. Thus, they are only relevant for the software KASAN modes that
rely on compiler instrumentation.

However there's another use case for these annotations: ksize() checks
that the object passed to it is indeed accessible before unpoisoning the
whole object. This is currently done via __kasan_check_read(), which is
compiled away for the hardware tag-based mode that doesn't rely on
compiler instrumentation. This leads to KASAN missing detecting some
memory corruptions.

Provide another annotation called kasan_check_byte() that is available
for all KASAN modes. As the implementation rename and reuse
kasan_check_invalid_free(). Use this new annotation in ksize().
To avoid having ksize() as the top frame in the reported stack trace
pass _RET_IP_ to __kasan_check_byte().

Also add a new ksize_uaf() test that checks that a use-after-free is
detected via ksize() itself, and via plain accesses that happen later.

Link: https://linux-review.googlesource.com/id/Iaabf771881d0f9ce1b969f2a62938e99d3308ec5
Link: https://lkml.kernel.org/r/f32ad74a60b28d8402482a38476f02bb7600f620.1610733117.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan-checks.h |  6 ++++++
 include/linux/kasan.h        | 17 +++++++++++++++++
 lib/test_kasan.c             | 20 ++++++++++++++++++++
 mm/kasan/common.c            | 11 ++++++++++-
 mm/kasan/generic.c           |  4 ++--
 mm/kasan/kasan.h             | 10 +++++-----
 mm/kasan/sw_tags.c           |  6 +++---
 mm/slab_common.c             | 16 +++++++++-------
 8 files changed, 72 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index ca5e89fb10d3..3d6d22a25bdc 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -4,6 +4,12 @@
 
 #include <linux/types.h>
 
+/*
+ * The annotations present in this file are only relevant for the software
+ * KASAN modes that rely on compiler instrumentation, and will be optimized
+ * away for the hardware tag-based KASAN mode. Use kasan_check_byte() instead.
+ */
+
 /*
  * __kasan_check_*: Always available when KASAN is enabled. This may be used
  * even in compilation units that selectively disable KASAN, but must use KASAN
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index a7254186558a..7eaf2d9effb4 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -246,6 +246,19 @@ static __always_inline void kasan_kfree_large(void *ptr)
 		__kasan_kfree_large(ptr, _RET_IP_);
 }
 
+/*
+ * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for
+ * the hardware tag-based mode that doesn't rely on compiler instrumentation.
+ */
+bool __kasan_check_byte(const void *addr, unsigned long ip);
+static __always_inline bool kasan_check_byte(const void *addr)
+{
+	if (kasan_enabled())
+		return __kasan_check_byte(addr, _RET_IP_);
+	return true;
+}
+
+
 bool kasan_save_enable_multi_shot(void);
 void kasan_restore_multi_shot(bool enabled);
 
@@ -301,6 +314,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 	return (void *)object;
 }
 static inline void kasan_kfree_large(void *ptr) {}
+static inline bool kasan_check_byte(const void *address)
+{
+	return true;
+}
 
 #endif /* CONFIG_KASAN */
 
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index e59f185b8075..3f771fabd0ec 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -496,6 +496,7 @@ static void kasan_global_oob(struct kunit *test)
 	KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
 }
 
+/* Check that ksize() makes the whole object accessible. */
 static void ksize_unpoisons_memory(struct kunit *test)
 {
 	char *ptr;
@@ -514,6 +515,24 @@ static void ksize_unpoisons_memory(struct kunit *test)
 	kfree(ptr);
 }
 
+/*
+ * Check that a use-after-free is detected by ksize() and via normal accesses
+ * after it.
+ */
+static void ksize_uaf(struct kunit *test)
+{
+	char *ptr;
+	int size = 128 - KASAN_GRANULE_SIZE;
+
+	ptr = kmalloc(size, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+	kfree(ptr);
+
+	KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr));
+	KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *ptr);
+	KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *(ptr + size));
+}
+
 static void kasan_stack_oob(struct kunit *test)
 {
 	char stack_array[10];
@@ -907,6 +926,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
 	KUNIT_CASE(kasan_alloca_oob_left),
 	KUNIT_CASE(kasan_alloca_oob_right),
 	KUNIT_CASE(ksize_unpoisons_memory),
+	KUNIT_CASE(ksize_uaf),
 	KUNIT_CASE(kmem_cache_double_free),
 	KUNIT_CASE(kmem_cache_invalid_free),
 	KUNIT_CASE(kasan_memchr),
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index eedc3e0fe365..b18189ef3a92 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -345,7 +345,7 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
 	if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
 		return false;
 
-	if (kasan_check_invalid_free(tagged_object)) {
+	if (!kasan_byte_accessible(tagged_object)) {
 		kasan_report_invalid_free(tagged_object, ip);
 		return true;
 	}
@@ -490,3 +490,12 @@ void __kasan_kfree_large(void *ptr, unsigned long ip)
 		kasan_report_invalid_free(ptr, ip);
 	/* The object will be poisoned by kasan_free_pages(). */
 }
+
+bool __kasan_check_byte(const void *address, unsigned long ip)
+{
+	if (!kasan_byte_accessible(address)) {
+		kasan_report((unsigned long)address, 1, false, ip);
+		return false;
+	}
+	return true;
+}
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index acab8862dc67..3f17a1218055 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -185,11 +185,11 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
 	return check_region_inline(addr, size, write, ret_ip);
 }
 
-bool kasan_check_invalid_free(void *addr)
+bool kasan_byte_accessible(const void *addr)
 {
 	s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
 
-	return shadow_byte < 0 || shadow_byte >= KASAN_GRANULE_SIZE;
+	return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE;
 }
 
 void kasan_cache_shrink(struct kmem_cache *cache)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 1298b79f9518..cc14b6e6c14c 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -341,20 +341,20 @@ static inline void kasan_unpoison(const void *address, size_t size)
 			round_up(size, KASAN_GRANULE_SIZE), get_tag(address));
 }
 
-static inline bool kasan_check_invalid_free(void *addr)
+static inline bool kasan_byte_accessible(const void *addr)
 {
 	u8 ptr_tag = get_tag(addr);
-	u8 mem_tag = hw_get_mem_tag(addr);
+	u8 mem_tag = hw_get_mem_tag((void *)addr);
 
-	return (mem_tag == KASAN_TAG_INVALID) ||
-		(ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag);
+	return (mem_tag != KASAN_TAG_INVALID) &&
+		(ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag);
 }
 
 #else /* CONFIG_KASAN_HW_TAGS */
 
 void kasan_poison(const void *address, size_t size, u8 value);
 void kasan_unpoison(const void *address, size_t size);
-bool kasan_check_invalid_free(void *addr);
+bool kasan_byte_accessible(const void *addr);
 
 #endif /* CONFIG_KASAN_HW_TAGS */
 
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index cc271fceb5d5..94c2d33be333 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -118,13 +118,13 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write,
 	return true;
 }
 
-bool kasan_check_invalid_free(void *addr)
+bool kasan_byte_accessible(const void *addr)
 {
 	u8 tag = get_tag(addr);
 	u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr)));
 
-	return (shadow_byte == KASAN_TAG_INVALID) ||
-		(tag != KASAN_TAG_KERNEL && tag != shadow_byte);
+	return (shadow_byte != KASAN_TAG_INVALID) &&
+		(tag == KASAN_TAG_KERNEL || tag == shadow_byte);
 }
 
 #define DEFINE_HWASAN_LOAD_STORE(size)					\
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5be7825ad3ce..7c8298c17145 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1218,19 +1218,21 @@ size_t ksize(const void *objp)
 	size_t size;
 
 	/*
-	 * We need to check that the pointed to object is valid, and only then
-	 * unpoison the shadow memory below. We use __kasan_check_read(), to
-	 * generate a more useful report at the time ksize() is called (rather
-	 * than later where behaviour is undefined due to potential
-	 * use-after-free or double-free).
+	 * We need to first check that the pointer to the object is valid, and
+	 * only then unpoison the memory. The report printed from ksize() is
+	 * more useful, then when it's printed later when the behaviour could
+	 * be undefined due to a potential use-after-free or double-free.
 	 *
-	 * If the pointed to memory is invalid we return 0, to avoid users of
+	 * We use kasan_check_byte(), which is supported for the hardware
+	 * tag-based KASAN mode, unlike kasan_check_read/write().
+	 *
+	 * If the pointed to memory is invalid, we return 0 to avoid users of
 	 * ksize() writing to and potentially corrupting the memory region.
 	 *
 	 * We want to perform the check before __ksize(), to avoid potentially
 	 * crashing in __ksize() due to accessing invalid metadata.
 	 */
-	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
+	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
 		return 0;
 
 	size = __ksize(objp);
-- 
cgit v1.2.3


From 93f503c3fcd168a43e4a6c875fe2cfafaf8439dc Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Feb 2021 12:06:10 -0800
Subject: mm: fix prototype warning from kernel test robot

Patch series "mm: clean up names and parameters of memmap_init_xxxx functions", v5.

This patchset corrects inappropriate function names of memmap_init_xxx,
and simplify parameters of functions in the code flow.  And also fix a
prototype warning reported by lkp.

This patch (of 5);

Kernel test robot calling make with 'W=1' is triggering warning like
below for memmap_init_zone() function.

  mm/page_alloc.c:6259:23: warning: no previous prototype for 'memmap_init_zone' [-Wmissing-prototypes]
   6259 | void __meminit __weak memmap_init_zone(unsigned long size, int nid,
        |                       ^~~~~~~~~~~~~~~~

Fix it by adding the function declaration in include/linux/mm.h.  Since
memmap_init_zone() has a generic version with '__weak', the declaratoin in
ia64 header file can be simply removed.

Link: https://lkml.kernel.org/r/20210122135956.5946-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20210122135956.5946-2-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/include/asm/pgtable.h | 6 ------
 include/linux/mm.h              | 2 ++
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 779b6972aa84..9b4efe89e62d 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -517,12 +517,6 @@ extern struct page *zero_page_memmap_ptr;
 	__changed;							\
 })
 #endif
-
-#  ifdef CONFIG_VIRTUAL_MEM_MAP
-  /* arch mem_map init routine is needed due to holes in a virtual mem_map */
-    extern void memmap_init (unsigned long size, int nid, unsigned long zone,
-			     unsigned long start_pfn);
-#  endif /* CONFIG_VIRTUAL_MEM_MAP */
 # endif /* !__ASSEMBLY__ */
 
 /*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7deb7168104d..1c1eabdf112c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2408,6 +2408,8 @@ extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void memmap_init_zone(unsigned long, int, unsigned long,
 		unsigned long, unsigned long, enum meminit_context,
 		struct vmem_altmap *, int migratetype);
+extern void memmap_init(unsigned long size, int nid,
+		unsigned long zone, unsigned long range_start_pfn);
 extern void setup_per_zone_wmarks(void);
 extern int __meminit init_per_zone_wmark_min(void);
 extern void mem_init(void);
-- 
cgit v1.2.3


From ab28cb6e1e5e59eb8bf3ad399133617414301d3a Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Feb 2021 12:06:14 -0800
Subject: mm: rename memmap_init() and memmap_init_zone()

The current memmap_init_zone() only handles memory region inside one zone,
actually memmap_init() does the memmap init of one zone.  So rename both
of them accordingly.

Link: https://lkml.kernel.org/r/20210122135956.5946-3-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/init.c | 6 +++---
 include/linux/mm.h  | 4 ++--
 mm/memory_hotplug.c | 2 +-
 mm/page_alloc.c     | 8 ++++----
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b19f47a5a305..39b782f62d7d 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -536,18 +536,18 @@ virtual_memmap_init(u64 start, u64 end, void *arg)
 		    / sizeof(struct page));
 
 	if (map_start < map_end)
-		memmap_init_zone((unsigned long)(map_end - map_start),
+		memmap_init_range((unsigned long)(map_end - map_start),
 				 args->nid, args->zone, page_to_pfn(map_start), page_to_pfn(map_end),
 				 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 	return 0;
 }
 
 void __meminit
-memmap_init (unsigned long size, int nid, unsigned long zone,
+memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	     unsigned long start_pfn)
 {
 	if (!vmem_map) {
-		memmap_init_zone(size, nid, zone, start_pfn, start_pfn + size,
+		memmap_init_range(size, nid, zone, start_pfn, start_pfn + size,
 				 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 	} else {
 		struct page *start;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c1eabdf112c..56d0eeae0ce3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2405,10 +2405,10 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn);
 #endif
 
 extern void set_dma_reserve(unsigned long new_dma_reserve);
-extern void memmap_init_zone(unsigned long, int, unsigned long,
+extern void memmap_init_range(unsigned long, int, unsigned long,
 		unsigned long, unsigned long, enum meminit_context,
 		struct vmem_altmap *, int migratetype);
-extern void memmap_init(unsigned long size, int nid,
+extern void memmap_init_zone(unsigned long size, int nid,
 		unsigned long zone, unsigned long range_start_pfn);
 extern void setup_per_zone_wmarks(void);
 extern int __meminit init_per_zone_wmark_min(void);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f9d57b9be8c7..ddcb1cd24c60 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -713,7 +713,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 	 * expects the zone spans the pfn range. All the pages in the range
 	 * are reserved so nobody should be touching them so we should be safe
 	 */
-	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+	memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
 			 MEMINIT_HOTPLUG, altmap, migratetype);
 
 	set_zone_contiguous(zone);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 069561aadc7b..519cf52963eb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6121,7 +6121,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
  * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
  * zone stats (e.g., nr_isolate_pageblock) are touched.
  */
-void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn, unsigned long zone_end_pfn,
 		enum meminit_context context,
 		struct vmem_altmap *altmap, int migratetype)
@@ -6258,7 +6258,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	}
 }
 
-void __meminit __weak memmap_init(unsigned long size, int nid,
+void __meminit __weak memmap_init_zone(unsigned long size, int nid,
 				  unsigned long zone,
 				  unsigned long range_start_pfn)
 {
@@ -6272,7 +6272,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid,
 
 		if (end_pfn > start_pfn) {
 			size = end_pfn - start_pfn;
-			memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn,
+			memmap_init_range(size, nid, zone, start_pfn, range_end_pfn,
 					 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 		}
 	}
@@ -6982,7 +6982,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
 		set_pageblock_order();
 		setup_usemap(pgdat, zone, zone_start_pfn, size);
 		init_currently_empty_zone(zone, zone_start_pfn, size);
-		memmap_init(size, nid, j, zone_start_pfn);
+		memmap_init_zone(size, nid, j, zone_start_pfn);
 	}
 }
 
-- 
cgit v1.2.3


From 3256ff83c566235e812498ee1dc806c45a5d5af7 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 24 Feb 2021 12:06:17 -0800
Subject: mm: simplify parater of function memmap_init_zone()

As David suggested, simply passing 'struct zone *zone' is enough.  We can
get all needed information from 'struct zone*' easily.

Link: https://lkml.kernel.org/r/20210122135956.5946-4-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/init.c | 12 +++++++-----
 include/linux/mm.h  |  3 +--
 mm/page_alloc.c     | 24 +++++++++++-------------
 3 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 39b782f62d7d..16d0d7d22657 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -542,12 +542,14 @@ virtual_memmap_init(u64 start, u64 end, void *arg)
 	return 0;
 }
 
-void __meminit
-memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-	     unsigned long start_pfn)
+void __meminit memmap_init_zone(struct zone *zone)
 {
+	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+	unsigned long start_pfn = zone->zone_start_pfn;
+	unsigned long size = zone->spanned_pages;
+
 	if (!vmem_map) {
-		memmap_init_range(size, nid, zone, start_pfn, start_pfn + size,
+		memmap_init_range(size, nid, zone_id, start_pfn, start_pfn + size,
 				 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 	} else {
 		struct page *start;
@@ -557,7 +559,7 @@ memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		args.start = start;
 		args.end = start + size;
 		args.nid = nid;
-		args.zone = zone;
+		args.zone = zone_id;
 
 		efi_memmap_walk(virtual_memmap_init, &args);
 	}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 56d0eeae0ce3..2601a5cce0ef 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2408,8 +2408,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void memmap_init_range(unsigned long, int, unsigned long,
 		unsigned long, unsigned long, enum meminit_context,
 		struct vmem_altmap *, int migratetype);
-extern void memmap_init_zone(unsigned long size, int nid,
-		unsigned long zone, unsigned long range_start_pfn);
+extern void memmap_init_zone(struct zone *zone);
 extern void setup_per_zone_wmarks(void);
 extern int __meminit init_per_zone_wmark_min(void);
 extern void mem_init(void);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 519cf52963eb..aa04c54ad47c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6258,23 +6258,21 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	}
 }
 
-void __meminit __weak memmap_init_zone(unsigned long size, int nid,
-				  unsigned long zone,
-				  unsigned long range_start_pfn)
+void __meminit __weak memmap_init_zone(struct zone *zone)
 {
+	unsigned long zone_start_pfn = zone->zone_start_pfn;
+	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+	int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
 	unsigned long start_pfn, end_pfn;
-	unsigned long range_end_pfn = range_start_pfn + size;
-	int i;
 
 	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
-		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
-		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+		start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+		end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
 
-		if (end_pfn > start_pfn) {
-			size = end_pfn - start_pfn;
-			memmap_init_range(size, nid, zone, start_pfn, range_end_pfn,
-					 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
-		}
+		if (end_pfn > start_pfn)
+			memmap_init_range(end_pfn - start_pfn, nid,
+					zone_id, start_pfn, zone_end_pfn,
+					MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
 	}
 }
 
@@ -6982,7 +6980,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
 		set_pageblock_order();
 		setup_usemap(pgdat, zone, zone_start_pfn, size);
 		init_currently_empty_zone(zone, zone_start_pfn, size);
-		memmap_init_zone(size, nid, j, zone_start_pfn);
+		memmap_init_zone(zone);
 	}
 }
 
-- 
cgit v1.2.3


From a0cd7a7c4bc004587d1f4785a320f58e72d880eb Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 24 Feb 2021 12:06:32 -0800
Subject: mm: simplify free_highmem_page() and free_reserved_page()

adjust_managed_page_count() as called by free_reserved_page() properly
handles pages in a highmem zone, so we can reuse it for
free_highmem_page().

We can now get rid of totalhigh_pages_inc() and simplify
free_reserved_page().

Link: https://lkml.kernel.org/r/20210126182113.19892-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/highmem-internal.h |  5 -----
 include/linux/mm.h               | 16 ++--------------
 mm/page_alloc.c                  | 11 -----------
 3 files changed, 2 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 1bbe96dc8be6..7902c7d8b55f 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -127,11 +127,6 @@ static inline unsigned long totalhigh_pages(void)
 	return (unsigned long)atomic_long_read(&_totalhigh_pages);
 }
 
-static inline void totalhigh_pages_inc(void)
-{
-	atomic_long_inc(&_totalhigh_pages);
-}
-
 static inline void totalhigh_pages_add(long count)
 {
 	atomic_long_add(count, &_totalhigh_pages);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2601a5cce0ef..76cab132c295 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2310,32 +2310,20 @@ extern void free_initmem(void);
 extern unsigned long free_reserved_area(void *start, void *end,
 					int poison, const char *s);
 
-#ifdef	CONFIG_HIGHMEM
-/*
- * Free a highmem page into the buddy system, adjusting totalhigh_pages
- * and totalram_pages.
- */
-extern void free_highmem_page(struct page *page);
-#endif
-
 extern void adjust_managed_page_count(struct page *page, long count);
 extern void mem_init_print_info(const char *str);
 
 extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
 
 /* Free the reserved page into the buddy system, so it gets managed. */
-static inline void __free_reserved_page(struct page *page)
+static inline void free_reserved_page(struct page *page)
 {
 	ClearPageReserved(page);
 	init_page_count(page);
 	__free_page(page);
-}
-
-static inline void free_reserved_page(struct page *page)
-{
-	__free_reserved_page(page);
 	adjust_managed_page_count(page, 1);
 }
+#define free_highmem_page(page) free_reserved_page(page)
 
 static inline void mark_page_reserved(struct page *page)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 35e230cadb94..ddccc59f2f72 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7691,17 +7691,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
 	return pages;
 }
 
-#ifdef	CONFIG_HIGHMEM
-void free_highmem_page(struct page *page)
-{
-	__free_reserved_page(page);
-	totalram_pages_inc();
-	atomic_long_inc(&page_zone(page)->managed_pages);
-	totalhigh_pages_inc();
-}
-#endif
-
-
 void __init mem_init_print_info(const char *str)
 {
 	unsigned long physpages, codesize, datasize, rosize, bss_size;
-- 
cgit v1.2.3


From 3b2ebeaf98a028d5dd4ec63095855ef507920276 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Feb 2021 12:06:35 -0800
Subject: mm/gfp: add kernel-doc for gfp_t

The generated html will link to the definition of the gfp_t automatically
once we define it.  Move the one-paragraph overview of GFP flags from the
documentation directory into gfp.h and pull gfp.h into the documentation.

This generates warnings with clang
(https://lkml.kernel.org/r/20210219195509.GA59987@24bbad8f3778), so
use a #if 0 to hide it from the compiler for now.

Link: https://lkml.kernel.org/r/20210215204909.3824509-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20210220003049.GZ2858050@casper.infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/core-api/mm-api.rst |  7 ++-----
 include/linux/gfp.h               | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst
index 2adffb3f7914..201b5423303b 100644
--- a/Documentation/core-api/mm-api.rst
+++ b/Documentation/core-api/mm-api.rst
@@ -19,11 +19,8 @@ User Space Memory Access
 Memory Allocation Controls
 ==========================
 
-Functions which need to allocate memory often use GFP flags to express
-how that memory should be allocated. The GFP acronym stands for "get
-free pages", the underlying memory allocation function. Not every GFP
-flag is allowed to every function which may allocate memory. Most
-users will want to use a plain ``GFP_KERNEL``.
+.. kernel-doc:: include/linux/gfp.h
+   :internal:
 
 .. kernel-doc:: include/linux/gfp.h
    :doc: Page mobility and placement hints
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 80544d5c08e7..220cd553a9e7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -8,6 +8,20 @@
 #include <linux/linkage.h>
 #include <linux/topology.h>
 
+/* The typedef is in types.h but we want the documentation here */
+#if 0
+/**
+ * typedef gfp_t - Memory allocation flags.
+ *
+ * GFP flags are commonly used throughout Linux to indicate how memory
+ * should be allocated.  The GFP acronym stands for get_free_pages(),
+ * the underlying memory allocation function.  Not every GFP flag is
+ * supported by every function which may allocate memory.  Most users
+ * will want to use a plain ``GFP_KERNEL``.
+ */
+typedef unsigned int __bitwise gfp_t;
+#endif
+
 struct vm_area_struct;
 
 /*
-- 
cgit v1.2.3


From 0fa5bc4023c188082024833b3deffd5543b93bc9 Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Wed, 24 Feb 2021 12:07:12 -0800
Subject: mm/hugetlb: grab head page refcount once for group of subpages

Patch series "mm/hugetlb: follow_hugetlb_page() improvements", v2.

While looking at ZONE_DEVICE struct page reuse particularly the last
patch[0], I found two possible improvements for follow_hugetlb_page()
which is solely used for get_user_pages()/pin_user_pages().

The first patch batches page refcount updates while the second tidies up
storing the subpages/vmas.  Both together bring the cost of slow variant
of gup() cost from ~87.6k usecs to ~5.8k usecs.

libhugetlbfs tests seem to pass as well gup_test benchmarks with hugetlbfs
vmas.

This patch (of 2):

follow_hugetlb_page() once it locks the pmd/pud, checks all its N subpages
in a huge page and grabs a reference for each one.  Similar to gup-fast,
have follow_hugetlb_page() grab the head page refcount only after counting
all its subpages that are part of the just faulted huge page.

Consequently we reduce the number of atomics necessary to pin said huge
page, which improves non-fast gup() considerably:

  - 16G with 1G huge page size
  gup_test -f /mnt/huge/file -m 16384 -r 10 -L -S -n 512 -w

PIN_LONGTERM_BENCHMARK: ~87.6k us -> ~12.8k us

Link: https://lkml.kernel.org/r/20210128182632.24562-1-joao.m.martins@oracle.com
Link: https://lkml.kernel.org/r/20210128182632.24562-2-joao.m.martins@oracle.com
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  3 +++
 mm/gup.c           |  5 ++---
 mm/hugetlb.c       | 43 ++++++++++++++++++++++++-------------------
 3 files changed, 29 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 76cab132c295..77e64e3eac80 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1187,6 +1187,9 @@ static inline void get_page(struct page *page)
 }
 
 bool __must_check try_grab_page(struct page *page, unsigned int flags);
+__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs,
+						   unsigned int flags);
+
 
 static inline __must_check bool try_get_page(struct page *page)
 {
diff --git a/mm/gup.c b/mm/gup.c
index e4c224cd9661..e40579624f10 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -78,9 +78,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
  * considered failure, and furthermore, a likely bug in the caller, so a warning
  * is also emitted.
  */
-static __maybe_unused struct page *try_grab_compound_head(struct page *page,
-							  int refs,
-							  unsigned int flags)
+__maybe_unused struct page *try_grab_compound_head(struct page *page,
+						   int refs, unsigned int flags)
 {
 	if (flags & FOLL_GET)
 		return try_get_compound_head(page, refs);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6920c71d7e0d..d4fc5db6bc32 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4796,7 +4796,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vaddr = *position;
 	unsigned long remainder = *nr_pages;
 	struct hstate *h = hstate_vma(vma);
-	int err = -EFAULT;
+	int err = -EFAULT, refs;
 
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
@@ -4916,26 +4916,11 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			continue;
 		}
 
+		refs = 0;
+
 same_page:
-		if (pages) {
+		if (pages)
 			pages[i] = mem_map_offset(page, pfn_offset);
-			/*
-			 * try_grab_page() should always succeed here, because:
-			 * a) we hold the ptl lock, and b) we've just checked
-			 * that the huge page is present in the page tables. If
-			 * the huge page is present, then the tail pages must
-			 * also be present. The ptl prevents the head page and
-			 * tail pages from being rearranged in any way. So this
-			 * page must be available at this point, unless the page
-			 * refcount overflowed:
-			 */
-			if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
-				spin_unlock(ptl);
-				remainder = 0;
-				err = -ENOMEM;
-				break;
-			}
-		}
 
 		if (vmas)
 			vmas[i] = vma;
@@ -4944,6 +4929,7 @@ same_page:
 		++pfn_offset;
 		--remainder;
 		++i;
+		++refs;
 		if (vaddr < vma->vm_end && remainder &&
 				pfn_offset < pages_per_huge_page(h)) {
 			/*
@@ -4951,6 +4937,25 @@ same_page:
 			 * of this compound page.
 			 */
 			goto same_page;
+		} else if (pages) {
+			/*
+			 * try_grab_compound_head() should always succeed here,
+			 * because: a) we hold the ptl lock, and b) we've just
+			 * checked that the huge page is present in the page
+			 * tables. If the huge page is present, then the tail
+			 * pages must also be present. The ptl prevents the
+			 * head page and tail pages from being rearranged in
+			 * any way. So this page must be available at this
+			 * point, unless the page refcount overflowed:
+			 */
+			if (WARN_ON_ONCE(!try_grab_compound_head(pages[i-1],
+								 refs,
+								 flags))) {
+				spin_unlock(ptl);
+				remainder = 0;
+				err = -ENOMEM;
+				break;
+			}
 		}
 		spin_unlock(ptl);
 	}
-- 
cgit v1.2.3


From 6c26d3108393211ecfd44d89404cfb744027bafd Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 24 Feb 2021 12:07:19 -0800
Subject: mm/hugetlb: fix some comment typos

Fix typos sasitfy to satisfy, reservtion to reservation, hugegpage to
hugepage and uniprocesor to uniprocessor in comments.

Link: https://lkml.kernel.org/r/20210128112028.64831-1-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Souptick Joarder <jrdr.linux@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 2 +-
 mm/hugetlb.c            | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b5807f23caf8..ac3cb239a7ec 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -37,7 +37,7 @@ struct hugepage_subpool {
 	struct hstate *hstate;
 	long min_hpages;	/* Minimum huge pages or -1 if no minimum. */
 	long rsv_hpages;	/* Pages reserved against global pool to */
-				/* sasitfy minimum size. */
+				/* satisfy minimum size. */
 };
 
 struct resv_map {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cf6653879bb4..fc895e25920c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1434,7 +1434,7 @@ static void __free_huge_page(struct page *page)
 	 * reservation.  If the page was associated with a subpool, there
 	 * would have been a page reserved in the subpool before allocation
 	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
-	 * reservtion, do not call hugepage_subpool_put_pages() as this will
+	 * reservation, do not call hugepage_subpool_put_pages() as this will
 	 * remove the reserved page from the subpool.
 	 */
 	if (!restore_reserve) {
@@ -3707,7 +3707,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
+ * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
  * this far.
  */
 static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
@@ -4491,7 +4491,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
 }
 #else
 /*
- * For uniprocesor systems we always use a single mutex, so just
+ * For uniprocessor systems we always use a single mutex, so just
  * return 0 and avoid the hashing overhead.
  */
 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
-- 
cgit v1.2.3


From bae84953815793f68ddd8edeadd3f4e32676a2c8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Wed, 24 Feb 2021 12:07:32 -0800
Subject: mm/pmem: avoid inserting hugepage PTE entry with fsdax if hugepage
 support is disabled

Differentiate between hardware not supporting hugepages and user disabling
THP via 'echo never > /sys/kernel/mm/transparent_hugepage/enabled'

For the devdax namespace, the kernel handles the above via the
supported_alignment attribute and failing to initialize the namespace if
the namespace align value is not supported on the platform.

For the fsdax namespace, the kernel will continue to initialize the
namespace.  This can result in the kernel creating a huge pte entry even
though the hardware don't support the same.

We do want hugepage support with pmem even if the end-user disabled THP
via sysfs file (/sys/kernel/mm/transparent_hugepage/enabled).  Hence
differentiate between hardware/firmware lacking support vs user-controlled
disable of THP and prevent a huge fault if the hardware lacks hugepage
support.

Link: https://lkml.kernel.org/r/20210205023956.417587-1-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 15 +++++++++------
 mm/huge_memory.c        |  6 +++++-
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6a19f35f836b..ba973efcd369 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -78,6 +78,7 @@ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn,
 }
 
 enum transparent_hugepage_flag {
+	TRANSPARENT_HUGEPAGE_NEVER_DAX,
 	TRANSPARENT_HUGEPAGE_FLAG,
 	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
@@ -123,6 +124,13 @@ extern unsigned long transparent_hugepage_flags;
  */
 static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 {
+
+	/*
+	 * If the hardware/firmware marked hugepage support disabled.
+	 */
+	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+		return false;
+
 	if (vma->vm_flags & VM_NOHUGEPAGE)
 		return false;
 
@@ -134,12 +142,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 
 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
 		return true;
-	/*
-	 * For dax vmas, try to always use hugepage mappings. If the kernel does
-	 * not support hugepages, fsdax mappings will fallback to PAGE_SIZE
-	 * mappings, and device-dax namespaces, that try to guarantee a given
-	 * mapping size, will fail to enable
-	 */
+
 	if (vma_is_dax(vma))
 		return true;
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4cdcc4da36e8..d77605c30f2e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -386,7 +386,11 @@ static int __init hugepage_init(void)
 	struct kobject *hugepage_kobj;
 
 	if (!has_transparent_hugepage()) {
-		transparent_hugepage_flags = 0;
+		/*
+		 * Hardware doesn't support hugepages, hence disable
+		 * DAX PMD support.
+		 */
+		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From c2135f7c570bc274035834848d9bf46ea89ba763 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@linux.alibaba.com>
Date: Wed, 24 Feb 2021 12:08:01 -0800
Subject: mm/vmscan: __isolate_lru_page_prepare() cleanup

The function just returns 2 results, so using a 'switch' to deal with its
result is unnecessary.  Also simplify it to a bool func as Vlastimil
suggested.

Also remove 'goto' by reusing list_move(), and take Matthew Wilcox's
suggestion to update comments in function.

Link: https://lkml.kernel.org/r/728874d7-2d93-4049-68c1-dcc3b2d52ccd@linux.alibaba.com
Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  2 +-
 mm/compaction.c      |  2 +-
 mm/vmscan.c          | 68 ++++++++++++++++++++++++----------------------------
 3 files changed, 33 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0d64a2bc7e00..32f665b1ee85 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -356,7 +356,7 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page,
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
-extern int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
+extern bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
diff --git a/mm/compaction.c b/mm/compaction.c
index 190ccdaa6c19..8f52532675a9 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -988,7 +988,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (unlikely(!get_page_unless_zero(page)))
 			goto isolate_fail;
 
-		if (__isolate_lru_page_prepare(page, isolate_mode) != 0)
+		if (!__isolate_lru_page_prepare(page, isolate_mode))
 			goto isolate_fail_put;
 
 		/* Try isolate the page */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1b574ad199d..04509994aed4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1539,19 +1539,17 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
  * page:	page to consider
  * mode:	one of the LRU isolation modes defined above
  *
- * returns 0 on success, -ve errno on failure.
+ * returns true on success, false on failure.
  */
-int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
+bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 {
-	int ret = -EBUSY;
-
 	/* Only take pages on the LRU. */
 	if (!PageLRU(page))
-		return ret;
+		return false;
 
 	/* Compaction should not handle unevictable pages but CMA can do so */
 	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
-		return ret;
+		return false;
 
 	/*
 	 * To minimise LRU disruption, the caller can indicate that it only
@@ -1564,7 +1562,7 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 	if (mode & ISOLATE_ASYNC_MIGRATE) {
 		/* All the caller can do on PageWriteback is block */
 		if (PageWriteback(page))
-			return ret;
+			return false;
 
 		if (PageDirty(page)) {
 			struct address_space *mapping;
@@ -1580,20 +1578,20 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 			 * from the page cache.
 			 */
 			if (!trylock_page(page))
-				return ret;
+				return false;
 
 			mapping = page_mapping(page);
 			migrate_dirty = !mapping || mapping->a_ops->migratepage;
 			unlock_page(page);
 			if (!migrate_dirty)
-				return ret;
+				return false;
 		}
 	}
 
 	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
-		return ret;
+		return false;
 
-	return 0;
+	return true;
 }
 
 /*
@@ -1677,35 +1675,31 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		 * only when the page is being freed somewhere else.
 		 */
 		scan += nr_pages;
-		switch (__isolate_lru_page_prepare(page, mode)) {
-		case 0:
-			/*
-			 * Be careful not to clear PageLRU until after we're
-			 * sure the page is not being freed elsewhere -- the
-			 * page release code relies on it.
-			 */
-			if (unlikely(!get_page_unless_zero(page)))
-				goto busy;
-
-			if (!TestClearPageLRU(page)) {
-				/*
-				 * This page may in other isolation path,
-				 * but we still hold lru_lock.
-				 */
-				put_page(page);
-				goto busy;
-			}
-
-			nr_taken += nr_pages;
-			nr_zone_taken[page_zonenum(page)] += nr_pages;
-			list_move(&page->lru, dst);
-			break;
+		if (!__isolate_lru_page_prepare(page, mode)) {
+			/* It is being freed elsewhere */
+			list_move(&page->lru, src);
+			continue;
+		}
+		/*
+		 * Be careful not to clear PageLRU until after we're
+		 * sure the page is not being freed elsewhere -- the
+		 * page release code relies on it.
+		 */
+		if (unlikely(!get_page_unless_zero(page))) {
+			list_move(&page->lru, src);
+			continue;
+		}
 
-		default:
-busy:
-			/* else it is being freed elsewhere */
+		if (!TestClearPageLRU(page)) {
+			/* Another thread is already isolating this page */
+			put_page(page);
 			list_move(&page->lru, src);
+			continue;
 		}
+
+		nr_taken += nr_pages;
+		nr_zone_taken[page_zonenum(page)] += nr_pages;
+		list_move(&page->lru, dst);
 	}
 
 	/*
-- 
cgit v1.2.3


From f90d8191ac864df33b1898bc7edc54eaa24e22bc Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:13 -0800
Subject: include/linux/mm_inline.h: shuffle lru list addition and deletion
 functions

These functions will call page_lru() in the following patches.  Move them
below page_lru() to avoid the forward declaration.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-3-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-3-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8fc71e9d7bb0..2889741f450a 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -45,27 +45,6 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 #endif
 }
 
-static __always_inline void add_page_to_lru_list(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
-{
-	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-	list_add(&page->lru, &lruvec->lists[lru]);
-}
-
-static __always_inline void add_page_to_lru_list_tail(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
-{
-	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-	list_add_tail(&page->lru, &lruvec->lists[lru]);
-}
-
-static __always_inline void del_page_from_lru_list(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
-{
-	list_del(&page->lru);
-	update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page));
-}
-
 /**
  * page_lru_base_type - which LRU list type should a page be on?
  * @page: the page to test
@@ -125,4 +104,25 @@ static __always_inline enum lru_list page_lru(struct page *page)
 	}
 	return lru;
 }
+
+static __always_inline void add_page_to_lru_list(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
+{
+	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
+	list_add(&page->lru, &lruvec->lists[lru]);
+}
+
+static __always_inline void add_page_to_lru_list_tail(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
+{
+	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
+	list_add_tail(&page->lru, &lruvec->lists[lru]);
+}
+
+static __always_inline void del_page_from_lru_list(struct page *page,
+				struct lruvec *lruvec, enum lru_list lru)
+{
+	list_del(&page->lru);
+	update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page));
+}
 #endif
-- 
cgit v1.2.3


From 3a9c9788a3149d9745b7eb2eae811e57ef3b127c Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:17 -0800
Subject: mm: don't pass "enum lru_list" to lru list addition functions

The "enum lru_list" parameter to add_page_to_lru_list() and
add_page_to_lru_list_tail() is redundant in the sense that it can
be extracted from the "struct page" parameter by page_lru().

A caveat is that we need to make sure PageActive() or
PageUnevictable() is correctly set or cleared before calling
these two functions. And they are indeed.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-4-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-4-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h |  8 ++++++--
 mm/swap.c                 | 15 +++++++--------
 mm/vmscan.c               |  6 ++----
 3 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 2889741f450a..130ba3201d3f 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -106,15 +106,19 @@ static __always_inline enum lru_list page_lru(struct page *page)
 }
 
 static __always_inline void add_page_to_lru_list(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
+				struct lruvec *lruvec)
 {
+	enum lru_list lru = page_lru(page);
+
 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
 	list_add(&page->lru, &lruvec->lists[lru]);
 }
 
 static __always_inline void add_page_to_lru_list_tail(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
+				struct lruvec *lruvec)
 {
+	enum lru_list lru = page_lru(page);
+
 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
 	list_add_tail(&page->lru, &lruvec->lists[lru]);
 }
diff --git a/mm/swap.c b/mm/swap.c
index 2cca7141470c..22fad34b9c18 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -231,7 +231,7 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
 	if (!PageUnevictable(page)) {
 		del_page_from_lru_list(page, lruvec, page_lru(page));
 		ClearPageActive(page);
-		add_page_to_lru_list_tail(page, lruvec, page_lru(page));
+		add_page_to_lru_list_tail(page, lruvec);
 		__count_vm_events(PGROTATED, thp_nr_pages(page));
 	}
 }
@@ -313,8 +313,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec)
 
 		del_page_from_lru_list(page, lruvec, lru);
 		SetPageActive(page);
-		lru += LRU_ACTIVE;
-		add_page_to_lru_list(page, lruvec, lru);
+		add_page_to_lru_list(page, lruvec);
 		trace_mm_lru_activate(page);
 
 		__count_vm_events(PGACTIVATE, nr_pages);
@@ -543,14 +542,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 		 * It can make readahead confusing.  But race window
 		 * is _really_ small and  it's non-critical problem.
 		 */
-		add_page_to_lru_list(page, lruvec, lru);
+		add_page_to_lru_list(page, lruvec);
 		SetPageReclaim(page);
 	} else {
 		/*
 		 * The page's writeback ends up during pagevec
 		 * We moves tha page into tail of inactive.
 		 */
-		add_page_to_lru_list_tail(page, lruvec, lru);
+		add_page_to_lru_list_tail(page, lruvec);
 		__count_vm_events(PGROTATED, nr_pages);
 	}
 
@@ -570,7 +569,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
 		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
 		ClearPageActive(page);
 		ClearPageReferenced(page);
-		add_page_to_lru_list(page, lruvec, lru);
+		add_page_to_lru_list(page, lruvec);
 
 		__count_vm_events(PGDEACTIVATE, nr_pages);
 		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
@@ -595,7 +594,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
 		 * anonymous pages
 		 */
 		ClearPageSwapBacked(page);
-		add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
+		add_page_to_lru_list(page, lruvec);
 
 		__count_vm_events(PGLAZYFREE, nr_pages);
 		__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
@@ -1005,7 +1004,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
 			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
 	}
 
-	add_page_to_lru_list(page, lruvec, lru);
+	add_page_to_lru_list(page, lruvec);
 	trace_mm_lru_insertion(page, lru);
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 19875660e8f8..09e4f97488c9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1867,7 +1867,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		 * inhibits memcg migration).
 		 */
 		VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
-		add_page_to_lru_list(page, lruvec, page_lru(page));
+		add_page_to_lru_list(page, lruvec);
 		nr_pages = thp_nr_pages(page);
 		nr_moved += nr_pages;
 		if (PageActive(page))
@@ -4282,12 +4282,10 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 
 		lruvec = relock_page_lruvec_irq(page, lruvec);
 		if (page_evictable(page) && PageUnevictable(page)) {
-			enum lru_list lru = page_lru_base_type(page);
-
 			VM_BUG_ON_PAGE(PageActive(page), page);
 			ClearPageUnevictable(page);
 			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
-			add_page_to_lru_list(page, lruvec, lru);
+			add_page_to_lru_list(page, lruvec);
 			pgrescued += nr_pages;
 		}
 		SetPageLRU(page);
-- 
cgit v1.2.3


From 46ae6b2cc2a47904a368d238425531ea91f3a2a5 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:25 -0800
Subject: mm/swap.c: don't pass "enum lru_list" to del_page_from_lru_list()

The parameter is redundant in the sense that it can be potentially
extracted from the "struct page" parameter by page_lru(). We need to
make sure that existing PageActive() or PageUnevictable() remains
until the function returns. A few places don't conform, and simple
reordering fixes them.

This patch may have left page_off_lru() seemingly odd, and we'll take
care of it in the next patch.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-6-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-6-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h |  5 +++--
 mm/compaction.c           |  2 +-
 mm/mlock.c                |  3 +--
 mm/swap.c                 | 26 ++++++++++----------------
 mm/vmscan.c               |  4 ++--
 5 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 130ba3201d3f..ffacc6273678 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -124,9 +124,10 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page,
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
-				struct lruvec *lruvec, enum lru_list lru)
+				struct lruvec *lruvec)
 {
 	list_del(&page->lru);
-	update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page));
+	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
+			-thp_nr_pages(page));
 }
 #endif
diff --git a/mm/compaction.c b/mm/compaction.c
index 8f52532675a9..3aef10c61d0e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1034,7 +1034,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			low_pfn += compound_nr(page) - 1;
 
 		/* Successfully isolated */
-		del_page_from_lru_list(page, lruvec, page_lru(page));
+		del_page_from_lru_list(page, lruvec);
 		mod_node_page_state(page_pgdat(page),
 				NR_ISOLATED_ANON + page_is_file_lru(page),
 				thp_nr_pages(page));
diff --git a/mm/mlock.c b/mm/mlock.c
index 55b3b3672977..73960bb3464d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -278,8 +278,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 			 */
 			if (TestClearPageLRU(page)) {
 				lruvec = relock_page_lruvec_irq(page, lruvec);
-				del_page_from_lru_list(page, lruvec,
-							page_lru(page));
+				del_page_from_lru_list(page, lruvec);
 				continue;
 			} else
 				__munlock_isolation_failed(page);
diff --git a/mm/swap.c b/mm/swap.c
index a26e3a4fe17b..ff910f636df2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -85,7 +85,8 @@ static void __page_cache_release(struct page *page)
 		lruvec = lock_page_lruvec_irqsave(page, &flags);
 		VM_BUG_ON_PAGE(!PageLRU(page), page);
 		__ClearPageLRU(page);
-		del_page_from_lru_list(page, lruvec, page_off_lru(page));
+		del_page_from_lru_list(page, lruvec);
+		page_off_lru(page);
 		unlock_page_lruvec_irqrestore(lruvec, flags);
 	}
 	__ClearPageWaiters(page);
@@ -229,7 +230,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
 {
 	if (!PageUnevictable(page)) {
-		del_page_from_lru_list(page, lruvec, page_lru(page));
+		del_page_from_lru_list(page, lruvec);
 		ClearPageActive(page);
 		add_page_to_lru_list_tail(page, lruvec);
 		__count_vm_events(PGROTATED, thp_nr_pages(page));
@@ -308,10 +309,9 @@ void lru_note_cost_page(struct page *page)
 static void __activate_page(struct page *page, struct lruvec *lruvec)
 {
 	if (!PageActive(page) && !PageUnevictable(page)) {
-		int lru = page_lru_base_type(page);
 		int nr_pages = thp_nr_pages(page);
 
-		del_page_from_lru_list(page, lruvec, lru);
+		del_page_from_lru_list(page, lruvec);
 		SetPageActive(page);
 		add_page_to_lru_list(page, lruvec);
 		trace_mm_lru_activate(page);
@@ -518,8 +518,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
  */
 static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 {
-	int lru;
-	bool active;
+	bool active = PageActive(page);
 	int nr_pages = thp_nr_pages(page);
 
 	if (PageUnevictable(page))
@@ -529,10 +528,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 	if (page_mapped(page))
 		return;
 
-	active = PageActive(page);
-	lru = page_lru_base_type(page);
-
-	del_page_from_lru_list(page, lruvec, lru + active);
+	del_page_from_lru_list(page, lruvec);
 	ClearPageActive(page);
 	ClearPageReferenced(page);
 
@@ -563,10 +559,9 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
 {
 	if (PageActive(page) && !PageUnevictable(page)) {
-		int lru = page_lru_base_type(page);
 		int nr_pages = thp_nr_pages(page);
 
-		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+		del_page_from_lru_list(page, lruvec);
 		ClearPageActive(page);
 		ClearPageReferenced(page);
 		add_page_to_lru_list(page, lruvec);
@@ -581,11 +576,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
 {
 	if (PageAnon(page) && PageSwapBacked(page) &&
 	    !PageSwapCache(page) && !PageUnevictable(page)) {
-		bool active = PageActive(page);
 		int nr_pages = thp_nr_pages(page);
 
-		del_page_from_lru_list(page, lruvec,
-				       LRU_INACTIVE_ANON + active);
+		del_page_from_lru_list(page, lruvec);
 		ClearPageActive(page);
 		ClearPageReferenced(page);
 		/*
@@ -919,7 +912,8 @@ void release_pages(struct page **pages, int nr)
 
 			VM_BUG_ON_PAGE(!PageLRU(page), page);
 			__ClearPageLRU(page);
-			del_page_from_lru_list(page, lruvec, page_off_lru(page));
+			del_page_from_lru_list(page, lruvec);
+			page_off_lru(page);
 		}
 
 		__ClearPageWaiters(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 09e4f97488c9..7c65d47e6612 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1766,7 +1766,7 @@ int isolate_lru_page(struct page *page)
 
 		get_page(page);
 		lruvec = lock_page_lruvec_irq(page);
-		del_page_from_lru_list(page, lruvec, page_lru(page));
+		del_page_from_lru_list(page, lruvec);
 		unlock_page_lruvec_irq(lruvec);
 		ret = 0;
 	}
@@ -4283,8 +4283,8 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 		lruvec = relock_page_lruvec_irq(page, lruvec);
 		if (page_evictable(page) && PageUnevictable(page)) {
 			VM_BUG_ON_PAGE(PageActive(page), page);
+			del_page_from_lru_list(page, lruvec);
 			ClearPageUnevictable(page);
-			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
 			add_page_to_lru_list(page, lruvec);
 			pgrescued += nr_pages;
 		}
-- 
cgit v1.2.3


From 875601796267214f286d3581fe74f2805d060fe8 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:28 -0800
Subject: mm: add __clear_page_lru_flags() to replace page_off_lru()

Similar to page_off_lru(), the new function does non-atomic clearing
of PageLRU() in addition to PageActive() and PageUnevictable(), on a
page that has no references left.

If PageActive() and PageUnevictable() are both set, refuse to clear
either and leave them to bad_page(). This is a behavior change that
is meant to help debug.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-7-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 28 ++++++++++------------------
 mm/swap.c                 |  6 ++----
 mm/vmscan.c               |  3 +--
 3 files changed, 13 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ffacc6273678..ef3fd79222e5 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -61,27 +61,19 @@ static inline enum lru_list page_lru_base_type(struct page *page)
 }
 
 /**
- * page_off_lru - which LRU list was page on? clearing its lru flags.
- * @page: the page to test
- *
- * Returns the LRU list a page was on, as an index into the array of LRU
- * lists; and clears its Unevictable or Active flags, ready for freeing.
+ * __clear_page_lru_flags - clear page lru flags before releasing a page
+ * @page: the page that was on lru and now has a zero reference
  */
-static __always_inline enum lru_list page_off_lru(struct page *page)
+static __always_inline void __clear_page_lru_flags(struct page *page)
 {
-	enum lru_list lru;
+	__ClearPageLRU(page);
 
-	if (PageUnevictable(page)) {
-		__ClearPageUnevictable(page);
-		lru = LRU_UNEVICTABLE;
-	} else {
-		lru = page_lru_base_type(page);
-		if (PageActive(page)) {
-			__ClearPageActive(page);
-			lru += LRU_ACTIVE;
-		}
-	}
-	return lru;
+	/* this shouldn't happen, so leave the flags to bad_page() */
+	if (PageActive(page) && PageUnevictable(page))
+		return;
+
+	__ClearPageActive(page);
+	__ClearPageUnevictable(page);
 }
 
 /**
diff --git a/mm/swap.c b/mm/swap.c
index ff910f636df2..1455fa56a0ee 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -84,9 +84,8 @@ static void __page_cache_release(struct page *page)
 
 		lruvec = lock_page_lruvec_irqsave(page, &flags);
 		VM_BUG_ON_PAGE(!PageLRU(page), page);
-		__ClearPageLRU(page);
 		del_page_from_lru_list(page, lruvec);
-		page_off_lru(page);
+		__clear_page_lru_flags(page);
 		unlock_page_lruvec_irqrestore(lruvec, flags);
 	}
 	__ClearPageWaiters(page);
@@ -911,9 +910,8 @@ void release_pages(struct page **pages, int nr)
 				lock_batch = 0;
 
 			VM_BUG_ON_PAGE(!PageLRU(page), page);
-			__ClearPageLRU(page);
 			del_page_from_lru_list(page, lruvec);
-			page_off_lru(page);
+			__clear_page_lru_flags(page);
 		}
 
 		__ClearPageWaiters(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7c65d47e6612..452dd3818aa3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1849,8 +1849,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		SetPageLRU(page);
 
 		if (unlikely(put_page_testzero(page))) {
-			__ClearPageLRU(page);
-			__ClearPageActive(page);
+			__clear_page_lru_flags(page);
 
 			if (unlikely(PageCompound(page))) {
 				spin_unlock_irq(&lruvec->lru_lock);
-- 
cgit v1.2.3


From bc7112719e1e80e4208eef3fc9bd8d2b6c263e7d Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:32 -0800
Subject: mm: VM_BUG_ON lru page flags

Move scattered VM_BUG_ONs to two essential places that cover all
lru list additions and deletions.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-8-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-8-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 4 ++++
 mm/swap.c                 | 2 --
 mm/vmscan.c               | 1 -
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ef3fd79222e5..6d907a4dd6ad 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -66,6 +66,8 @@ static inline enum lru_list page_lru_base_type(struct page *page)
  */
 static __always_inline void __clear_page_lru_flags(struct page *page)
 {
+	VM_BUG_ON_PAGE(!PageLRU(page), page);
+
 	__ClearPageLRU(page);
 
 	/* this shouldn't happen, so leave the flags to bad_page() */
@@ -87,6 +89,8 @@ static __always_inline enum lru_list page_lru(struct page *page)
 {
 	enum lru_list lru;
 
+	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+
 	if (PageUnevictable(page))
 		lru = LRU_UNEVICTABLE;
 	else {
diff --git a/mm/swap.c b/mm/swap.c
index 1455fa56a0ee..ab3258afcbeb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -83,7 +83,6 @@ static void __page_cache_release(struct page *page)
 		unsigned long flags;
 
 		lruvec = lock_page_lruvec_irqsave(page, &flags);
-		VM_BUG_ON_PAGE(!PageLRU(page), page);
 		del_page_from_lru_list(page, lruvec);
 		__clear_page_lru_flags(page);
 		unlock_page_lruvec_irqrestore(lruvec, flags);
@@ -909,7 +908,6 @@ void release_pages(struct page **pages, int nr)
 			if (prev_lruvec != lruvec)
 				lock_batch = 0;
 
-			VM_BUG_ON_PAGE(!PageLRU(page), page);
 			del_page_from_lru_list(page, lruvec);
 			__clear_page_lru_flags(page);
 		}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 452dd3818aa3..348a90096550 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4281,7 +4281,6 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 
 		lruvec = relock_page_lruvec_irq(page, lruvec);
 		if (page_evictable(page) && PageUnevictable(page)) {
-			VM_BUG_ON_PAGE(PageActive(page), page);
 			del_page_from_lru_list(page, lruvec);
 			ClearPageUnevictable(page);
 			add_page_to_lru_list(page, lruvec);
-- 
cgit v1.2.3


From c1770e34f3e7640887d8129fc05d13fe17101301 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:36 -0800
Subject: include/linux/mm_inline.h: fold page_lru_base_type() into its sole
 caller

We've removed all other references to this function.

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-9-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-9-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 6d907a4dd6ad..7183c7a03f09 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -45,21 +45,6 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 #endif
 }
 
-/**
- * page_lru_base_type - which LRU list type should a page be on?
- * @page: the page to test
- *
- * Used for LRU list index arithmetic.
- *
- * Returns the base LRU type - file or anon - @page should be on.
- */
-static inline enum lru_list page_lru_base_type(struct page *page)
-{
-	if (page_is_file_lru(page))
-		return LRU_INACTIVE_FILE;
-	return LRU_INACTIVE_ANON;
-}
-
 /**
  * __clear_page_lru_flags - clear page lru flags before releasing a page
  * @page: the page that was on lru and now has a zero reference
@@ -92,12 +77,12 @@ static __always_inline enum lru_list page_lru(struct page *page)
 	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
 
 	if (PageUnevictable(page))
-		lru = LRU_UNEVICTABLE;
-	else {
-		lru = page_lru_base_type(page);
-		if (PageActive(page))
-			lru += LRU_ACTIVE;
-	}
+		return LRU_UNEVICTABLE;
+
+	lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
+	if (PageActive(page))
+		lru += LRU_ACTIVE;
+
 	return lru;
 }
 
-- 
cgit v1.2.3


From 289ccba18af436f2b65ec69b2be1b086ec9f24a4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:40 -0800
Subject: include/linux/mm_inline.h: fold __update_lru_size() into its sole
 caller

All other references to the function were removed after commit
a892cb6b977f ("mm/vmscan.c: use update_lru_size() in update_lru_sizes()").

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-10-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-10-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 7183c7a03f09..355ea1ee32bd 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -24,7 +24,7 @@ static inline int page_is_file_lru(struct page *page)
 	return !PageSwapBacked(page);
 }
 
-static __always_inline void __update_lru_size(struct lruvec *lruvec,
+static __always_inline void update_lru_size(struct lruvec *lruvec,
 				enum lru_list lru, enum zone_type zid,
 				int nr_pages)
 {
@@ -33,13 +33,6 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
 	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
 	__mod_zone_page_state(&pgdat->node_zones[zid],
 				NR_ZONE_LRU_BASE + lru, nr_pages);
-}
-
-static __always_inline void update_lru_size(struct lruvec *lruvec,
-				enum lru_list lru, enum zone_type zid,
-				int nr_pages)
-{
-	__update_lru_size(lruvec, lru, zid, nr_pages);
 #ifdef CONFIG_MEMCG
 	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 #endif
-- 
cgit v1.2.3


From 2091339d59e7808e9b39a79f48e3d17ef7389b97 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 24 Feb 2021 12:08:44 -0800
Subject: mm/vmscan.c: make lruvec_lru_size() static

All other references to the function were removed after
commit b910718a948a ("mm: vmscan: detect file thrashing at the reclaim
root").

Link: https://lore.kernel.org/linux-mm/20201207220949.830352-11-yuzhao@google.com/
Link: https://lkml.kernel.org/r/20210122220600.906146-11-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 2 --
 mm/vmscan.c            | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc99e9241846..9198b7ade85f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -892,8 +892,6 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 #endif
 }
 
-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
-
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 int local_memory_node(int node_id);
 #else
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 348a90096550..fea6b43bc1f9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -310,7 +310,8 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
  * @lru: lru to use
  * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
  */
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
+static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
+				     int zone_idx)
 {
 	unsigned long size = 0;
 	int zid;
-- 
cgit v1.2.3


From d6995da311221a05c8aef3bda2629e5cb14c7302 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:08:51 -0800
Subject: hugetlb: use page.private for hugetlb specific page flags

Patch series "create hugetlb flags to consolidate state", v3.

While discussing a series of hugetlb fixes in [1], it became evident that
the hugetlb specific page state information is stored in a somewhat
haphazard manner.  Code dealing with state information would be easier to
read, understand and maintain if this information was stored in a
consistent manner.

This series uses page.private of the hugetlb head page for storing a set
of hugetlb specific page flags.  Routines are priovided for test, set and
clear of the flags.

[1] https://lore.kernel.org/r/20210106084739.63318-1-songmuchun@bytedance.com

This patch (of 4):

As hugetlbfs evolved, state information about hugetlb pages was added.
One 'convenient' way of doing this was to use available fields in tail
pages.  Over time, it has become difficult to know the meaning or contents
of fields simply by looking at a small bit of code.  Sometimes, the naming
is just confusing.  For example: The PagePrivate flag indicates a huge
page reservation was consumed and needs to be restored if an error is
encountered and the page is freed before it is instantiated.  The
page.private field contains the pointer to a subpool if the page is
associated with one.

In an effort to make the code more readable, use page.private to contain
hugetlb specific page flags.  These flags will have test, set and clear
functions similar to those used for 'normal' page flags.  More
importantly, an enum of flag values will be created with names that
actually reflect their purpose.

In this patch,
- Create infrastructure for hugetlb specific page flag functions
- Move subpool pointer to page[1].private to make way for flags
  Create routines with meaningful names to modify subpool field
- Use new HPageRestoreReserve flag instead of PagePrivate

Conversion of other state information will happen in subsequent patches.

Link: https://lkml.kernel.org/r/20210122195231.324857-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20210122195231.324857-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 12 +++------
 include/linux/hugetlb.h | 68 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/hugetlb.c            | 48 +++++++++++++++++-----------------
 3 files changed, 96 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b7a72f577aab..907f6405e805 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -973,15 +973,9 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
 
-	/*
-	 * page_private is subpool pointer in hugetlb pages.  Transfer to
-	 * new page.  PagePrivate is not associated with page_private for
-	 * hugetlb pages and can not be set here as only page_huge_active
-	 * pages can be migrated.
-	 */
-	if (page_private(page)) {
-		set_page_private(newpage, page_private(page));
-		set_page_private(page, 0);
+	if (hugetlb_page_subpool(page)) {
+		hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page));
+		hugetlb_set_page_subpool(page, NULL);
 	}
 
 	if (mode != MIGRATE_SYNC_NO_COPY)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ac3cb239a7ec..249c65e1d8ca 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -472,6 +472,60 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long flags);
 #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
 
+/*
+ * huegtlb page specific state flags.  These flags are located in page.private
+ * of the hugetlb head page.  Functions created via the below macros should be
+ * used to manipulate these flags.
+ *
+ * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
+ *	allocation time.  Cleared when page is fully instantiated.  Free
+ *	routine checks flag to restore a reservation on error paths.
+ */
+enum hugetlb_page_flags {
+	HPG_restore_reserve = 0,
+	__NR_HPAGEFLAGS,
+};
+
+/*
+ * Macros to create test, set and clear function definitions for
+ * hugetlb specific page flags.
+ */
+#ifdef CONFIG_HUGETLB_PAGE
+#define TESTHPAGEFLAG(uname, flname)				\
+static inline int HPage##uname(struct page *page)		\
+	{ return test_bit(HPG_##flname, &(page->private)); }
+
+#define SETHPAGEFLAG(uname, flname)				\
+static inline void SetHPage##uname(struct page *page)		\
+	{ set_bit(HPG_##flname, &(page->private)); }
+
+#define CLEARHPAGEFLAG(uname, flname)				\
+static inline void ClearHPage##uname(struct page *page)		\
+	{ clear_bit(HPG_##flname, &(page->private)); }
+#else
+#define TESTHPAGEFLAG(uname, flname)				\
+static inline int HPage##uname(struct page *page)		\
+	{ return 0; }
+
+#define SETHPAGEFLAG(uname, flname)				\
+static inline void SetHPage##uname(struct page *page)		\
+	{ }
+
+#define CLEARHPAGEFLAG(uname, flname)				\
+static inline void ClearHPage##uname(struct page *page)		\
+	{ }
+#endif
+
+#define HPAGEFLAG(uname, flname)				\
+	TESTHPAGEFLAG(uname, flname)				\
+	SETHPAGEFLAG(uname, flname)				\
+	CLEARHPAGEFLAG(uname, flname)				\
+
+/*
+ * Create functions associated with hugetlb page flags
+ */
+HPAGEFLAG(RestoreReserve, restore_reserve)
+
 #ifdef CONFIG_HUGETLB_PAGE
 
 #define HSTATE_NAME_LEN 32
@@ -531,6 +585,20 @@ extern unsigned int default_hstate_idx;
 
 #define default_hstate (hstates[default_hstate_idx])
 
+/*
+ * hugetlb page subpool pointer located in hpage[1].private
+ */
+static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
+{
+	return (struct hugepage_subpool *)(hpage+1)->private;
+}
+
+static inline void hugetlb_set_page_subpool(struct page *hpage,
+					struct hugepage_subpool *subpool)
+{
+	set_page_private(hpage+1, (unsigned long)subpool);
+}
+
 static inline struct hstate *hstate_file(struct file *f)
 {
 	return hstate_inode(file_inode(f));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8a1d1f85e21e..f5e85dabb7a3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1143,7 +1143,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
 	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
 	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
-		SetPagePrivate(page);
+		SetHPageRestoreReserve(page);
 		h->resv_huge_pages--;
 	}
 
@@ -1418,20 +1418,19 @@ static void __free_huge_page(struct page *page)
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct hugepage_subpool *spool =
-		(struct hugepage_subpool *)page_private(page);
+	struct hugepage_subpool *spool = hugetlb_page_subpool(page);
 	bool restore_reserve;
 
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(page_mapcount(page), page);
 
-	set_page_private(page, 0);
+	hugetlb_set_page_subpool(page, NULL);
 	page->mapping = NULL;
-	restore_reserve = PagePrivate(page);
-	ClearPagePrivate(page);
+	restore_reserve = HPageRestoreReserve(page);
+	ClearHPageRestoreReserve(page);
 
 	/*
-	 * If PagePrivate() was set on page, page allocation consumed a
+	 * If HPageRestoreReserve was set on page, page allocation consumed a
 	 * reservation.  If the page was associated with a subpool, there
 	 * would have been a page reserved in the subpool before allocation
 	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
@@ -2263,24 +2262,24 @@ static long vma_add_reservation(struct hstate *h,
  * This routine is called to restore a reservation on error paths.  In the
  * specific error paths, a huge page was allocated (via alloc_huge_page)
  * and is about to be freed.  If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set PagePrivate
- * in the newly allocated page.  When the page is freed via free_huge_page,
- * the global reservation count will be incremented if PagePrivate is set.
- * However, free_huge_page can not adjust the reserve map.  Adjust the
- * reserve map here to be consistent with global reserve count adjustments
- * to be made by free_huge_page.
+ * alloc_huge_page would have consumed the reservation and set
+ * HPageRestoreReserve in the newly allocated page.  When the page is freed
+ * via free_huge_page, the global reservation count will be incremented if
+ * HPageRestoreReserve is set.  However, free_huge_page can not adjust the
+ * reserve map.  Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page.
  */
 static void restore_reserve_on_error(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address,
 			struct page *page)
 {
-	if (unlikely(PagePrivate(page))) {
+	if (unlikely(HPageRestoreReserve(page))) {
 		long rc = vma_needs_reservation(h, vma, address);
 
 		if (unlikely(rc < 0)) {
 			/*
 			 * Rare out of memory condition in reserve map
-			 * manipulation.  Clear PagePrivate so that
+			 * manipulation.  Clear HPageRestoreReserve so that
 			 * global reserve count will not be incremented
 			 * by free_huge_page.  This will make it appear
 			 * as though the reservation for this page was
@@ -2289,7 +2288,7 @@ static void restore_reserve_on_error(struct hstate *h,
 			 * is better than inconsistent global huge page
 			 * accounting of reserve counts.
 			 */
-			ClearPagePrivate(page);
+			ClearHPageRestoreReserve(page);
 		} else if (rc) {
 			rc = vma_add_reservation(h, vma, address);
 			if (unlikely(rc < 0))
@@ -2297,7 +2296,7 @@ static void restore_reserve_on_error(struct hstate *h,
 				 * See above comment about rare out of
 				 * memory condition.
 				 */
-				ClearPagePrivate(page);
+				ClearHPageRestoreReserve(page);
 		} else
 			vma_end_reservation(h, vma, address);
 	}
@@ -2378,7 +2377,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 		if (!page)
 			goto out_uncharge_cgroup;
 		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
-			SetPagePrivate(page);
+			SetHPageRestoreReserve(page);
 			h->resv_huge_pages--;
 		}
 		spin_lock(&hugetlb_lock);
@@ -2396,7 +2395,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 
 	spin_unlock(&hugetlb_lock);
 
-	set_page_private(page, (unsigned long)spool);
+	hugetlb_set_page_subpool(page, spool);
 
 	map_commit = vma_commit_reservation(h, vma, addr);
 	if (unlikely(map_chg > map_commit)) {
@@ -3170,6 +3169,9 @@ static int __init hugetlb_init(void)
 {
 	int i;
 
+	BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
+			__NR_HPAGEFLAGS);
+
 	if (!hugepages_supported()) {
 		if (hugetlb_max_hstate || default_hstate_max_huge_pages)
 			pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
@@ -4207,7 +4209,7 @@ retry_avoidcopy:
 	spin_lock(ptl);
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
-		ClearPagePrivate(new_page);
+		ClearHPageRestoreReserve(new_page);
 
 		/* Break COW */
 		huge_ptep_clear_flush(vma, haddr, ptep);
@@ -4274,7 +4276,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 
 	if (err)
 		return err;
-	ClearPagePrivate(page);
+	ClearHPageRestoreReserve(page);
 
 	/*
 	 * set page dirty so that it will not be removed from cache/file
@@ -4436,7 +4438,7 @@ retry:
 		goto backout;
 
 	if (anon_rmap) {
-		ClearPagePrivate(page);
+		ClearHPageRestoreReserve(page);
 		hugepage_add_new_anon_rmap(page, vma, haddr);
 	} else
 		page_dup_rmap(page, true);
@@ -4750,7 +4752,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	if (vm_shared) {
 		page_dup_rmap(page, true);
 	} else {
-		ClearPagePrivate(page);
+		ClearHPageRestoreReserve(page);
 		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
 	}
 
-- 
cgit v1.2.3


From 8f251a3d5ce3bdea73bd045ed35db64f32e0d0d9 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:08:56 -0800
Subject: hugetlb: convert page_huge_active() HPageMigratable flag

Use the new hugetlb page specific flag HPageMigratable to replace the
page_huge_active interfaces.  By it's name, page_huge_active implied that
a huge page was on the active list.  However, that is not really what code
checking the flag wanted to know.  It really wanted to determine if the
huge page could be migrated.  This happens when the page is actually added
to the page cache and/or task page table.  This is the reasoning behind
the name change.

The VM_BUG_ON_PAGE() calls in the *_huge_active() interfaces are not
really necessary as we KNOW the page is a hugetlb page.  Therefore, they
are removed.

The routine page_huge_active checked for PageHeadHuge before testing the
active bit.  This is unnecessary in the case where we hold a reference or
lock and know it is a hugetlb head page.  page_huge_active is also called
without holding a reference or lock (scan_movable_pages), and can race
with code freeing the page.  The extra check in page_huge_active shortened
the race window, but did not prevent the race.  Offline code calling
scan_movable_pages already deals with these races, so removing the check
is acceptable.  Add comment to racy code.

[songmuchun@bytedance.com: remove set_page_huge_active() declaration from include/linux/hugetlb.h]
  Link: https://lkml.kernel.org/r/CAMZfGtUda+KoAZscU0718TN61cSFwp4zy=y2oZ=+6Z2TAZZwng@mail.gmail.com

Link: https://lkml.kernel.org/r/20210122195231.324857-3-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c       |  2 +-
 include/linux/hugetlb.h    |  7 +++++--
 include/linux/page-flags.h |  6 ------
 mm/hugetlb.c               | 45 +++++++++++----------------------------------
 mm/memory_hotplug.c        |  9 ++++++++-
 5 files changed, 25 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 907f6405e805..5a8ed6bd4f87 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -735,7 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 
-		set_page_huge_active(page);
+		SetHPageMigratable(page);
 		/*
 		 * unlock_page because locked by add_to_page_cache()
 		 * put_page() due to reference from alloc_huge_page()
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 249c65e1d8ca..77f2a032fe32 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -480,9 +480,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
  *	allocation time.  Cleared when page is fully instantiated.  Free
  *	routine checks flag to restore a reservation on error paths.
+ * HPG_migratable  - Set after a newly allocated page is added to the page
+ *	cache and/or page tables.  Indicates the page is a candidate for
+ *	migration.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
+	HPG_migratable,
 	__NR_HPAGEFLAGS,
 };
 
@@ -525,6 +529,7 @@ static inline void ClearHPage##uname(struct page *page)		\
  * Create functions associated with hugetlb page flags
  */
 HPAGEFLAG(RestoreReserve, restore_reserve)
+HPAGEFLAG(Migratable, migratable)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
@@ -838,8 +843,6 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 }
 #endif
 
-void set_page_huge_active(struct page *page);
-
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ec5d0290e0ee..db914477057b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -592,15 +592,9 @@ static inline void ClearPageCompound(struct page *page)
 #ifdef CONFIG_HUGETLB_PAGE
 int PageHuge(struct page *page);
 int PageHeadHuge(struct page *page);
-bool page_huge_active(struct page *page);
 #else
 TESTPAGEFLAG_FALSE(Huge)
 TESTPAGEFLAG_FALSE(HeadHuge)
-
-static inline bool page_huge_active(struct page *page)
-{
-	return 0;
-}
 #endif
 
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f5e85dabb7a3..727c09713627 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1364,30 +1364,6 @@ struct hstate *size_to_hstate(unsigned long size)
 	return NULL;
 }
 
-/*
- * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
- * to hstate->hugepage_activelist.)
- *
- * This function can be called for tail pages, but never returns true for them.
- */
-bool page_huge_active(struct page *page)
-{
-	return PageHeadHuge(page) && PagePrivate(&page[1]);
-}
-
-/* never called for tail page */
-void set_page_huge_active(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
-	SetPagePrivate(&page[1]);
-}
-
-static void clear_page_huge_active(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
-	ClearPagePrivate(&page[1]);
-}
-
 /*
  * Internal hugetlb specific page flag. Do not use outside of the hugetlb
  * code
@@ -1449,7 +1425,7 @@ static void __free_huge_page(struct page *page)
 	}
 
 	spin_lock(&hugetlb_lock);
-	clear_page_huge_active(page);
+	ClearHPageMigratable(page);
 	hugetlb_cgroup_uncharge_page(hstate_index(h),
 				     pages_per_huge_page(h), page);
 	hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
@@ -4218,7 +4194,7 @@ retry_avoidcopy:
 				make_huge_pte(vma, new_page, 1));
 		page_remove_rmap(old_page, true);
 		hugepage_add_new_anon_rmap(new_page, vma, haddr);
-		set_page_huge_active(new_page);
+		SetHPageMigratable(new_page);
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
@@ -4455,12 +4431,12 @@ retry:
 	spin_unlock(ptl);
 
 	/*
-	 * Only make newly allocated pages active.  Existing pages found
-	 * in the pagecache could be !page_huge_active() if they have been
-	 * isolated for migration.
+	 * Only set HPageMigratable in newly allocated pages.  Existing pages
+	 * found in the pagecache may not have HPageMigratableset if they have
+	 * been isolated for migration.
 	 */
 	if (new_page)
-		set_page_huge_active(page);
+		SetHPageMigratable(page);
 
 	unlock_page(page);
 out:
@@ -4771,7 +4747,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
 	spin_unlock(ptl);
-	set_page_huge_active(page);
+	SetHPageMigratable(page);
 	if (vm_shared)
 		unlock_page(page);
 	ret = 0;
@@ -5610,12 +5586,13 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
 	bool ret = true;
 
 	spin_lock(&hugetlb_lock);
-	if (!PageHeadHuge(page) || !page_huge_active(page) ||
+	if (!PageHeadHuge(page) ||
+	    !HPageMigratable(page) ||
 	    !get_page_unless_zero(page)) {
 		ret = false;
 		goto unlock;
 	}
-	clear_page_huge_active(page);
+	ClearHPageMigratable(page);
 	list_move_tail(&page->lru, list);
 unlock:
 	spin_unlock(&hugetlb_lock);
@@ -5625,7 +5602,7 @@ unlock:
 void putback_active_hugepage(struct page *page)
 {
 	spin_lock(&hugetlb_lock);
-	set_page_huge_active(page);
+	SetHPageMigratable(page);
 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
 	spin_unlock(&hugetlb_lock);
 	put_page(page);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ddcb1cd24c60..abe43c1ae920 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1260,7 +1260,14 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
 		if (!PageHuge(page))
 			continue;
 		head = compound_head(page);
-		if (page_huge_active(head))
+		/*
+		 * This test is racy as we hold no reference or lock.  The
+		 * hugetlb page could have been free'ed and head is no longer
+		 * a hugetlb page before the following check.  In such unlikely
+		 * cases false positives and negatives are possible.  Calling
+		 * code must deal with these scenarios.
+		 */
+		if (HPageMigratable(head))
 			goto found;
 		skip = compound_nr(head) - (page - head);
 		pfn += skip - 1;
-- 
cgit v1.2.3


From 9157c31186c358c5750dea50ac5705d61d7fc917 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:09:00 -0800
Subject: hugetlb: convert PageHugeTemporary() to HPageTemporary flag

Use new hugetlb specific HPageTemporary flag to replace the
PageHugeTemporary() interfaces.  PageHugeTemporary does contain a
PageHuge() check.  However, this interface is only used within hugetlb
code where we know we are dealing with a hugetlb page.  Therefore, the
check can be eliminated.

Link: https://lkml.kernel.org/r/20210122195231.324857-5-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  6 ++++++
 mm/hugetlb.c            | 36 +++++++-----------------------------
 2 files changed, 13 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 77f2a032fe32..e97498a21010 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -483,10 +483,15 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * HPG_migratable  - Set after a newly allocated page is added to the page
  *	cache and/or page tables.  Indicates the page is a candidate for
  *	migration.
+ * HPG_temporary - - Set on a page that is temporarily allocated from the buddy
+ *	allocator.  Typically used for migration target pages when no pages
+ *	are available in the pool.  The hugetlb free page path will
+ *	immediately free pages with this flag set to the buddy allocator.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
 	HPG_migratable,
+	HPG_temporary,
 	__NR_HPAGEFLAGS,
 };
 
@@ -530,6 +535,7 @@ static inline void ClearHPage##uname(struct page *page)		\
  */
 HPAGEFLAG(RestoreReserve, restore_reserve)
 HPAGEFLAG(Migratable, migratable)
+HPAGEFLAG(Temporary, temporary)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 727c09713627..4d7d4535a313 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1364,28 +1364,6 @@ struct hstate *size_to_hstate(unsigned long size)
 	return NULL;
 }
 
-/*
- * Internal hugetlb specific page flag. Do not use outside of the hugetlb
- * code
- */
-static inline bool PageHugeTemporary(struct page *page)
-{
-	if (!PageHuge(page))
-		return false;
-
-	return (unsigned long)page[2].mapping == -1U;
-}
-
-static inline void SetPageHugeTemporary(struct page *page)
-{
-	page[2].mapping = (void *)-1U;
-}
-
-static inline void ClearPageHugeTemporary(struct page *page)
-{
-	page[2].mapping = NULL;
-}
-
 static void __free_huge_page(struct page *page)
 {
 	/*
@@ -1433,9 +1411,9 @@ static void __free_huge_page(struct page *page)
 	if (restore_reserve)
 		h->resv_huge_pages++;
 
-	if (PageHugeTemporary(page)) {
+	if (HPageTemporary(page)) {
 		list_del(&page->lru);
-		ClearPageHugeTemporary(page);
+		ClearHPageTemporary(page);
 		update_and_free_page(h, page);
 	} else if (h->surplus_huge_pages_node[nid]) {
 		/* remove the page from active list */
@@ -1869,7 +1847,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 	 * codeflow
 	 */
 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
-		SetPageHugeTemporary(page);
+		SetHPageTemporary(page);
 		spin_unlock(&hugetlb_lock);
 		put_page(page);
 		return NULL;
@@ -1900,7 +1878,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
 	 * We do not account these pages as surplus because they are only
 	 * temporary and will be released properly on the last reference
 	 */
-	SetPageHugeTemporary(page);
+	SetHPageTemporary(page);
 
 	return page;
 }
@@ -5625,12 +5603,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
 	 * here as well otherwise the global surplus count will not match
 	 * the per-node's.
 	 */
-	if (PageHugeTemporary(newpage)) {
+	if (HPageTemporary(newpage)) {
 		int old_nid = page_to_nid(oldpage);
 		int new_nid = page_to_nid(newpage);
 
-		SetPageHugeTemporary(oldpage);
-		ClearPageHugeTemporary(newpage);
+		SetHPageTemporary(oldpage);
+		ClearHPageTemporary(newpage);
 
 		spin_lock(&hugetlb_lock);
 		if (h->surplus_huge_pages_node[old_nid]) {
-- 
cgit v1.2.3


From 6c037149014027d50175da5be4ae4531374dcbe0 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:09:04 -0800
Subject: hugetlb: convert PageHugeFreed to HPageFreed flag

Use new hugetlb specific HPageFreed flag to replace the PageHugeFreed
interfaces.

Link: https://lkml.kernel.org/r/20210122195231.324857-6-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  3 +++
 mm/hugetlb.c            | 23 ++++-------------------
 2 files changed, 7 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e97498a21010..eb2682fd97b6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -487,11 +487,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  *	allocator.  Typically used for migration target pages when no pages
  *	are available in the pool.  The hugetlb free page path will
  *	immediately free pages with this flag set to the buddy allocator.
+ * HPG_freed - Set when page is on the free lists.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
 	HPG_migratable,
 	HPG_temporary,
+	HPG_freed,
 	__NR_HPAGEFLAGS,
 };
 
@@ -536,6 +538,7 @@ static inline void ClearHPage##uname(struct page *page)		\
 HPAGEFLAG(RestoreReserve, restore_reserve)
 HPAGEFLAG(Migratable, migratable)
 HPAGEFLAG(Temporary, temporary)
+HPAGEFLAG(Freed, freed)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4d7d4535a313..5377e1dad044 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -79,21 +79,6 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
-static inline bool PageHugeFreed(struct page *head)
-{
-	return page_private(head + 4) == -1UL;
-}
-
-static inline void SetPageHugeFreed(struct page *head)
-{
-	set_page_private(head + 4, -1UL);
-}
-
-static inline void ClearPageHugeFreed(struct page *head)
-{
-	set_page_private(head + 4, 0);
-}
-
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 
@@ -1053,7 +1038,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	list_move(&page->lru, &h->hugepage_freelists[nid]);
 	h->free_huge_pages++;
 	h->free_huge_pages_node[nid]++;
-	SetPageHugeFreed(page);
+	SetHPageFreed(page);
 }
 
 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -1070,7 +1055,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 
 		list_move(&page->lru, &h->hugepage_activelist);
 		set_page_refcounted(page);
-		ClearPageHugeFreed(page);
+		ClearHPageFreed(page);
 		h->free_huge_pages--;
 		h->free_huge_pages_node[nid]--;
 		return page;
@@ -1485,7 +1470,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 	spin_lock(&hugetlb_lock);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
-	ClearPageHugeFreed(page);
+	ClearHPageFreed(page);
 	spin_unlock(&hugetlb_lock);
 }
 
@@ -1756,7 +1741,7 @@ retry:
 		 * We should make sure that the page is already on the free list
 		 * when it is dissolved.
 		 */
-		if (unlikely(!PageHugeFreed(head))) {
+		if (unlikely(!HPageFreed(head))) {
 			spin_unlock(&hugetlb_lock);
 			cond_resched();
 
-- 
cgit v1.2.3


From d95c0337774b1dc74d271e7475a96fe8838332ea Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:09:08 -0800
Subject: include/linux/hugetlb.h: add synchronization information for new
 hugetlb specific flags

Add comments, no functional change.

Link: https://lkml.kernel.org/r/62a80585-2a73-10cc-4a2d-5721540d4ad2@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index eb2682fd97b6..3a541c6cf5c3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -480,14 +480,24 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
  *	allocation time.  Cleared when page is fully instantiated.  Free
  *	routine checks flag to restore a reservation on error paths.
+ *	Synchronization:  Examined or modified by code that knows it has
+ *	the only reference to page.  i.e. After allocation but before use
+ *	or when the page is being freed.
  * HPG_migratable  - Set after a newly allocated page is added to the page
  *	cache and/or page tables.  Indicates the page is a candidate for
  *	migration.
+ *	Synchronization:  Initially set after new page allocation with no
+ *	locking.  When examined and modified during migration processing
+ *	(isolate, migrate, putback) the hugetlb_lock is held.
  * HPG_temporary - - Set on a page that is temporarily allocated from the buddy
  *	allocator.  Typically used for migration target pages when no pages
  *	are available in the pool.  The hugetlb free page path will
  *	immediately free pages with this flag set to the buddy allocator.
+ *	Synchronization: Can be set after huge page allocation from buddy when
+ *	code knows it has only reference.  All other examinations and
+ *	modifications require hugetlb_lock.
  * HPG_freed - Set when page is on the free lists.
+ *	Synchronization: hugetlb_lock held for examination and modification.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
-- 
cgit v1.2.3


From 33b8f84a4ee78491a8f4f9e4c5520c9da4a10983 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Feb 2021 12:09:54 -0800
Subject: mm/hugetlb: change hugetlb_reserve_pages() to type bool

While reviewing a bug in hugetlb_reserve_pages, it was noticed that all
callers ignore the return value.  Any failure is considered an ENOMEM
error by the callers.

Change the function to be of type bool.  The function will return true if
the reservation was successful, false otherwise.  Callers currently assume
a zero return code indicates success.  Change the callers to look for true
to indicate success.  No functional change, only code cleanup.

Link: https://lkml.kernel.org/r/20201221192542.15732-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    |  4 ++--
 include/linux/hugetlb.h |  2 +-
 mm/hugetlb.c            | 37 ++++++++++++++-----------------------
 3 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5a8ed6bd4f87..3eca85a4d940 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -171,7 +171,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	file_accessed(file);
 
 	ret = -ENOMEM;
-	if (hugetlb_reserve_pages(inode,
+	if (!hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
 				len >> huge_page_shift(h), vma,
 				vma->vm_flags))
@@ -1493,7 +1493,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
 	inode->i_size = size;
 	clear_nlink(inode);
 
-	if (hugetlb_reserve_pages(inode, 0,
+	if (!hugetlb_reserve_pages(inode, 0,
 			size >> huge_page_shift(hstate_inode(inode)), NULL,
 			acctflag))
 		file = ERR_PTR(-ENOMEM);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3a541c6cf5c3..cccd1aab69dd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -139,7 +139,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
 				unsigned long dst_addr,
 				unsigned long src_addr,
 				struct page **pagep);
-int hugetlb_reserve_pages(struct inode *inode, long from, long to,
+bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8e5f494291c6..8fb42c6dd74b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5016,12 +5016,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	return pages << h->order;
 }
 
-int hugetlb_reserve_pages(struct inode *inode,
+/* Return true if reservation was successful, false otherwise.  */
+bool hugetlb_reserve_pages(struct inode *inode,
 					long from, long to,
 					struct vm_area_struct *vma,
 					vm_flags_t vm_flags)
 {
-	long ret, chg, add = -1;
+	long chg, add = -1;
 	struct hstate *h = hstate_inode(inode);
 	struct hugepage_subpool *spool = subpool_inode(inode);
 	struct resv_map *resv_map;
@@ -5031,7 +5032,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	/* This should never happen */
 	if (from > to) {
 		VM_WARN(1, "%s called with a negative range\n", __func__);
-		return -EINVAL;
+		return false;
 	}
 
 	/*
@@ -5040,7 +5041,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * without using reserves
 	 */
 	if (vm_flags & VM_NORESERVE)
-		return 0;
+		return true;
 
 	/*
 	 * Shared mappings base their reservation on the number of pages that
@@ -5062,7 +5063,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 		/* Private mapping. */
 		resv_map = resv_map_alloc();
 		if (!resv_map)
-			return -ENOMEM;
+			return false;
 
 		chg = to - from;
 
@@ -5070,18 +5071,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
 
-	if (chg < 0) {
-		ret = chg;
+	if (chg < 0)
 		goto out_err;
-	}
 
-	ret = hugetlb_cgroup_charge_cgroup_rsvd(
-		hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
-
-	if (ret < 0) {
-		ret = -ENOMEM;
+	if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
+				chg * pages_per_huge_page(h), &h_cg) < 0)
 		goto out_err;
-	}
 
 	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
 		/* For private mappings, the hugetlb_cgroup uncharge info hangs
@@ -5096,19 +5091,15 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * reservations already in place (gbl_reserve).
 	 */
 	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
-	if (gbl_reserve < 0) {
-		ret = -ENOSPC;
+	if (gbl_reserve < 0)
 		goto out_uncharge_cgroup;
-	}
 
 	/*
 	 * Check enough hugepages are available for the reservation.
 	 * Hand the pages back to the subpool if there are not
 	 */
-	ret = hugetlb_acct_memory(h, gbl_reserve);
-	if (ret < 0) {
+	if (hugetlb_acct_memory(h, gbl_reserve) < 0)
 		goto out_put_pages;
-	}
 
 	/*
 	 * Account for the reservations made. Shared mappings record regions
@@ -5126,7 +5117,6 @@ int hugetlb_reserve_pages(struct inode *inode,
 
 		if (unlikely(add < 0)) {
 			hugetlb_acct_memory(h, -gbl_reserve);
-			ret = add;
 			goto out_put_pages;
 		} else if (unlikely(chg > add)) {
 			/*
@@ -5147,7 +5137,8 @@ int hugetlb_reserve_pages(struct inode *inode,
 			hugetlb_acct_memory(h, -rsv_adjust);
 		}
 	}
-	return 0;
+	return true;
+
 out_put_pages:
 	/* put back original number of pages, chg */
 	(void)hugepage_subpool_put_pages(spool, chg);
@@ -5163,7 +5154,7 @@ out_err:
 			region_abort(resv_map, from, to, regions_needed);
 	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 		kref_put(&resv_map->refs, resv_map_release);
-	return ret;
+	return false;
 }
 
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
-- 
cgit v1.2.3


From a553e3cd2053501b658feec2be9a3b662eb1b22b Mon Sep 17 00:00:00 2001
From: Chengyang Fan <cy.fan@huawei.com>
Date: Wed, 24 Feb 2021 12:10:28 -0800
Subject: mm/migrate: remove unneeded semicolons

Remove superfluous semicolons after function definitions.

Link: https://lkml.kernel.org/r/20210115110131.2359683-1-cy.fan@huawei.com
Signed-off-by: Chengyang Fan <cy.fan@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4594838a0f7c..3a389633b68f 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -89,7 +89,7 @@ extern int PageMovable(struct page *page);
 extern void __SetPageMovable(struct page *page, struct address_space *mapping);
 extern void __ClearPageMovable(struct page *page);
 #else
-static inline int PageMovable(struct page *page) { return 0; };
+static inline int PageMovable(struct page *page) { return 0; }
 static inline void __SetPageMovable(struct page *page,
 				struct address_space *mapping)
 {
-- 
cgit v1.2.3


From 4e096a18867a5a989b510f6999d9c6b6622e8f7b Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Tue, 23 Feb 2021 08:01:26 +0100
Subject: net: introduce CAN specific pointer in the struct net_device

Since 20dd3850bcf8 ("can: Speed up CAN frame receiption by using
ml_priv") the CAN framework uses per device specific data in the AF_CAN
protocol. For this purpose the struct net_device->ml_priv is used. Later
the ml_priv usage in CAN was extended for other users, one of them being
CAN_J1939.

Later in the kernel ml_priv was converted to an union, used by other
drivers. E.g. the tun driver started storing it's stats pointer.

Since tun devices can claim to be a CAN device, CAN specific protocols
will wrongly interpret this pointer, which will cause system crashes.
Mostly this issue is visible in the CAN_J1939 stack.

To fix this issue, we request a dedicated CAN pointer within the
net_device struct.

Reported-by: syzbot+5138c4dd15a0401bec7b@syzkaller.appspotmail.com
Fixes: 20dd3850bcf8 ("can: Speed up CAN frame receiption by using ml_priv")
Fixes: ffd956eef69b ("can: introduce CAN midlayer private and allocate it automatically")
Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol")
Fixes: 497a5757ce4e ("tun: switch to net core provided statistics counters")
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20210223070127.4538-1-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/can/dev/dev.c  |  4 +++-
 drivers/net/can/slcan.c    |  4 +++-
 drivers/net/can/vcan.c     |  2 +-
 drivers/net/can/vxcan.c    |  6 +++++-
 include/linux/can/can-ml.h | 12 ++++++++++++
 include/linux/netdevice.h  | 34 +++++++++++++++++++++++++++++++++-
 net/can/af_can.c           | 34 ++--------------------------------
 net/can/j1939/main.c       | 22 ++++++++--------------
 net/can/j1939/socket.c     | 13 ++++---------
 net/can/proc.c             | 19 +++++++++++++------
 10 files changed, 84 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c
index d9281ae853f8..311d8564d611 100644
--- a/drivers/net/can/dev/dev.c
+++ b/drivers/net/can/dev/dev.c
@@ -239,6 +239,7 @@ void can_setup(struct net_device *dev)
 struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
 				    unsigned int txqs, unsigned int rxqs)
 {
+	struct can_ml_priv *can_ml;
 	struct net_device *dev;
 	struct can_priv *priv;
 	int size;
@@ -270,7 +271,8 @@ struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max,
 	priv = netdev_priv(dev);
 	priv->dev = dev;
 
-	dev->ml_priv = (void *)priv + ALIGN(sizeof_priv, NETDEV_ALIGN);
+	can_ml = (void *)priv + ALIGN(sizeof_priv, NETDEV_ALIGN);
+	can_set_ml_priv(dev, can_ml);
 
 	if (echo_skb_max) {
 		priv->echo_skb_max = echo_skb_max;
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index a1bd1be09548..30c8d53c9745 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -516,6 +516,7 @@ static struct slcan *slc_alloc(void)
 	int i;
 	char name[IFNAMSIZ];
 	struct net_device *dev = NULL;
+	struct can_ml_priv *can_ml;
 	struct slcan       *sl;
 	int size;
 
@@ -538,7 +539,8 @@ static struct slcan *slc_alloc(void)
 
 	dev->base_addr  = i;
 	sl = netdev_priv(dev);
-	dev->ml_priv = (void *)sl + ALIGN(sizeof(*sl), NETDEV_ALIGN);
+	can_ml = (void *)sl + ALIGN(sizeof(*sl), NETDEV_ALIGN);
+	can_set_ml_priv(dev, can_ml);
 
 	/* Initialize channel control data */
 	sl->magic = SLCAN_MAGIC;
diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c
index 39ca14b0585d..067705e2850b 100644
--- a/drivers/net/can/vcan.c
+++ b/drivers/net/can/vcan.c
@@ -153,7 +153,7 @@ static void vcan_setup(struct net_device *dev)
 	dev->addr_len		= 0;
 	dev->tx_queue_len	= 0;
 	dev->flags		= IFF_NOARP;
-	dev->ml_priv		= netdev_priv(dev);
+	can_set_ml_priv(dev, netdev_priv(dev));
 
 	/* set flags according to driver capabilities */
 	if (echo)
diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c
index f9a524c5f6d6..8861a7d875e7 100644
--- a/drivers/net/can/vxcan.c
+++ b/drivers/net/can/vxcan.c
@@ -141,6 +141,8 @@ static const struct net_device_ops vxcan_netdev_ops = {
 
 static void vxcan_setup(struct net_device *dev)
 {
+	struct can_ml_priv *can_ml;
+
 	dev->type		= ARPHRD_CAN;
 	dev->mtu		= CANFD_MTU;
 	dev->hard_header_len	= 0;
@@ -149,7 +151,9 @@ static void vxcan_setup(struct net_device *dev)
 	dev->flags		= (IFF_NOARP|IFF_ECHO);
 	dev->netdev_ops		= &vxcan_netdev_ops;
 	dev->needs_free_netdev	= true;
-	dev->ml_priv		= netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN);
+
+	can_ml = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN);
+	can_set_ml_priv(dev, can_ml);
 }
 
 /* forward declaration for rtnl_create_link() */
diff --git a/include/linux/can/can-ml.h b/include/linux/can/can-ml.h
index 2f5d731ae251..8afa92d15a66 100644
--- a/include/linux/can/can-ml.h
+++ b/include/linux/can/can-ml.h
@@ -44,6 +44,7 @@
 
 #include <linux/can.h>
 #include <linux/list.h>
+#include <linux/netdevice.h>
 
 #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
 #define CAN_EFF_RCV_HASH_BITS 10
@@ -65,4 +66,15 @@ struct can_ml_priv {
 #endif
 };
 
+static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev)
+{
+	return netdev_get_ml_priv(dev, ML_PRIV_CAN);
+}
+
+static inline void can_set_ml_priv(struct net_device *dev,
+				   struct can_ml_priv *ml_priv)
+{
+	netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN);
+}
+
 #endif /* CAN_ML_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ddf4cfc12615..f06fbee8638e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1584,6 +1584,12 @@ enum netdev_priv_flags {
 #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
 #define IFF_LIVE_RENAME_OK		IFF_LIVE_RENAME_OK
 
+/* Specifies the type of the struct net_device::ml_priv pointer */
+enum netdev_ml_priv_type {
+	ML_PRIV_NONE,
+	ML_PRIV_CAN,
+};
+
 /**
  *	struct net_device - The DEVICE structure.
  *
@@ -1779,6 +1785,7 @@ enum netdev_priv_flags {
  * 	@nd_net:		Network namespace this network device is inside
  *
  * 	@ml_priv:	Mid-layer private
+ *	@ml_priv_type:  Mid-layer private type
  * 	@lstats:	Loopback statistics
  * 	@tstats:	Tunnel statistics
  * 	@dstats:	Dummy statistics
@@ -2094,8 +2101,10 @@ struct net_device {
 	possible_net_t			nd_net;
 
 	/* mid-layer private */
+	void				*ml_priv;
+	enum netdev_ml_priv_type	ml_priv_type;
+
 	union {
-		void					*ml_priv;
 		struct pcpu_lstats __percpu		*lstats;
 		struct pcpu_sw_netstats __percpu	*tstats;
 		struct pcpu_dstats __percpu		*dstats;
@@ -2286,6 +2295,29 @@ static inline void netdev_reset_rx_headroom(struct net_device *dev)
 	netdev_set_rx_headroom(dev, -1);
 }
 
+static inline void *netdev_get_ml_priv(struct net_device *dev,
+				       enum netdev_ml_priv_type type)
+{
+	if (dev->ml_priv_type != type)
+		return NULL;
+
+	return dev->ml_priv;
+}
+
+static inline void netdev_set_ml_priv(struct net_device *dev,
+				      void *ml_priv,
+				      enum netdev_ml_priv_type type)
+{
+	WARN(dev->ml_priv_type && dev->ml_priv_type != type,
+	     "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n",
+	     dev->ml_priv_type, type);
+	WARN(!dev->ml_priv_type && dev->ml_priv,
+	     "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n");
+
+	dev->ml_priv = ml_priv;
+	dev->ml_priv_type = type;
+}
+
 /*
  * Net namespace inlines
  */
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 837bb8af0ec3..cce2af10eb3e 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -304,8 +304,8 @@ static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net,
 							struct net_device *dev)
 {
 	if (dev) {
-		struct can_ml_priv *ml_priv = dev->ml_priv;
-		return &ml_priv->dev_rcv_lists;
+		struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+		return &can_ml->dev_rcv_lists;
 	} else {
 		return net->can.rx_alldev_list;
 	}
@@ -790,25 +790,6 @@ void can_proto_unregister(const struct can_proto *cp)
 }
 EXPORT_SYMBOL(can_proto_unregister);
 
-/* af_can notifier to create/remove CAN netdevice specific structs */
-static int can_notifier(struct notifier_block *nb, unsigned long msg,
-			void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
-	if (dev->type != ARPHRD_CAN)
-		return NOTIFY_DONE;
-
-	switch (msg) {
-	case NETDEV_REGISTER:
-		WARN(!dev->ml_priv,
-		     "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n");
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
 static int can_pernet_init(struct net *net)
 {
 	spin_lock_init(&net->can.rcvlists_lock);
@@ -876,11 +857,6 @@ static const struct net_proto_family can_family_ops = {
 	.owner  = THIS_MODULE,
 };
 
-/* notifier block for netdevice event */
-static struct notifier_block can_netdev_notifier __read_mostly = {
-	.notifier_call = can_notifier,
-};
-
 static struct pernet_operations can_pernet_ops __read_mostly = {
 	.init = can_pernet_init,
 	.exit = can_pernet_exit,
@@ -911,17 +887,12 @@ static __init int can_init(void)
 	err = sock_register(&can_family_ops);
 	if (err)
 		goto out_sock;
-	err = register_netdevice_notifier(&can_netdev_notifier);
-	if (err)
-		goto out_notifier;
 
 	dev_add_pack(&can_packet);
 	dev_add_pack(&canfd_packet);
 
 	return 0;
 
-out_notifier:
-	sock_unregister(PF_CAN);
 out_sock:
 	unregister_pernet_subsys(&can_pernet_ops);
 out_pernet:
@@ -935,7 +906,6 @@ static __exit void can_exit(void)
 	/* protocol unregister */
 	dev_remove_pack(&canfd_packet);
 	dev_remove_pack(&can_packet);
-	unregister_netdevice_notifier(&can_netdev_notifier);
 	sock_unregister(PF_CAN);
 
 	unregister_pernet_subsys(&can_pernet_ops);
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
index bb914d8b4216..da3a7a7bcff2 100644
--- a/net/can/j1939/main.c
+++ b/net/can/j1939/main.c
@@ -140,9 +140,9 @@ static struct j1939_priv *j1939_priv_create(struct net_device *ndev)
 static inline void j1939_priv_set(struct net_device *ndev,
 				  struct j1939_priv *priv)
 {
-	struct can_ml_priv *can_ml_priv = ndev->ml_priv;
+	struct can_ml_priv *can_ml = can_get_ml_priv(ndev);
 
-	can_ml_priv->j1939_priv = priv;
+	can_ml->j1939_priv = priv;
 }
 
 static void __j1939_priv_release(struct kref *kref)
@@ -211,12 +211,9 @@ static void __j1939_rx_release(struct kref *kref)
 /* get pointer to priv without increasing ref counter */
 static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev)
 {
-	struct can_ml_priv *can_ml_priv = ndev->ml_priv;
+	struct can_ml_priv *can_ml = can_get_ml_priv(ndev);
 
-	if (!can_ml_priv)
-		return NULL;
-
-	return can_ml_priv->j1939_priv;
+	return can_ml->j1939_priv;
 }
 
 static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev)
@@ -225,9 +222,6 @@ static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev)
 
 	lockdep_assert_held(&j1939_netdev_lock);
 
-	if (ndev->type != ARPHRD_CAN)
-		return NULL;
-
 	priv = j1939_ndev_to_priv(ndev);
 	if (priv)
 		j1939_priv_get(priv);
@@ -348,15 +342,16 @@ static int j1939_netdev_notify(struct notifier_block *nb,
 			       unsigned long msg, void *data)
 {
 	struct net_device *ndev = netdev_notifier_info_to_dev(data);
+	struct can_ml_priv *can_ml = can_get_ml_priv(ndev);
 	struct j1939_priv *priv;
 
+	if (!can_ml)
+		goto notify_done;
+
 	priv = j1939_priv_get_by_ndev(ndev);
 	if (!priv)
 		goto notify_done;
 
-	if (ndev->type != ARPHRD_CAN)
-		goto notify_put;
-
 	switch (msg) {
 	case NETDEV_DOWN:
 		j1939_cancel_active_session(priv, NULL);
@@ -365,7 +360,6 @@ static int j1939_netdev_notify(struct notifier_block *nb,
 		break;
 	}
 
-notify_put:
 	j1939_priv_put(priv);
 
 notify_done:
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index f23966526a88..56aa66147d5a 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -12,6 +12,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/can/can-ml.h>
 #include <linux/can/core.h>
 #include <linux/can/skb.h>
 #include <linux/errqueue.h>
@@ -453,6 +454,7 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
 		j1939_jsk_del(priv, jsk);
 		j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa);
 	} else {
+		struct can_ml_priv *can_ml;
 		struct net_device *ndev;
 
 		ndev = dev_get_by_index(net, addr->can_ifindex);
@@ -461,15 +463,8 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
 			goto out_release_sock;
 		}
 
-		if (ndev->type != ARPHRD_CAN) {
-			dev_put(ndev);
-			ret = -ENODEV;
-			goto out_release_sock;
-		}
-
-		if (!ndev->ml_priv) {
-			netdev_warn_once(ndev,
-					 "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n");
+		can_ml = can_get_ml_priv(ndev);
+		if (!can_ml) {
 			dev_put(ndev);
 			ret = -ENODEV;
 			goto out_release_sock;
diff --git a/net/can/proc.c b/net/can/proc.c
index 5ea8695f507e..b15760b5c1cc 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -322,8 +322,11 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v)
 
 	/* receive list for registered CAN devices */
 	for_each_netdev_rcu(net, dev) {
-		if (dev->type == ARPHRD_CAN && dev->ml_priv)
-			can_rcvlist_proc_show_one(m, idx, dev, dev->ml_priv);
+		struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+		if (can_ml)
+			can_rcvlist_proc_show_one(m, idx, dev,
+						  &can_ml->dev_rcv_lists);
 	}
 
 	rcu_read_unlock();
@@ -375,8 +378,10 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
 
 	/* sff receive list for registered CAN devices */
 	for_each_netdev_rcu(net, dev) {
-		if (dev->type == ARPHRD_CAN && dev->ml_priv) {
-			dev_rcv_lists = dev->ml_priv;
+		struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+		if (can_ml) {
+			dev_rcv_lists = &can_ml->dev_rcv_lists;
 			can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_sff,
 						    ARRAY_SIZE(dev_rcv_lists->rx_sff));
 		}
@@ -406,8 +411,10 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
 
 	/* eff receive list for registered CAN devices */
 	for_each_netdev_rcu(net, dev) {
-		if (dev->type == ARPHRD_CAN && dev->ml_priv) {
-			dev_rcv_lists = dev->ml_priv;
+		struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+		if (can_ml) {
+			dev_rcv_lists = &can_ml->dev_rcv_lists;
 			can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_eff,
 						    ARRAY_SIZE(dev_rcv_lists->rx_eff));
 		}
-- 
cgit v1.2.3


From 44835d20b2a0c9b4c0c3fb96e90f4e2fd4a4e41d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:15:36 -0800
Subject: mm: add FGP_ENTRY

The functionality of find_lock_entry() and find_get_entry() can be
provided by pagecache_get_page(), which lets us delete find_lock_entry()
and make find_get_entry() static.

Link: https://lkml.kernel.org/r/20201112212641.27837-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  1 +
 mm/filemap.c            | 44 ++++++++------------------------------------
 mm/internal.h           |  3 ---
 mm/shmem.c              |  3 ++-
 mm/swap_state.c         |  3 ++-
 5 files changed, 13 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bd629d676a27..b379b2388202 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -315,6 +315,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 #define FGP_NOWAIT		0x00000020
 #define FGP_FOR_MMAP		0x00000040
 #define FGP_HEAD		0x00000080
+#define FGP_ENTRY		0x00000100
 
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
 		int fgp_flags, gfp_t cache_gfp_mask);
diff --git a/mm/filemap.c b/mm/filemap.c
index 57eae5163bce..84b7813badf1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1658,7 +1658,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
 }
 EXPORT_SYMBOL(page_cache_prev_miss);
 
-/**
+/*
  * find_get_entry - find and get a page cache entry
  * @mapping: the address_space to search
  * @index: The page cache index.
@@ -1671,7 +1671,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
  *
  * Return: The head page or shadow entry, %NULL if nothing is found.
  */
-struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
+static struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
 {
 	XA_STATE(xas, &mapping->i_pages, index);
 	struct page *page;
@@ -1707,39 +1707,6 @@ out:
 	return page;
 }
 
-/**
- * find_lock_entry - Locate and lock a page cache entry.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * Looks up the page at @mapping & @index.  If there is a page in the
- * cache, the head page is returned locked and with an increased refcount.
- *
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
- *
- * Context: May sleep.
- * Return: The head page or shadow entry, %NULL if nothing is found.
- */
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
-{
-	struct page *page;
-
-repeat:
-	page = find_get_entry(mapping, index);
-	if (page && !xa_is_value(page)) {
-		lock_page(page);
-		/* Has the page been truncated? */
-		if (unlikely(page->mapping != mapping)) {
-			unlock_page(page);
-			put_page(page);
-			goto repeat;
-		}
-		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
-	}
-	return page;
-}
-
 /**
  * pagecache_get_page - Find and get a reference to a page.
  * @mapping: The address_space to search.
@@ -1755,6 +1722,8 @@ repeat:
  * * %FGP_LOCK - The page is returned locked.
  * * %FGP_HEAD - If the page is present and a THP, return the head page
  *   rather than the exact page specified by the index.
+ * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
+ *   instead of allocating a new page to replace it.
  * * %FGP_CREAT - If no page is present then a new page is allocated using
  *   @gfp_mask and added to the page cache and the VM's LRU list.
  *   The page is returned locked and with an increased refcount.
@@ -1779,8 +1748,11 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
 
 repeat:
 	page = find_get_entry(mapping, index);
-	if (xa_is_value(page))
+	if (xa_is_value(page)) {
+		if (fgp_flags & FGP_ENTRY)
+			return page;
 		page = NULL;
+	}
 	if (!page)
 		goto no_page;
 
diff --git a/mm/internal.h b/mm/internal.h
index 25d2b2439f19..eed74f1e6147 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -60,9 +60,6 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
 	force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
 }
 
-struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t index);
-
 /**
  * page_evictable - test whether a page is evictable
  * @page: the page to test
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ea1fa53db3f..bd5bb78128af 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1812,7 +1812,8 @@ repeat:
 	sbinfo = SHMEM_SB(inode->i_sb);
 	charge_mm = vma ? vma->vm_mm : current->mm;
 
-	page = find_lock_entry(mapping, index);
+	page = pagecache_get_page(mapping, index,
+					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
 	if (xa_is_value(page)) {
 		error = shmem_swapin_page(inode, index, &page,
 					  sgp, gfp, vma, fault_type);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f270c30d4681..3cdee7b11da9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -403,7 +403,8 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
 {
 	swp_entry_t swp;
 	struct swap_info_struct *si;
-	struct page *page = find_get_entry(mapping, index);
+	struct page *page = pagecache_get_page(mapping, index,
+						FGP_ENTRY | FGP_HEAD, 0);
 
 	if (!page)
 		return page;
-- 
cgit v1.2.3


From 41139aa4c3a31ee7e072fc63353c74035aade2ff Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:15:48 -0800
Subject: mm/filemap: add mapping_seek_hole_data

Rewrite shmem_seek_hole_data() and move it to filemap.c.

[willy@infradead.org: don't put an xa_is_value() page]
  Link: https://lkml.kernel.org/r/20201124041507.28996-4-willy@infradead.org

Link: https://lkml.kernel.org/r/20201112212641.27837-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  2 ++
 mm/filemap.c            | 76 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/shmem.c              | 74 +++--------------------------------------------
 3 files changed, 82 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b379b2388202..3608993428d9 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -760,6 +760,8 @@ extern void __delete_from_page_cache(struct page *page, void *shadow);
 void replace_page_cache_page(struct page *old, struct page *new);
 void delete_from_page_cache_batch(struct address_space *mapping,
 				  struct pagevec *pvec);
+loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
+		int whence);
 
 /*
  * Like add_to_page_cache_locked, but used to add newly allocated pages:
diff --git a/mm/filemap.c b/mm/filemap.c
index 21443850aeae..eff3006be12a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2553,6 +2553,82 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 }
 EXPORT_SYMBOL(generic_file_read_iter);
 
+static inline bool page_seek_match(struct page *page, bool seek_data)
+{
+	if (xa_is_value(page) || PageUptodate(page))
+		return seek_data;
+	return !seek_data;
+}
+
+static inline
+unsigned int seek_page_size(struct xa_state *xas, struct page *page)
+{
+	if (xa_is_value(page))
+		return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
+	return thp_size(page);
+}
+
+/**
+ * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ * @mapping: Address space to search.
+ * @start: First byte to consider.
+ * @end: Limit of search (exclusive).
+ * @whence: Either SEEK_HOLE or SEEK_DATA.
+ *
+ * If the page cache knows which blocks contain holes and which blocks
+ * contain data, your filesystem can use this function to implement
+ * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
+ * entirely memory-based such as tmpfs, and filesystems which support
+ * unwritten extents.
+ *
+ * Return: The requested offset on successs, or -ENXIO if @whence specifies
+ * SEEK_DATA and there is no data after @start.  There is an implicit hole
+ * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
+ * and @end contain data.
+ */
+loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
+		loff_t end, int whence)
+{
+	XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
+	pgoff_t max = (end - 1) / PAGE_SIZE;
+	bool seek_data = (whence == SEEK_DATA);
+	struct page *page;
+
+	if (end <= start)
+		return -ENXIO;
+
+	rcu_read_lock();
+	while ((page = find_get_entry(&xas, max, XA_PRESENT))) {
+		loff_t pos = xas.xa_index * PAGE_SIZE;
+
+		if (start < pos) {
+			if (!seek_data)
+				goto unlock;
+			start = pos;
+		}
+
+		if (page_seek_match(page, seek_data))
+			goto unlock;
+		start = pos + seek_page_size(&xas, page);
+		if (!xa_is_value(page))
+			put_page(page);
+	}
+	rcu_read_unlock();
+
+	if (seek_data)
+		return -ENXIO;
+	goto out;
+
+unlock:
+	rcu_read_unlock();
+	if (!xa_is_value(page))
+		put_page(page);
+out:
+	if (start > end)
+		return end;
+	return start;
+}
+
 #ifdef CONFIG_MMU
 #define MMAP_LOTSAMISS  (100)
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index bd5bb78128af..deb22e128435 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2668,86 +2668,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return retval ? retval : error;
 }
 
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the page cache.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
-				    pgoff_t index, pgoff_t end, int whence)
-{
-	struct page *page;
-	struct pagevec pvec;
-	pgoff_t indices[PAGEVEC_SIZE];
-	bool done = false;
-	int i;
-
-	pagevec_init(&pvec);
-	pvec.nr = 1;		/* start small: we may be there already */
-	while (!done) {
-		pvec.nr = find_get_entries(mapping, index,
-					pvec.nr, pvec.pages, indices);
-		if (!pvec.nr) {
-			if (whence == SEEK_DATA)
-				index = end;
-			break;
-		}
-		for (i = 0; i < pvec.nr; i++, index++) {
-			if (index < indices[i]) {
-				if (whence == SEEK_HOLE) {
-					done = true;
-					break;
-				}
-				index = indices[i];
-			}
-			page = pvec.pages[i];
-			if (page && !xa_is_value(page)) {
-				if (!PageUptodate(page))
-					page = NULL;
-			}
-			if (index >= end ||
-			    (page && whence == SEEK_DATA) ||
-			    (!page && whence == SEEK_HOLE)) {
-				done = true;
-				break;
-			}
-		}
-		pagevec_remove_exceptionals(&pvec);
-		pagevec_release(&pvec);
-		pvec.nr = PAGEVEC_SIZE;
-		cond_resched();
-	}
-	return index;
-}
-
 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	pgoff_t start, end;
-	loff_t new_offset;
 
 	if (whence != SEEK_DATA && whence != SEEK_HOLE)
 		return generic_file_llseek_size(file, offset, whence,
 					MAX_LFS_FILESIZE, i_size_read(inode));
+	if (offset < 0)
+		return -ENXIO;
+
 	inode_lock(inode);
 	/* We're holding i_mutex so we can access i_size directly */
-
-	if (offset < 0 || offset >= inode->i_size)
-		offset = -ENXIO;
-	else {
-		start = offset >> PAGE_SHIFT;
-		end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		new_offset = shmem_seek_hole_data(mapping, start, end, whence);
-		new_offset <<= PAGE_SHIFT;
-		if (new_offset > offset) {
-			if (new_offset < inode->i_size)
-				offset = new_offset;
-			else if (whence == SEEK_DATA)
-				offset = -ENXIO;
-			else
-				offset = inode->i_size;
-		}
-	}
-
+	offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
 	if (offset >= 0)
 		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
 	inode_unlock(inode);
-- 
cgit v1.2.3


From ca122fe40eb463c8c11c3bfc1914f0048ca5c268 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:16:00 -0800
Subject: mm: add an 'end' parameter to find_get_entries

This simplifies the callers and leads to a more efficient implementation
since the XArray has this functionality already.

Link: https://lkml.kernel.org/r/20201112212641.27837-11-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  4 ++--
 mm/filemap.c            |  9 +++++----
 mm/shmem.c              | 10 ++--------
 mm/swap.c               |  2 +-
 4 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3608993428d9..fdb2c4e44851 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -451,8 +451,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
 }
 
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
-			  unsigned int nr_entries, struct page **entries,
-			  pgoff_t *indices);
+		pgoff_t end, unsigned int nr_entries, struct page **entries,
+		pgoff_t *indices);
 unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
 			pgoff_t end, unsigned int nr_pages,
 			struct page **pages);
diff --git a/mm/filemap.c b/mm/filemap.c
index 61fdcdc75275..65cfdff17ac6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1865,6 +1865,7 @@ reset:
  * find_get_entries - gang pagecache lookup
  * @mapping:	The address_space to search
  * @start:	The starting page cache index
+ * @end:	The final page index (inclusive).
  * @nr_entries:	The maximum number of entries
  * @entries:	Where the resulting entries are placed
  * @indices:	The cache indices corresponding to the entries in @entries
@@ -1888,9 +1889,9 @@ reset:
  *
  * Return: the number of pages and shadow entries which were found.
  */
-unsigned find_get_entries(struct address_space *mapping,
-			  pgoff_t start, unsigned int nr_entries,
-			  struct page **entries, pgoff_t *indices)
+unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+		pgoff_t end, unsigned int nr_entries, struct page **entries,
+		pgoff_t *indices)
 {
 	XA_STATE(xas, &mapping->i_pages, start);
 	struct page *page;
@@ -1900,7 +1901,7 @@ unsigned find_get_entries(struct address_space *mapping,
 		return 0;
 
 	rcu_read_lock();
-	while ((page = find_get_entry(&xas, ULONG_MAX, XA_PRESENT))) {
+	while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
 		/*
 		 * Terminate early on finding a THP, to allow the caller to
 		 * handle it all at once; but continue if this is hugetlbfs.
diff --git a/mm/shmem.c b/mm/shmem.c
index 86b1f5bc502c..4aac760aa2d4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -913,8 +913,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index >= end)
-				break;
 
 			if (xa_is_value(page)) {
 				if (unfalloc)
@@ -967,9 +965,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 	while (index < end) {
 		cond_resched();
 
-		pvec.nr = find_get_entries(mapping, index,
-				min(end - index, (pgoff_t)PAGEVEC_SIZE),
-				pvec.pages, indices);
+		pvec.nr = find_get_entries(mapping, index, end - 1,
+				PAGEVEC_SIZE, pvec.pages, indices);
 		if (!pvec.nr) {
 			/* If all gone or hole-punch or unfalloc, we're done */
 			if (index == start || end != -1)
@@ -982,9 +979,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			struct page *page = pvec.pages[i];
 
 			index = indices[i];
-			if (index >= end)
-				break;
-
 			if (xa_is_value(page)) {
 				if (unfalloc)
 					continue;
diff --git a/mm/swap.c b/mm/swap.c
index ab3258afcbeb..c5773a84feab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1046,7 +1046,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
 				pgoff_t start, unsigned nr_entries,
 				pgoff_t *indices)
 {
-	pvec->nr = find_get_entries(mapping, start, nr_entries,
+	pvec->nr = find_get_entries(mapping, start, ULONG_MAX, nr_entries,
 				    pvec->pages, indices);
 	return pagevec_count(pvec);
 }
-- 
cgit v1.2.3


From 31d270fd98d196578223e5b568a0bd3bc6028b09 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:16:03 -0800
Subject: mm: add an 'end' parameter to pagevec_lookup_entries

Simplifies the callers and uses the existing functionality in
find_get_entries().  We can also drop the final argument of
truncate_exceptional_pvec_entries() and simplify the logic in that
function.

Link: https://lkml.kernel.org/r/20201112212641.27837-12-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h |  5 ++---
 mm/swap.c               |  8 ++++----
 mm/truncate.c           | 41 ++++++++++-------------------------------
 3 files changed, 16 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index ad4ddc17d403..f70a9dc81504 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -26,9 +26,8 @@ struct pagevec {
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
-				struct address_space *mapping,
-				pgoff_t start, unsigned nr_entries,
-				pgoff_t *indices);
+		struct address_space *mapping, pgoff_t start, pgoff_t end,
+		unsigned nr_entries, pgoff_t *indices);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
 unsigned pagevec_lookup_range(struct pagevec *pvec,
 			      struct address_space *mapping,
diff --git a/mm/swap.c b/mm/swap.c
index c5773a84feab..db8c354264a5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1022,6 +1022,7 @@ void __pagevec_lru_add(struct pagevec *pvec)
  * @pvec:	Where the resulting entries are placed
  * @mapping:	The address_space to search
  * @start:	The starting entry index
+ * @end:	The highest index to return (inclusive).
  * @nr_entries:	The maximum number of pages
  * @indices:	The cache indices corresponding to the entries in @pvec
  *
@@ -1042,11 +1043,10 @@ void __pagevec_lru_add(struct pagevec *pvec)
  * found.
  */
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
-				struct address_space *mapping,
-				pgoff_t start, unsigned nr_entries,
-				pgoff_t *indices)
+		struct address_space *mapping, pgoff_t start, pgoff_t end,
+		unsigned nr_entries, pgoff_t *indices)
 {
-	pvec->nr = find_get_entries(mapping, start, ULONG_MAX, nr_entries,
+	pvec->nr = find_get_entries(mapping, start, end, nr_entries,
 				    pvec->pages, indices);
 	return pagevec_count(pvec);
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index de7f4f47f780..60df23890c2d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -57,11 +57,10 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
  * exceptional entries similar to what pagevec_remove_exceptionals does.
  */
 static void truncate_exceptional_pvec_entries(struct address_space *mapping,
-				struct pagevec *pvec, pgoff_t *indices,
-				pgoff_t end)
+				struct pagevec *pvec, pgoff_t *indices)
 {
 	int i, j;
-	bool dax, lock;
+	bool dax;
 
 	/* Handled by shmem itself */
 	if (shmem_mapping(mapping))
@@ -75,8 +74,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
 		return;
 
 	dax = dax_mapping(mapping);
-	lock = !dax && indices[j] < end;
-	if (lock)
+	if (!dax)
 		xa_lock_irq(&mapping->i_pages);
 
 	for (i = j; i < pagevec_count(pvec); i++) {
@@ -88,9 +86,6 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
 			continue;
 		}
 
-		if (index >= end)
-			continue;
-
 		if (unlikely(dax)) {
 			dax_delete_mapping_entry(mapping, index);
 			continue;
@@ -99,7 +94,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
 		__clear_shadow_entry(mapping, index, page);
 	}
 
-	if (lock)
+	if (!dax)
 		xa_unlock_irq(&mapping->i_pages);
 	pvec->nr = j;
 }
@@ -329,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	while (index < end && find_lock_entries(mapping, index, end - 1,
 			&pvec, indices)) {
 		index = indices[pagevec_count(&pvec) - 1] + 1;
-		truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
+		truncate_exceptional_pvec_entries(mapping, &pvec, indices);
 		for (i = 0; i < pagevec_count(&pvec); i++)
 			truncate_cleanup_page(mapping, pvec.pages[i]);
 		delete_from_page_cache_batch(mapping, &pvec);
@@ -381,8 +376,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
-		if (!pagevec_lookup_entries(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
+		if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1,
+				PAGEVEC_SIZE, indices)) {
 			/* If all gone from start onwards, we're done */
 			if (index == start)
 				break;
@@ -390,23 +385,12 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			index = start;
 			continue;
 		}
-		if (index == start && indices[0] >= end) {
-			/* All gone out of hole to be punched, we're done */
-			pagevec_remove_exceptionals(&pvec);
-			pagevec_release(&pvec);
-			break;
-		}
 
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
 			/* We rely upon deletion not changing page->index */
 			index = indices[i];
-			if (index >= end) {
-				/* Restart punch to make sure all gone */
-				index = start - 1;
-				break;
-			}
 
 			if (xa_is_value(page))
 				continue;
@@ -417,7 +401,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			truncate_inode_page(mapping, page);
 			unlock_page(page);
 		}
-		truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
+		truncate_exceptional_pvec_entries(mapping, &pvec, indices);
 		pagevec_release(&pvec);
 		index++;
 	}
@@ -513,8 +497,6 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
 
 			/* We rely upon deletion not changing page->index */
 			index = indices[i];
-			if (index > end)
-				break;
 
 			if (xa_is_value(page)) {
 				invalidate_exceptional_entry(mapping, index,
@@ -656,16 +638,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 
 	pagevec_init(&pvec);
 	index = start;
-	while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
-			indices)) {
+	while (pagevec_lookup_entries(&pvec, mapping, index, end,
+			PAGEVEC_SIZE, indices)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
 			/* We rely upon deletion not changing page->index */
 			index = indices[i];
-			if (index > end)
-				break;
 
 			if (xa_is_value(page)) {
 				if (!invalidate_exceptional_entry2(mapping,
-- 
cgit v1.2.3


From 38cefeb33749992ceaad6ea40e12f92aa8f8e28f Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:16:07 -0800
Subject: mm: remove nr_entries parameter from pagevec_lookup_entries

All callers want to fetch the full size of the pvec.

Link: https://lkml.kernel.org/r/20201112212641.27837-13-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h | 2 +-
 mm/swap.c               | 4 ++--
 mm/truncate.c           | 5 ++---
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index f70a9dc81504..72c5ea2e708d 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -27,7 +27,7 @@ void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t start, pgoff_t end,
-		unsigned nr_entries, pgoff_t *indices);
+		pgoff_t *indices);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
 unsigned pagevec_lookup_range(struct pagevec *pvec,
 			      struct address_space *mapping,
diff --git a/mm/swap.c b/mm/swap.c
index db8c354264a5..cd9e1ed7e78f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1044,9 +1044,9 @@ void __pagevec_lru_add(struct pagevec *pvec)
  */
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t start, pgoff_t end,
-		unsigned nr_entries, pgoff_t *indices)
+		pgoff_t *indices)
 {
-	pvec->nr = find_get_entries(mapping, start, end, nr_entries,
+	pvec->nr = find_get_entries(mapping, start, end, PAGEVEC_SIZE,
 				    pvec->pages, indices);
 	return pagevec_count(pvec);
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index 60df23890c2d..41e7377ad58d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -377,7 +377,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	for ( ; ; ) {
 		cond_resched();
 		if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1,
-				PAGEVEC_SIZE, indices)) {
+				indices)) {
 			/* If all gone from start onwards, we're done */
 			if (index == start)
 				break;
@@ -638,8 +638,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 
 	pagevec_init(&pvec);
 	index = start;
-	while (pagevec_lookup_entries(&pvec, mapping, index, end,
-			PAGEVEC_SIZE, indices)) {
+	while (pagevec_lookup_entries(&pvec, mapping, index, end, indices)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
-- 
cgit v1.2.3


From cf2039af1a2eee58fdbfa68bc0c9123e77477645 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:16:11 -0800
Subject: mm: pass pvec directly to find_get_entries

All callers of find_get_entries() use a pvec, so pass it directly instead
of manipulating it in the caller.

Link: https://lkml.kernel.org/r/20201112212641.27837-14-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h |  3 +--
 mm/filemap.c            | 21 +++++++++------------
 mm/shmem.c              |  5 ++---
 mm/swap.c               |  4 +---
 4 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index fdb2c4e44851..20225b067583 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -451,8 +451,7 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
 }
 
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
-		pgoff_t end, unsigned int nr_entries, struct page **entries,
-		pgoff_t *indices);
+		pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
 unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
 			pgoff_t end, unsigned int nr_pages,
 			struct page **pages);
diff --git a/mm/filemap.c b/mm/filemap.c
index 65cfdff17ac6..43700480d897 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1866,14 +1866,12 @@ reset:
  * @mapping:	The address_space to search
  * @start:	The starting page cache index
  * @end:	The final page index (inclusive).
- * @nr_entries:	The maximum number of entries
- * @entries:	Where the resulting entries are placed
+ * @pvec:	Where the resulting entries are placed.
  * @indices:	The cache indices corresponding to the entries in @entries
  *
- * find_get_entries() will search for and return a group of up to
- * @nr_entries entries in the mapping.  The entries are placed at
- * @entries.  find_get_entries() takes a reference against any actual
- * pages it returns.
+ * find_get_entries() will search for and return a batch of entries in
+ * the mapping.  The entries are placed in @pvec.  find_get_entries()
+ * takes a reference on any actual pages it returns.
  *
  * The search returns a group of mapping-contiguous page cache entries
  * with ascending indexes.  There may be holes in the indices due to
@@ -1890,15 +1888,12 @@ reset:
  * Return: the number of pages and shadow entries which were found.
  */
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
-		pgoff_t end, unsigned int nr_entries, struct page **entries,
-		pgoff_t *indices)
+		pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
 {
 	XA_STATE(xas, &mapping->i_pages, start);
 	struct page *page;
 	unsigned int ret = 0;
-
-	if (!nr_entries)
-		return 0;
+	unsigned nr_entries = PAGEVEC_SIZE;
 
 	rcu_read_lock();
 	while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
@@ -1913,11 +1908,13 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
 		}
 
 		indices[ret] = xas.xa_index;
-		entries[ret] = page;
+		pvec->pages[ret] = page;
 		if (++ret == nr_entries)
 			break;
 	}
 	rcu_read_unlock();
+
+	pvec->nr = ret;
 	return ret;
 }
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 4aac760aa2d4..ee8f21832f98 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -965,9 +965,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 	while (index < end) {
 		cond_resched();
 
-		pvec.nr = find_get_entries(mapping, index, end - 1,
-				PAGEVEC_SIZE, pvec.pages, indices);
-		if (!pvec.nr) {
+		if (!find_get_entries(mapping, index, end - 1, &pvec,
+				indices)) {
 			/* If all gone or hole-punch or unfalloc, we're done */
 			if (index == start || end != -1)
 				break;
diff --git a/mm/swap.c b/mm/swap.c
index cd9e1ed7e78f..d20a746a831e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1046,9 +1046,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec,
 		struct address_space *mapping, pgoff_t start, pgoff_t end,
 		pgoff_t *indices)
 {
-	pvec->nr = find_get_entries(mapping, start, end, PAGEVEC_SIZE,
-				    pvec->pages, indices);
-	return pagevec_count(pvec);
+	return find_get_entries(mapping, start, end, pvec, indices);
 }
 
 /**
-- 
cgit v1.2.3


From a656a20241f08be532539c7d5bd82df741c2d487 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 25 Feb 2021 17:16:14 -0800
Subject: mm: remove pagevec_lookup_entries

pagevec_lookup_entries() is now just a wrapper around find_get_entries()
so remove it and convert all its callers.

Link: https://lkml.kernel.org/r/20201112212641.27837-15-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h |  3 ---
 mm/swap.c               | 36 ++----------------------------------
 mm/truncate.c           |  4 ++--
 3 files changed, 4 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 72c5ea2e708d..7f3f19065a9f 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -25,9 +25,6 @@ struct pagevec {
 
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
-unsigned pagevec_lookup_entries(struct pagevec *pvec,
-		struct address_space *mapping, pgoff_t start, pgoff_t end,
-		pgoff_t *indices);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
 unsigned pagevec_lookup_range(struct pagevec *pvec,
 			      struct address_space *mapping,
diff --git a/mm/swap.c b/mm/swap.c
index d20a746a831e..31b844d4ed94 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1017,44 +1017,12 @@ void __pagevec_lru_add(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
-/**
- * pagevec_lookup_entries - gang pagecache lookup
- * @pvec:	Where the resulting entries are placed
- * @mapping:	The address_space to search
- * @start:	The starting entry index
- * @end:	The highest index to return (inclusive).
- * @nr_entries:	The maximum number of pages
- * @indices:	The cache indices corresponding to the entries in @pvec
- *
- * pagevec_lookup_entries() will search for and return a group of up
- * to @nr_pages pages and shadow entries in the mapping.  All
- * entries are placed in @pvec.  pagevec_lookup_entries() takes a
- * reference against actual pages in @pvec.
- *
- * The search returns a group of mapping-contiguous entries with
- * ascending indexes.  There may be holes in the indices due to
- * not-present entries.
- *
- * Only one subpage of a Transparent Huge Page is returned in one call:
- * allowing truncate_inode_pages_range() to evict the whole THP without
- * cycling through a pagevec of extra references.
- *
- * pagevec_lookup_entries() returns the number of entries which were
- * found.
- */
-unsigned pagevec_lookup_entries(struct pagevec *pvec,
-		struct address_space *mapping, pgoff_t start, pgoff_t end,
-		pgoff_t *indices)
-{
-	return find_get_entries(mapping, start, end, pvec, indices);
-}
-
 /**
  * pagevec_remove_exceptionals - pagevec exceptionals pruning
  * @pvec:	The pagevec to prune
  *
- * pagevec_lookup_entries() fills both pages and exceptional radix
- * tree entries into the pagevec.  This function prunes all
+ * find_get_entries() fills both pages and XArray value entries (aka
+ * exceptional entries) into the pagevec.  This function prunes all
  * exceptionals from @pvec without leaving holes, so that it can be
  * passed on to page-only pagevec operations.
  */
diff --git a/mm/truncate.c b/mm/truncate.c
index 41e7377ad58d..455944264663 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -376,7 +376,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	index = start;
 	for ( ; ; ) {
 		cond_resched();
-		if (!pagevec_lookup_entries(&pvec, mapping, index, end - 1,
+		if (!find_get_entries(mapping, index, end - 1, &pvec,
 				indices)) {
 			/* If all gone from start onwards, we're done */
 			if (index == start)
@@ -638,7 +638,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 
 	pagevec_init(&pvec);
 	index = start;
-	while (pagevec_lookup_entries(&pvec, mapping, index, end, indices)) {
+	while (find_get_entries(mapping, index, end, &pvec, indices)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
-- 
cgit v1.2.3


From 164cc4fef4456727466f8e35bb654c3994748070 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 25 Feb 2021 17:16:18 -0800
Subject: mm,thp,shmem: limit shmem THP alloc gfp_mask

Patch series "mm,thp,shm: limit shmem THP alloc gfp_mask", v6.

The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help the system from getting bogged down in the page reclaim and
compaction code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations were not limited by those
configuration settings, and some workloads ended up with all CPUs stuck on
the LRU lock in the page reclaim code, trying to allocate dozens of THPs
simultaneously.

This patch applies the same configurated limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

This way a THP defrag setting of "never" or "defer+madvise" will result in
quick allocation failures without direct reclaim when no 2MB free pages
are available.

With this patch applied, THP allocations for tmpfs will be a little more
aggressive than today for files mmapped with MADV_HUGEPAGE, and a little
less aggressive for files that are not mmapped or mapped without that
flag.

This patch (of 4):

The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help the system from getting bogged down in the page reclaim and
compaction code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations were not limited by those
configuration settings, and some workloads ended up with all CPUs stuck on
the LRU lock in the page reclaim code, trying to allocate dozens of THPs
simultaneously.

This patch applies the same configurated limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

Controlling the gfp_mask of THP allocations through the knobs in sysfs
allows users to determine the balance between how aggressively the system
tries to allocate THPs at fault time, and how much the application may end
up stalling attempting those allocations.

This way a THP defrag setting of "never" or "defer+madvise" will result in
quick allocation failures without direct reclaim when no 2MB free pages
are available.

With this patch applied, THP allocations for tmpfs will be a little more
aggressive than today for files mmapped with MADV_HUGEPAGE, and a little
less aggressive for files that are not mmapped or mapped without that
flag.

Link: https://lkml.kernel.org/r/20201124194925.623931-1-riel@surriel.com
Link: https://lkml.kernel.org/r/20201124194925.623931-2-riel@surriel.com
Signed-off-by: Rik van Riel <riel@surriel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Xu Yu <xuyu@linux.alibaba.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 2 ++
 mm/huge_memory.c    | 6 +++---
 mm/shmem.c          | 8 +++++---
 3 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 220cd553a9e7..8572a1474e16 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -634,6 +634,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
+extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
+
 #ifdef CONFIG_PM_SLEEP
 extern bool pm_suspended_storage(void);
 #else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d77605c30f2e..395c75111d33 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -668,9 +668,9 @@ release:
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
 {
-	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
 
 	/* Always do synchronous compaction */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
@@ -762,7 +762,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		}
 		return ret;
 	}
-	gfp = alloc_hugepage_direct_gfpmask(vma);
+	gfp = vma_thp_gfp_mask(vma);
 	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
diff --git a/mm/shmem.c b/mm/shmem.c
index ee8f21832f98..596009a44431 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1519,8 +1519,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
 		return NULL;
 
 	shmem_pseudo_vma_init(&pvma, info, hindex);
-	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
+			       true);
 	shmem_pseudo_vma_destroy(&pvma);
 	if (page)
 		prep_transhuge_page(page);
@@ -1776,6 +1776,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct page *page;
 	enum sgp_type sgp_huge = sgp;
 	pgoff_t hindex = index;
+	gfp_t huge_gfp;
 	int error;
 	int once = 0;
 	int alloced = 0;
@@ -1862,7 +1863,8 @@ repeat:
 	}
 
 alloc_huge:
-	page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+	huge_gfp = vma_thp_gfp_mask(vma);
+	page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
 	if (IS_ERR(page)) {
 alloc_nohuge:
 		page = shmem_alloc_and_acct_page(gfp, inode,
-- 
cgit v1.2.3


From cd89fb06509903f942a0ffe97ffa63034671ed0c Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 25 Feb 2021 17:16:25 -0800
Subject: mm,thp,shmem: make khugepaged obey tmpfs mount flags

Currently if thp enabled=[madvise], mounting a tmpfs filesystem with
huge=always and mmapping files from that tmpfs does not result in
khugepaged collapsing those mappings, despite the mount flag indicating
that it should.

Fix that by breaking up the blocks of tests in hugepage_vma_check a little
bit, and testing things in the correct order.

Link: https://lkml.kernel.org/r/20201124194925.623931-4-riel@surriel.com
Fixes: c2231020ea7b ("mm: thp: register mm for khugepaged when merging vma for shmem")
Signed-off-by: Rik van Riel <riel@surriel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xu Yu <xuyu@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/khugepaged.h |  2 ++
 mm/khugepaged.c            | 22 ++++++++++++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index c941b7377321..2fcc01891b47 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -3,6 +3,7 @@
 #define _LINUX_KHUGEPAGED_H
 
 #include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
+#include <linux/shmem_fs.h>
 
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -57,6 +58,7 @@ static inline int khugepaged_enter(struct vm_area_struct *vma,
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
 		if ((khugepaged_always() ||
+		     (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) ||
 		     (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
 		    !(vm_flags & VM_NOHUGEPAGE) &&
 		    !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 75e246f680f4..a7d6cb912b05 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -442,18 +442,28 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
 static bool hugepage_vma_check(struct vm_area_struct *vma,
 			       unsigned long vm_flags)
 {
-	if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
-	    (vm_flags & VM_NOHUGEPAGE) ||
+	/* Explicitly disabled through madvise. */
+	if ((vm_flags & VM_NOHUGEPAGE) ||
 	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 		return false;
 
-	if (shmem_file(vma->vm_file) ||
-	    (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
-	     vma->vm_file &&
-	     (vm_flags & VM_DENYWRITE))) {
+	/* Enabled via shmem mount options or sysfs settings. */
+	if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
 		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
 				HPAGE_PMD_NR);
 	}
+
+	/* THP settings require madvise. */
+	if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+		return false;
+
+	/* Read-only file mappings need to be aligned for THP to work. */
+	if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+	    (vm_flags & VM_DENYWRITE)) {
+		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+				HPAGE_PMD_NR);
+	}
+
 	if (!vma->anon_vma || vma->vm_ops)
 		return false;
 	if (vma_is_temporary_stack(vma))
-- 
cgit v1.2.3


From 3c381db1fac80373f2cc0d8c1d0bcfbf8bd4fb57 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 25 Feb 2021 17:16:40 -0800
Subject: mm/page_alloc: count CMA pages per zone and print them in
 /proc/zoneinfo

Let's count the number of CMA pages per zone and print them in
/proc/zoneinfo.

Having access to the total number of CMA pages per zone is helpful for
debugging purposes to know where exactly the CMA pages ended up, and to
figure out how many pages of a zone might behave differently, even after
some of these pages might already have been allocated.

As one example, CMA pages part of a kernel zone cannot be used for
ordinary kernel allocations but instead behave more like ZONE_MOVABLE.

For now, we are only able to get the global nr+free cma pages from
/proc/meminfo and the free cma pages per zone from /proc/zoneinfo.

Example after this patch when booting a 6 GiB QEMU VM with
"hugetlb_cma=2G":
  # cat /proc/zoneinfo | grep cma
          cma      0
        nr_free_cma  0
          cma      0
        nr_free_cma  0
          cma      524288
        nr_free_cma  493016
          cma      0
          cma      0
  # cat /proc/meminfo | grep Cma
  CmaTotal:        2097152 kB
  CmaFree:         1972064 kB

Note: We print even without CONFIG_CMA, just like "nr_free_cma"; this way,
      one can be sure when spotting "cma 0", that there are definetly no
      CMA pages located in a zone.

[david@redhat.com: v2]
  Link: https://lkml.kernel.org/r/20210128164533.18566-1-david@redhat.com
[david@redhat.com: v3]
  Link: https://lkml.kernel.org/r/20210129113451.22085-1-david@redhat.com

Link: https://lkml.kernel.org/r/20210127101813.6370-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 15 +++++++++++++++
 mm/page_alloc.c        |  1 +
 mm/vmstat.c            |  6 ++++--
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9198b7ade85f..5f9c4dad73ed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -503,6 +503,9 @@ struct zone {
 	 * bootmem allocator):
 	 *	managed_pages = present_pages - reserved_pages;
 	 *
+	 * cma pages is present pages that are assigned for CMA use
+	 * (MIGRATE_CMA).
+	 *
 	 * So present_pages may be used by memory hotplug or memory power
 	 * management logic to figure out unmanaged pages by checking
 	 * (present_pages - managed_pages). And managed_pages should be used
@@ -527,6 +530,9 @@ struct zone {
 	atomic_long_t		managed_pages;
 	unsigned long		spanned_pages;
 	unsigned long		present_pages;
+#ifdef CONFIG_CMA
+	unsigned long		cma_pages;
+#endif
 
 	const char		*name;
 
@@ -624,6 +630,15 @@ static inline unsigned long zone_managed_pages(struct zone *zone)
 	return (unsigned long)atomic_long_read(&zone->managed_pages);
 }
 
+static inline unsigned long zone_cma_pages(struct zone *zone)
+{
+#ifdef CONFIG_CMA
+	return zone->cma_pages;
+#else
+	return 0;
+#endif
+}
+
 static inline unsigned long zone_end_pfn(const struct zone *zone)
 {
 	return zone->zone_start_pfn + zone->spanned_pages;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ddccc59f2f72..3e4b29ee2b1e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2168,6 +2168,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
 	}
 
 	adjust_managed_page_count(page, pageblock_nr_pages);
+	page_zone(page)->cma_pages += pageblock_nr_pages;
 }
 #endif
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a0e949542204..6cdf789ced5e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1637,14 +1637,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        high     %lu"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu"
-		   "\n        managed  %lu",
+		   "\n        managed  %lu"
+		   "\n        cma      %lu",
 		   zone_page_state(zone, NR_FREE_PAGES),
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
 		   zone->spanned_pages,
 		   zone->present_pages,
-		   zone_managed_pages(zone));
+		   zone_managed_pages(zone),
+		   zone_cma_pages(zone));
 
 	seq_printf(m,
 		   "\n        protection: (%ld",
-- 
cgit v1.2.3


From 629484ae73754243917e06d8d5e5f37c26e99399 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 25 Feb 2021 17:16:51 -0800
Subject: mm: vmstat: add some comments on internal storage of byte items

Byte-accounted items are used for slab object accounting at the cgroup
level, because the objects in a slab page can belong to different cgroups.
At the global level these items always change in multiples of whole slab
pages.  The vmstat code exploits this and stores these items as pages
internally, which allows for more compact per-cpu data.

This optimization isn't self-evident from the asserts and the division in
the stat update functions.  Provide the reader with some context.

Link: https://lkml.kernel.org/r/20210202184411.118614-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h |  6 ++++++
 mm/vmstat.c            | 12 ++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 773135fc6e19..506d625163a1 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -313,6 +313,12 @@ static inline void __mod_node_page_state(struct pglist_data *pgdat,
 			enum node_stat_item item, int delta)
 {
 	if (vmstat_item_in_bytes(item)) {
+		/*
+		 * Only cgroups use subpage accounting right now; at
+		 * the global level, these items still change in
+		 * multiples of whole pages. Store them as pages
+		 * internally to keep the per-cpu counters compact.
+		 */
 		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
 		delta >>= PAGE_SHIFT;
 	}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0b0fc3b77789..e60b36f5f0a9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -342,6 +342,12 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 	long t;
 
 	if (vmstat_item_in_bytes(item)) {
+		/*
+		 * Only cgroups use subpage accounting right now; at
+		 * the global level, these items still change in
+		 * multiples of whole pages. Store them as pages
+		 * internally to keep the per-cpu counters compact.
+		 */
 		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
 		delta >>= PAGE_SHIFT;
 	}
@@ -551,6 +557,12 @@ static inline void mod_node_state(struct pglist_data *pgdat,
 	long o, n, t, z;
 
 	if (vmstat_item_in_bytes(item)) {
+		/*
+		 * Only cgroups use subpage accounting right now; at
+		 * the global level, these items still change in
+		 * multiples of whole pages. Store them as pages
+		 * internally to keep the per-cpu counters compact.
+		 */
 		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
 		delta >>= PAGE_SHIFT;
 	}
-- 
cgit v1.2.3


From 9f605f260594f99b950062fd62244251e85dbd2b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 25 Feb 2021 17:16:57 -0800
Subject: mm: move pfn_to_online_page() out of line

Patch series "mm: Fix pfn_to_online_page() with respect to ZONE_DEVICE", v4.

A pfn-walker that uses pfn_to_online_page() may inadvertently translate a
pfn as online and in the page allocator, when it is offline managed by a
ZONE_DEVICE mapping (details in Patch 3: ("mm: Teach pfn_to_online_page()
about ZONE_DEVICE section collisions")).

The 2 proposals under consideration are teach pfn_to_online_page() to be
precise in the presence of mixed-zone sections, or teach the memory-add
code to drop the System RAM associated with ZONE_DEVICE collisions.  In
order to not regress memory capacity by a few 10s to 100s of MiB the
approach taken in this set is to add precision to pfn_to_online_page().

In the course of validating pfn_to_online_page() a couple other fixes
fell out:

1/ soft_offline_page() fails to drop the reference taken in the
   madvise(..., MADV_SOFT_OFFLINE) case.

2/ memory_failure() uses get_dev_pagemap() to lookup ZONE_DEVICE pages,
   however that mapping may contain data pages and metadata raw pfns.
   Introduce pgmap_pfn_valid() to delineate the 2 types and fail the
   handling of raw metadata pfns.

This patch (of 4);

pfn_to_online_page() is already too large to be a macro or an inline
function.  In anticipation of further logic changes / growth, move it out
of line.

No functional change, just code movement.

Link: https://lkml.kernel.org/r/161058499000.1840162.702316708443239771.stgit@dwillia2-desk3.amr.corp.intel.com
Link: https://lkml.kernel.org/r/161058499608.1840162.10165648147615238793.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Michal Hocko <mhocko@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 17 +----------------
 mm/memory_hotplug.c            | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 15acce5ab106..3d99de0db2dd 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -16,22 +16,7 @@ struct resource;
 struct vmem_altmap;
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-/*
- * Return page for the valid pfn only if the page is online. All pfn
- * walkers which rely on the fully initialized page->flags and others
- * should use this rather than pfn_valid && pfn_to_page
- */
-#define pfn_to_online_page(pfn)					   \
-({								   \
-	struct page *___page = NULL;				   \
-	unsigned long ___pfn = pfn;				   \
-	unsigned long ___nr = pfn_to_section_nr(___pfn);	   \
-								   \
-	if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr) && \
-	    pfn_valid_within(___pfn))				   \
-		___page = pfn_to_page(___pfn);			   \
-	___page;						   \
-})
+struct page *pfn_to_online_page(unsigned long pfn);
 
 /*
  * Types for free bootmem stored in page->lru.next. These have to be in
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index abe43c1ae920..fc6cdd99941b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -300,6 +300,22 @@ static int check_hotplug_memory_addressable(unsigned long pfn,
 	return 0;
 }
 
+/*
+ * Return page for the valid pfn only if the page is online. All pfn
+ * walkers which rely on the fully initialized page->flags and others
+ * should use this rather than pfn_valid && pfn_to_page
+ */
+struct page *pfn_to_online_page(unsigned long pfn)
+{
+	unsigned long nr = pfn_to_section_nr(pfn);
+
+	if (nr < NR_MEM_SECTIONS && online_section_nr(nr) &&
+	    pfn_valid_within(pfn))
+		return pfn_to_page(pfn);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(pfn_to_online_page);
+
 /*
  * Reasonably generic function for adding memory.  It is
  * expected that archs that support memory hotplug will
-- 
cgit v1.2.3


From 1f90a3477df3ff1a91e064af554cdc887c8f9e5e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 25 Feb 2021 17:17:05 -0800
Subject: mm: teach pfn_to_online_page() about ZONE_DEVICE section collisions

While pfn_to_online_page() is able to determine pfn_valid() at subsection
granularity it is not able to reliably determine if a given pfn is also
online if the section is mixes ZONE_{NORMAL,MOVABLE} with ZONE_DEVICE.
This means that pfn_to_online_page() may return invalid @page objects.
For example with a memory map like:

100000000-1fbffffff : System RAM
  142000000-143002e16 : Kernel code
  143200000-143713fff : Kernel rodata
  143800000-143b15b7f : Kernel data
  144227000-144ffffff : Kernel bss
1fc000000-2fbffffff : Persistent Memory (legacy)
  1fc000000-2fbffffff : namespace0.0

This command:

echo 0x1fc000000 > /sys/devices/system/memory/soft_offline_page

...succeeds when it should fail.  When it succeeds it touches an
uninitialized page and may crash or cause other damage (see
dissolve_free_huge_page()).

While the memory map above is contrived via the memmap=ss!nn kernel
command line option, the collision happens in practice on shipping
platforms.  The memory controller resources that decode spans of physical
address space are a limited resource.  One technique platform-firmware
uses to conserve those resources is to share a decoder across 2 devices to
keep the address range contiguous.  Unfortunately the unit of operation of
a decoder is 64MiB while the Linux section size is 128MiB.  This results
in situations where, without subsection hotplug memory mappings with
different lifetimes collide into one object that can only express one
lifetime.

Update move_pfn_range_to_zone() to flag (SECTION_TAINT_ZONE_DEVICE) a
section that mixes ZONE_DEVICE pfns with other online pfns.  With
SECTION_TAINT_ZONE_DEVICE to delineate, pfn_to_online_page() can fall back
to a slow-path check for ZONE_DEVICE pfns in an online section.  In the
fast path online_section() for a full ZONE_DEVICE section returns false.

Because the collision case is rare, and for simplicity, the
SECTION_TAINT_ZONE_DEVICE flag is never cleared once set.

[dan.j.williams@intel.com: fix CONFIG_ZONE_DEVICE=n build]
  Link: https://lkml.kernel.org/r/CAPcyv4iX+7LAgAeSqx7Zw-Zd=ZV9gBv8Bo7oTbwCOOqJoZ3+Yg@mail.gmail.com

Link: https://lkml.kernel.org/r/161058500675.1840162.7887862152161279354.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: ba72b4c8cf60 ("mm/sparsemem: support sub-section hotplug")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reported-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 34 +++++++++++++++++++++++++++-------
 mm/memory_hotplug.c    | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5f9c4dad73ed..47946cec7584 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -918,6 +918,18 @@ static inline int local_memory_node(int node_id) { return node_id; };
  */
 #define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)
 
+#ifdef CONFIG_ZONE_DEVICE
+static inline bool zone_is_zone_device(struct zone *zone)
+{
+	return zone_idx(zone) == ZONE_DEVICE;
+}
+#else
+static inline bool zone_is_zone_device(struct zone *zone)
+{
+	return false;
+}
+#endif
+
 /*
  * Returns true if a zone has pages managed by the buddy allocator.
  * All the reclaim decisions have to use this function rather than
@@ -1306,13 +1318,14 @@ extern size_t mem_section_usage_size(void);
  *      which results in PFN_SECTION_SHIFT equal 6.
  * To sum it up, at least 6 bits are available.
  */
-#define	SECTION_MARKED_PRESENT	(1UL<<0)
-#define SECTION_HAS_MEM_MAP	(1UL<<1)
-#define SECTION_IS_ONLINE	(1UL<<2)
-#define SECTION_IS_EARLY	(1UL<<3)
-#define SECTION_MAP_LAST_BIT	(1UL<<4)
-#define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))
-#define SECTION_NID_SHIFT	3
+#define SECTION_MARKED_PRESENT		(1UL<<0)
+#define SECTION_HAS_MEM_MAP		(1UL<<1)
+#define SECTION_IS_ONLINE		(1UL<<2)
+#define SECTION_IS_EARLY		(1UL<<3)
+#define SECTION_TAINT_ZONE_DEVICE	(1UL<<4)
+#define SECTION_MAP_LAST_BIT		(1UL<<5)
+#define SECTION_MAP_MASK		(~(SECTION_MAP_LAST_BIT-1))
+#define SECTION_NID_SHIFT		3
 
 static inline struct page *__section_mem_map_addr(struct mem_section *section)
 {
@@ -1351,6 +1364,13 @@ static inline int online_section(struct mem_section *section)
 	return (section && (section->section_mem_map & SECTION_IS_ONLINE));
 }
 
+static inline int online_device_section(struct mem_section *section)
+{
+	unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
+
+	return section && ((section->section_mem_map & flags) == flags);
+}
+
 static inline int online_section_nr(unsigned long nr)
 {
 	return online_section(__nr_to_section(nr));
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 02378f11e2d6..3af4d3851d1a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -308,6 +308,7 @@ static int check_hotplug_memory_addressable(unsigned long pfn,
 struct page *pfn_to_online_page(unsigned long pfn)
 {
 	unsigned long nr = pfn_to_section_nr(pfn);
+	struct dev_pagemap *pgmap;
 	struct mem_section *ms;
 
 	if (nr >= NR_MEM_SECTIONS)
@@ -327,6 +328,22 @@ struct page *pfn_to_online_page(unsigned long pfn)
 	if (!pfn_section_valid(ms, pfn))
 		return NULL;
 
+	if (!online_device_section(ms))
+		return pfn_to_page(pfn);
+
+	/*
+	 * Slowpath: when ZONE_DEVICE collides with
+	 * ZONE_{NORMAL,MOVABLE} within the same section some pfns in
+	 * the section may be 'offline' but 'valid'. Only
+	 * get_dev_pagemap() can determine sub-section online status.
+	 */
+	pgmap = get_dev_pagemap(pfn, NULL);
+	put_dev_pagemap(pgmap);
+
+	/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
+	if (pgmap)
+		return NULL;
+
 	return pfn_to_page(pfn);
 }
 EXPORT_SYMBOL_GPL(pfn_to_online_page);
@@ -709,6 +726,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
 	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
 
 }
+
+static void section_taint_zone_device(unsigned long pfn)
+{
+	struct mem_section *ms = __pfn_to_section(pfn);
+
+	ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
+}
+
 /*
  * Associate the pfn range with the given zone, initializing the memmaps
  * and resizing the pgdat/zone data to span the added pages. After this
@@ -738,6 +763,19 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 	resize_pgdat_range(pgdat, start_pfn, nr_pages);
 	pgdat_resize_unlock(pgdat, &flags);
 
+	/*
+	 * Subsection population requires care in pfn_to_online_page().
+	 * Set the taint to enable the slow path detection of
+	 * ZONE_DEVICE pages in an otherwise  ZONE_{NORMAL,MOVABLE}
+	 * section.
+	 */
+	if (zone_is_zone_device(zone)) {
+		if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
+			section_taint_zone_device(start_pfn);
+		if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
+			section_taint_zone_device(start_pfn + nr_pages);
+	}
+
 	/*
 	 * TODO now we have a visible range of pages which are not associated
 	 * with their zone properly. Not nice but set_pfnblock_flags_mask
-- 
cgit v1.2.3


From 34dc45be4563f344d59ba0428416d0d265aa4f4d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 25 Feb 2021 17:17:08 -0800
Subject: mm: fix memory_failure() handling of dax-namespace metadata

Given 'struct dev_pagemap' spans both data pages and metadata pages be
careful to consult the altmap if present to delineate metadata.  In fact
the pfn_first() helper already identifies the first valid data pfn, so
export that helper for other code paths via pgmap_pfn_valid().

Other usage of get_dev_pagemap() are not a concern because those are
operating on known data pfns having been looked up by get_user_pages().
I.e.  metadata pfns are never user mapped.

Link: https://lkml.kernel.org/r/161058501758.1840162.4239831989762604527.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: 6100e34b2526 ("mm, memory_failure: Teach memory_failure() about dev_pagemap pages")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memremap.h |  6 ++++++
 mm/memory-failure.c      |  6 ++++++
 mm/memremap.c            | 15 +++++++++++++++
 3 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 79c49e7f5c30..f5b464daeeca 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -137,6 +137,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
 void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
 struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 		struct dev_pagemap *pgmap);
+bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
 
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
@@ -165,6 +166,11 @@ static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 	return NULL;
 }
 
+static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
+{
+	return false;
+}
+
 static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
 {
 	return 0;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 55c671904aac..24210c9bd843 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1312,6 +1312,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		 */
 		put_page(page);
 
+	/* device metadata space is not recoverable */
+	if (!pgmap_pfn_valid(pgmap, pfn)) {
+		rc = -ENXIO;
+		goto out;
+	}
+
 	/*
 	 * Prevent the inode from being freed while we are interrogating
 	 * the address_space, typically this would be handled by
diff --git a/mm/memremap.c b/mm/memremap.c
index 16b2fb482da1..2455bac89506 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -80,6 +80,21 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
 	return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
 }
 
+bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; i < pgmap->nr_range; i++) {
+		struct range *range = &pgmap->ranges[i];
+
+		if (pfn >= PHYS_PFN(range->start) &&
+		    pfn <= PHYS_PFN(range->end))
+			return pfn >= pfn_first(pgmap, i);
+	}
+
+	return false;
+}
+
 static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
 {
 	const struct range *range = &pgmap->ranges[range_id];
-- 
cgit v1.2.3


From 1adf8b468ff6bc64ba01ce3848da4bcf409215b4 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Thu, 25 Feb 2021 17:17:13 -0800
Subject: mm/memory_hotplug: rename all existing 'memhp' into 'mhp'

This renames all 'memhp' instances to 'mhp' except for memhp_default_state
for being a kernel command line option.  This is just a clean up and
should not cause a functional change.  Let's make it consistent rater than
mixing the two prefixes.  In preparation for more users of the 'mhp'
terminology.

Link: https://lkml.kernel.org/r/1611554093-27316-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c          | 10 +++++-----
 include/linux/memory_hotplug.h |  4 ++--
 mm/memory_hotplug.c            | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index eef4ffb6122c..901e379676be 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -35,7 +35,7 @@ static const char *const online_type_to_str[] = {
 	[MMOP_ONLINE_MOVABLE] = "online_movable",
 };
 
-int memhp_online_type_from_str(const char *str)
+int mhp_online_type_from_str(const char *str)
 {
 	int i;
 
@@ -253,7 +253,7 @@ static int memory_subsys_offline(struct device *dev)
 static ssize_t state_store(struct device *dev, struct device_attribute *attr,
 			   const char *buf, size_t count)
 {
-	const int online_type = memhp_online_type_from_str(buf);
+	const int online_type = mhp_online_type_from_str(buf);
 	struct memory_block *mem = to_memory_block(dev);
 	int ret;
 
@@ -387,19 +387,19 @@ static ssize_t auto_online_blocks_show(struct device *dev,
 				       struct device_attribute *attr, char *buf)
 {
 	return sysfs_emit(buf, "%s\n",
-			  online_type_to_str[memhp_default_online_type]);
+			  online_type_to_str[mhp_default_online_type]);
 }
 
 static ssize_t auto_online_blocks_store(struct device *dev,
 					struct device_attribute *attr,
 					const char *buf, size_t count)
 {
-	const int online_type = memhp_online_type_from_str(buf);
+	const int online_type = mhp_online_type_from_str(buf);
 
 	if (online_type < 0)
 		return -EINVAL;
 
-	memhp_default_online_type = online_type;
+	mhp_default_online_type = online_type;
 	return count;
 }
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 3d99de0db2dd..ca5e8d137726 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -116,10 +116,10 @@ extern int arch_add_memory(int nid, u64 start, u64 size,
 			   struct mhp_params *params);
 extern u64 max_mem_size;
 
-extern int memhp_online_type_from_str(const char *str);
+extern int mhp_online_type_from_str(const char *str);
 
 /* Default online_type (MMOP_*) when new memory blocks are added. */
-extern int memhp_default_online_type;
+extern int mhp_default_online_type;
 /* If movable_node boot option specified */
 extern bool movable_node_enabled;
 static inline bool movable_node_is_enabled(void)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3af4d3851d1a..ac1c686a5989 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,17 +67,17 @@ void put_online_mems(void)
 bool movable_node_enabled = false;
 
 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-int memhp_default_online_type = MMOP_OFFLINE;
+int mhp_default_online_type = MMOP_OFFLINE;
 #else
-int memhp_default_online_type = MMOP_ONLINE;
+int mhp_default_online_type = MMOP_ONLINE;
 #endif
 
 static int __init setup_memhp_default_state(char *str)
 {
-	const int online_type = memhp_online_type_from_str(str);
+	const int online_type = mhp_online_type_from_str(str);
 
 	if (online_type >= 0)
-		memhp_default_online_type = online_type;
+		mhp_default_online_type = online_type;
 
 	return 1;
 }
@@ -1076,7 +1076,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
 
 static int online_memory_block(struct memory_block *mem, void *arg)
 {
-	mem->online_type = memhp_default_online_type;
+	mem->online_type = mhp_default_online_type;
 	return device_online(&mem->dev);
 }
 
@@ -1157,7 +1157,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		merge_system_ram_resource(res);
 
 	/* online pages if requested */
-	if (memhp_default_online_type != MMOP_OFFLINE)
+	if (mhp_default_online_type != MMOP_OFFLINE)
 		walk_memory_blocks(start, size, NULL, online_memory_block);
 
 	return ret;
-- 
cgit v1.2.3


From 26011267e1a7ddaab50b5f81b402ca3e7fc2887c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 25 Feb 2021 17:17:17 -0800
Subject: mm/memory_hotplug: MEMHP_MERGE_RESOURCE -> MHP_MERGE_RESOURCE

Let's make "MEMHP_MERGE_RESOURCE" consistent with "MHP_NONE", "mhp_t" and
"mhp_flags".  As discussed recently [1], "mhp" is our internal acronym for
memory hotplug now.

[1] https://lore.kernel.org/linux-mm/c37de2d0-28a1-4f7d-f944-cfd7d81c334d@redhat.com/

Link: https://lkml.kernel.org/r/20210126115829.10909-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Wei Liu <wei.liu@kernel.org>
Reviewed-by: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hv/hv_balloon.c        | 2 +-
 drivers/virtio/virtio_mem.c    | 2 +-
 drivers/xen/balloon.c          | 2 +-
 include/linux/memory_hotplug.h | 2 +-
 mm/memory_hotplug.c            | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 8c471823a5af..2f776d78e3c1 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -726,7 +726,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
 
 		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
 		ret = add_memory(nid, PFN_PHYS((start_pfn)),
-				(HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE);
+				(HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE);
 
 		if (ret) {
 			pr_err("hot_add memory failed error is %d\n", ret);
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 9fc9ec4a25f5..d44e43869f17 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -623,7 +623,7 @@ static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
 	/* Memory might get onlined immediately. */
 	atomic64_add(size, &vm->offline_size);
 	rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
-				       MEMHP_MERGE_RESOURCE);
+				       MHP_MERGE_RESOURCE);
 	if (rc) {
 		atomic64_sub(size, &vm->offline_size);
 		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index b57b2067ecbf..671c71245a7b 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -331,7 +331,7 @@ static enum bp_state reserve_additional_memory(void)
 	mutex_unlock(&balloon_mutex);
 	/* add_memory_resource() requires the device_hotplug lock */
 	lock_device_hotplug();
-	rc = add_memory_resource(nid, resource, MEMHP_MERGE_RESOURCE);
+	rc = add_memory_resource(nid, resource, MHP_MERGE_RESOURCE);
 	unlock_device_hotplug();
 	mutex_lock(&balloon_mutex);
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ca5e8d137726..08eeef679ab7 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -53,7 +53,7 @@ typedef int __bitwise mhp_t;
  * with this flag set, the resource pointer must no longer be used as it
  * might be stale, or the resource might have changed.
  */
-#define MEMHP_MERGE_RESOURCE	((__force mhp_t)BIT(0))
+#define MHP_MERGE_RESOURCE	((__force mhp_t)BIT(0))
 
 /*
  * Extended parameters for memory hotplug:
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ac1c686a5989..6a02c3f42717 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1153,7 +1153,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 * In case we're allowed to merge the resource, flag it and trigger
 	 * merging now that adding succeeded.
 	 */
-	if (mhp_flags & MEMHP_MERGE_RESOURCE)
+	if (mhp_flags & MHP_MERGE_RESOURCE)
 		merge_system_ram_resource(res);
 
 	/* online pages if requested */
-- 
cgit v1.2.3


From e9a2e48e8704c9d20a625c6f2357147d03ea7b97 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 25 Feb 2021 17:17:24 -0800
Subject: drivers/base/memory: don't store phys_device in memory blocks

No need to store the value for each and every memory block, as we can
easily query the value at runtime.  Reshuffle the members to optimize the
memory layout.  Also, let's clarify what the interface once was used for
and why it's legacy nowadays.

"phys_device" was used on s390x in older versions of lsmem[2]/chmem[3],
back when they were still part of s390x-tools.  They were later replaced
by the variants in linux-utils.  For example, RHEL6 and RHEL7 contain
lsmem/chmem from s390-utils.  RHEL8 switched to versions from util-linux
on s390x [4].

"phys_device" was added with sysfs support for memory hotplug in commit
3947be1969a9 ("[PATCH] memory hotplug: sysfs and add/remove functions") in
2005.  It always returned 0.

s390x started returning something != 0 on some setups (if sclp.rzm is set
by HW) in 2010 via commit 57b552ba0b2f ("memory hotplug/s390: set
phys_device").

For s390x, it allowed for identifying which memory block devices belong to
the same storage increment (RZM).  Only if all memory block devices
comprising a single storage increment were offline, the memory could
actually be removed in the hypervisor.

Since commit e5d709bb5fb7 ("s390/memory hotplug: provide
memory_block_size_bytes() function") in 2013 a memory block device spans
at least one storage increment - which is why the interface isn't really
helpful/used anymore (except by old lsmem/chmem tools).

There were once RFC patches to make use of "phys_device" in ACPI context;
however, the underlying problem could be solved using different interfaces
[1].

[1] https://patchwork.kernel.org/patch/2163871/
[2] https://github.com/ibm-s390-tools/s390-tools/blob/v2.1.0/zconf/lsmem
[3] https://github.com/ibm-s390-tools/s390-tools/blob/v2.1.0/zconf/chmem
[4] https://bugzilla.redhat.com/show_bug.cgi?id=1504134

Link: https://lkml.kernel.org/r/20210201181347.13262-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Tom Rix <trix@redhat.com>
Cc: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/ABI/testing/sysfs-devices-memory  |  5 +++--
 Documentation/admin-guide/mm/memory-hotplug.rst |  4 ++--
 drivers/base/memory.c                           | 25 +++++++++----------------
 include/linux/memory.h                          |  3 +--
 4 files changed, 15 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory
index 246a45b96d22..58dbc592bc57 100644
--- a/Documentation/ABI/testing/sysfs-devices-memory
+++ b/Documentation/ABI/testing/sysfs-devices-memory
@@ -26,8 +26,9 @@ Date:		September 2008
 Contact:	Badari Pulavarty <pbadari@us.ibm.com>
 Description:
 		The file /sys/devices/system/memory/memoryX/phys_device
-		is read-only and is designed to show the name of physical
-		memory device.  Implementation is currently incomplete.
+		is read-only;  it is a legacy interface only ever used on s390x
+		to expose the covered storage increment.
+Users:		Legacy s390-tools lsmem/chmem
 
 What:		/sys/devices/system/memory/memoryX/phys_index
 Date:		September 2008
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 5c4432c96c4b..245739f55ac7 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -160,8 +160,8 @@ Under each memory block, you can see 5 files:
 
                     "online_movable", "online", "offline" command
                     which will be performed on all sections in the block.
-``phys_device``     read-only: designed to show the name of physical memory
-                    device.  This is not well implemented now.
+``phys_device``	    read-only: legacy interface only ever used on s390x to
+		    expose the covered storage increment.
 ``removable``       read-only: contains an integer value indicating
                     whether the memory block is removable or not
                     removable.  A value of 1 indicates that the memory
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 901e379676be..f35298425575 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -290,20 +290,20 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr,
 }
 
 /*
- * phys_device is a bad name for this.  What I really want
- * is a way to differentiate between memory ranges that
- * are part of physical devices that constitute
- * a complete removable unit or fru.
- * i.e. do these ranges belong to the same physical device,
- * s.t. if I offline all of these sections I can then
- * remove the physical device?
+ * Legacy interface that we cannot remove: s390x exposes the storage increment
+ * covered by a memory block, allowing for identifying which memory blocks
+ * comprise a storage increment. Since a memory block spans complete
+ * storage increments nowadays, this interface is basically unused. Other
+ * archs never exposed != 0.
  */
 static ssize_t phys_device_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	struct memory_block *mem = to_memory_block(dev);
+	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 
-	return sysfs_emit(buf, "%d\n", mem->phys_device);
+	return sysfs_emit(buf, "%d\n",
+			  arch_get_memory_phys_device(start_pfn));
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
@@ -488,11 +488,7 @@ static DEVICE_ATTR_WO(soft_offline_page);
 static DEVICE_ATTR_WO(hard_offline_page);
 #endif
 
-/*
- * Note that phys_device is optional.  It is here to allow for
- * differentiation between which *physical* devices each
- * section belongs to...
- */
+/* See phys_device_show(). */
 int __weak arch_get_memory_phys_device(unsigned long start_pfn)
 {
 	return 0;
@@ -574,7 +570,6 @@ int register_memory(struct memory_block *memory)
 static int init_memory_block(unsigned long block_id, unsigned long state)
 {
 	struct memory_block *mem;
-	unsigned long start_pfn;
 	int ret = 0;
 
 	mem = find_memory_block_by_id(block_id);
@@ -588,8 +583,6 @@ static int init_memory_block(unsigned long block_id, unsigned long state)
 
 	mem->start_section_nr = block_id * sections_per_block;
 	mem->state = state;
-	start_pfn = section_nr_to_pfn(mem->start_section_nr);
-	mem->phys_device = arch_get_memory_phys_device(start_pfn);
 	mem->nid = NUMA_NO_NODE;
 
 	ret = register_memory(mem);
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 439a89e758d8..4da95e684e20 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -27,9 +27,8 @@ struct memory_block {
 	unsigned long start_section_nr;
 	unsigned long state;		/* serialized by the dev->lock */
 	int online_type;		/* for passing data to online routine */
-	int phys_device;		/* to which fru does this belong? */
-	struct device dev;
 	int nid;			/* NID for this memory block */
+	struct device dev;
 };
 
 int arch_get_memory_phys_device(unsigned long start_pfn);
-- 
cgit v1.2.3


From bca3feaa0764ab5a4cbe6817871601f1d00c059d Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Thu, 25 Feb 2021 17:17:33 -0800
Subject: mm/memory_hotplug: prevalidate the address range being added with
 platform

Patch series "mm/memory_hotplug: Pre-validate the address range with platform", v5.

This series adds a mechanism allowing platforms to weigh in and
prevalidate incoming address range before proceeding further with the
memory hotplug.  This helps prevent potential platform errors for the
given address range, down the hotplug call chain, which inevitably fails
the hotplug itself.

This mechanism was suggested by David Hildenbrand during another
discussion with respect to a memory hotplug fix on arm64 platform.

https://lore.kernel.org/linux-arm-kernel/1600332402-30123-1-git-send-email-anshuman.khandual@arm.com/

This mechanism focuses on the addressibility aspect and not [sub] section
alignment aspect.  Hence check_hotplug_memory_range() and check_pfn_span()
have been left unchanged.

This patch (of 4):

This introduces mhp_range_allowed() which can be called in various memory
hotplug paths to prevalidate the address range which is being added, with
the platform.  Then mhp_range_allowed() calls mhp_get_pluggable_range()
which provides applicable address range depending on whether linear
mapping is required or not.  For ranges that require linear mapping, it
calls a new arch callback arch_get_mappable_range() which the platform can
override.  So the new callback, in turn provides the platform an
opportunity to configure acceptable memory hotplug address ranges in case
there are constraints.

This mechanism will help prevent platform specific errors deep down during
hotplug calls.  This drops now redundant
check_hotplug_memory_addressable() check in __add_pages() but instead adds
a VM_BUG_ON() check which would ensure that the range has been validated
with mhp_range_allowed() earlier in the call chain.  Besides
mhp_get_pluggable_range() also can be used by potential memory hotplug
callers to avail the allowed physical range which would go through on a
given platform.

This does not really add any new range check in generic memory hotplug but
instead compensates for lost checks in arch_add_memory() where applicable
and check_hotplug_memory_addressable(), with unified mhp_range_allowed().

[akpm@linux-foundation.org: make pagemap_range() return -EINVAL when mhp_range_allowed() fails]

Link: https://lkml.kernel.org/r/1612149902-7867-1-git-send-email-anshuman.khandual@arm.com
Link: https://lkml.kernel.org/r/1612149902-7867-2-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com> # s390
Cc: Will Deacon <will@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pankaj Gupta <pankaj.gupta@cloud.ionos.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 10 ++++++
 mm/memory_hotplug.c            | 78 ++++++++++++++++++++++++++++++++----------
 mm/memremap.c                  |  8 ++++-
 3 files changed, 76 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 08eeef679ab7..7288aa5ef73b 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -66,6 +66,9 @@ struct mhp_params {
 	pgprot_t pgprot;
 };
 
+bool mhp_range_allowed(u64 start, u64 size, bool need_mapping);
+struct range mhp_get_pluggable_range(bool need_mapping);
+
 /*
  * Zone resizing functions
  *
@@ -266,6 +269,13 @@ static inline bool movable_node_is_enabled(void)
 }
 #endif /* ! CONFIG_MEMORY_HOTPLUG */
 
+/*
+ * Keep this declaration outside CONFIG_MEMORY_HOTPLUG as some
+ * platforms might override and use arch_get_mappable_range()
+ * for internal non memory hotplug purposes.
+ */
+struct range arch_get_mappable_range(void);
+
 #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
 /*
  * pgdat resizing functions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a969463bdda4..5ba51a8bdaeb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -107,6 +107,9 @@ static struct resource *register_memory_resource(u64 start, u64 size,
 	if (strcmp(resource_name, "System RAM"))
 		flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
 
+	if (!mhp_range_allowed(start, size, true))
+		return ERR_PTR(-E2BIG);
+
 	/*
 	 * Make sure value parsed from 'mem=' only restricts memory adding
 	 * while booting, so that memory hotplug won't be impacted. Please
@@ -284,22 +287,6 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
 	return 0;
 }
 
-static int check_hotplug_memory_addressable(unsigned long pfn,
-					    unsigned long nr_pages)
-{
-	const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
-
-	if (max_addr >> MAX_PHYSMEM_BITS) {
-		const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
-		WARN(1,
-		     "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
-		     (u64)PFN_PHYS(pfn), max_addr, max_allowed);
-		return -E2BIG;
-	}
-
-	return 0;
-}
-
 /*
  * Return page for the valid pfn only if the page is online. All pfn
  * walkers which rely on the fully initialized page->flags and others
@@ -365,9 +352,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
 	if (WARN_ON_ONCE(!params->pgprot.pgprot))
 		return -EINVAL;
 
-	err = check_hotplug_memory_addressable(pfn, nr_pages);
-	if (err)
-		return err;
+	VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));
 
 	if (altmap) {
 		/*
@@ -1248,6 +1233,61 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(add_memory_driver_managed);
 
+/*
+ * Platforms should define arch_get_mappable_range() that provides
+ * maximum possible addressable physical memory range for which the
+ * linear mapping could be created. The platform returned address
+ * range must adhere to these following semantics.
+ *
+ * - range.start <= range.end
+ * - Range includes both end points [range.start..range.end]
+ *
+ * There is also a fallback definition provided here, allowing the
+ * entire possible physical address range in case any platform does
+ * not define arch_get_mappable_range().
+ */
+struct range __weak arch_get_mappable_range(void)
+{
+	struct range mhp_range = {
+		.start = 0UL,
+		.end = -1ULL,
+	};
+	return mhp_range;
+}
+
+struct range mhp_get_pluggable_range(bool need_mapping)
+{
+	const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+	struct range mhp_range;
+
+	if (need_mapping) {
+		mhp_range = arch_get_mappable_range();
+		if (mhp_range.start > max_phys) {
+			mhp_range.start = 0;
+			mhp_range.end = 0;
+		}
+		mhp_range.end = min_t(u64, mhp_range.end, max_phys);
+	} else {
+		mhp_range.start = 0;
+		mhp_range.end = max_phys;
+	}
+	return mhp_range;
+}
+EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);
+
+bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
+{
+	struct range mhp_range = mhp_get_pluggable_range(need_mapping);
+	u64 end = start + size;
+
+	if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
+		return true;
+
+	pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
+		start, end, mhp_range.start, mhp_range.end);
+	return false;
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * Confirm all pages in a range [start, end) belong to the same zone (skipping
diff --git a/mm/memremap.c b/mm/memremap.c
index 2455bac89506..7aa7d6e80ee5 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -200,6 +200,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
 static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 		int range_id, int nid)
 {
+	const bool is_private = pgmap->type == MEMORY_DEVICE_PRIVATE;
 	struct range *range = &pgmap->ranges[range_id];
 	struct dev_pagemap *conflict_pgmap;
 	int error, is_ram;
@@ -245,6 +246,11 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 	if (error)
 		goto err_pfn_remap;
 
+	if (!mhp_range_allowed(range->start, range_len(range), !is_private)) {
+		error = -EINVAL;
+		goto err_pfn_remap;
+	}
+
 	mem_hotplug_begin();
 
 	/*
@@ -258,7 +264,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 	 * the CPU, we do want the linear mapping and thus use
 	 * arch_add_memory().
 	 */
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+	if (is_private) {
 		error = add_pages(nid, PHYS_PFN(range->start),
 				PHYS_PFN(range_len(range)), params);
 	} else {
-- 
cgit v1.2.3


From 5d5d19eda6b0ee790af89c45e3f678345be6f50f Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Thu, 25 Feb 2021 17:18:09 -0800
Subject: mm/rmap: fix potential pte_unmap on an not mapped pte

For PMD-mapped page (usually THP), pvmw->pte is NULL.  For PTE-mapped THP,
pvmw->pte is mapped.  But for HugeTLB pages, pvmw->pte is not mapped and
set to the relevant page table entry.  So in page_vma_mapped_walk_done(),
we may do pte_unmap() for HugeTLB pte which is not mapped.  Fix this by
checking pvmw->page against PageHuge before trying to do pte_unmap().

Link: https://lkml.kernel.org/r/20210127093349.39081-1-linmiaohe@huawei.com
Fixes: ace71a19cec5 ("mm: introduce page_vma_mapped_walk()")
Signed-off-by: Hongxiang Lou <louhongxiang@huawei.com>
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michel Lespinasse <walken@google.com>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: Brian Geffon <bgeffon@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 70085ca1a3fc..def5c62c93b3 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -213,7 +213,8 @@ struct page_vma_mapped_walk {
 
 static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
 {
-	if (pvmw->pte)
+	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
+	if (pvmw->pte && !PageHuge(pvmw->page))
 		pte_unmap(pvmw->pte);
 	if (pvmw->ptl)
 		spin_unlock(pvmw->ptl);
-- 
cgit v1.2.3


From fc6697a89f56d9773b2fbff718d4cf2a6d63379d Mon Sep 17 00:00:00 2001
From: Tian Tao <tiantao6@hisilicon.com>
Date: Thu, 25 Feb 2021 17:18:17 -0800
Subject: mm/zswap: add the flag can_sleep_mapped

Patch series "Fix the compatibility of zsmalloc and zswap".

Patch #1 adds a flag to zpool, then zswap used to determine if zpool
drivers such as zbud/z3fold/zsmalloc will enter an atomic context after
mapping.

The difference between zbud/z3fold and zsmalloc is that zsmalloc requires
an atomic context that since its map function holds a preempt-disabled,
but zbud/z3fold don't require an atomic context.  So patch #2 sets flag
sleep_mapped to true indicating that zbud/z3fold can sleep after mapping.
zsmalloc didn't support sleep after mapping, so don't set that flag to
true.

This patch (of 2):

Add a flag to zpool, named is "can_sleep_mapped", and have it set true for
zbud/z3fold, not set this flag for zsmalloc, so its default value is
false.  Then zswap could go the current path if the flag is true; and if
it's false, copy data from src to a temporary buffer, then unmap the
handle, take the mutex, process the buffer instead of src to avoid
sleeping function called from atomic context.

[natechancellor@gmail.com: add return value in zswap_frontswap_load]
  Link: https://lkml.kernel.org/r/20210121214804.926843-1-natechancellor@gmail.com
[tiantao6@hisilicon.com: fix potential memory leak]
  Link: https://lkml.kernel.org/r/1611538365-51811-1-git-send-email-tiantao6@hisilicon.com
[colin.king@canonical.com: fix potential uninitialized pointer read on tmp]
  Link: https://lkml.kernel.org/r/20210128141728.639030-1-colin.king@canonical.com
[tiantao6@hisilicon.com: fix variable 'entry' is uninitialized when used]
  Link: https://lkml.kernel.org/r/1611223030-58346-1-git-send-email-tiantao6@hisilicon.comLink: https://lkml.kernel.org/r/1611035683-12732-1-git-send-email-tiantao6@hisilicon.com

Link: https://lkml.kernel.org/r/1611035683-12732-2-git-send-email-tiantao6@hisilicon.com
Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Vitaly Wool <vitaly.wool@konsulko.com>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Mike Galbraith <efault@gmx.de>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/zpool.h |  3 +++
 mm/zpool.c            | 13 +++++++++++++
 mm/zswap.c            | 51 ++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 62 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 51bf43076165..e8997010612a 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *pool);
  * @malloc:	allocate mem from a pool.
  * @free:	free mem from a pool.
  * @shrink:	shrink the pool.
+ * @sleep_mapped: whether zpool driver can sleep during map.
  * @map:	map a handle.
  * @unmap:	unmap a handle.
  * @total_size:	get total size of a pool.
@@ -100,6 +101,7 @@ struct zpool_driver {
 	int (*shrink)(void *pool, unsigned int pages,
 				unsigned int *reclaimed);
 
+	bool sleep_mapped;
 	void *(*map)(void *pool, unsigned long handle,
 				enum zpool_mapmode mm);
 	void (*unmap)(void *pool, unsigned long handle);
@@ -112,5 +114,6 @@ void zpool_register_driver(struct zpool_driver *driver);
 int zpool_unregister_driver(struct zpool_driver *driver);
 
 bool zpool_evictable(struct zpool *pool);
+bool zpool_can_sleep_mapped(struct zpool *pool);
 
 #endif
diff --git a/mm/zpool.c b/mm/zpool.c
index 3744a2d1a624..5ed71207ced7 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -23,6 +23,7 @@ struct zpool {
 	void *pool;
 	const struct zpool_ops *ops;
 	bool evictable;
+	bool can_sleep_mapped;
 
 	struct list_head list;
 };
@@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
 	zpool->pool = driver->create(name, gfp, ops, zpool);
 	zpool->ops = ops;
 	zpool->evictable = driver->shrink && ops && ops->evict;
+	zpool->can_sleep_mapped = driver->sleep_mapped;
 
 	if (!zpool->pool) {
 		pr_err("couldn't create %s pool\n", type);
@@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool)
 	return zpool->evictable;
 }
 
+/**
+ * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped.
+ * @zpool:	The zpool to test
+ *
+ * Returns: true if zpool can sleep; false otherwise.
+ */
+bool zpool_can_sleep_mapped(struct zpool *zpool)
+{
+	return zpool->can_sleep_mapped;
+}
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
 MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zswap.c b/mm/zswap.c
index 1e41c2857068..578d9f256920 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -935,13 +935,19 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 
-	u8 *src;
+	u8 *src, *tmp = NULL;
 	unsigned int dlen;
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_NONE,
 	};
 
+	if (!zpool_can_sleep_mapped(pool)) {
+		tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+		if (!tmp)
+			return -ENOMEM;
+	}
+
 	/* extract swpentry from data */
 	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
 	swpentry = zhdr->swpentry; /* here */
@@ -955,6 +961,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 		/* entry was invalidated */
 		spin_unlock(&tree->lock);
 		zpool_unmap_handle(pool, handle);
+		kfree(tmp);
 		return 0;
 	}
 	spin_unlock(&tree->lock);
@@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 		dlen = PAGE_SIZE;
 		src = (u8 *)zhdr + sizeof(struct zswap_header);
 
+		if (!zpool_can_sleep_mapped(pool)) {
+
+			memcpy(tmp, src, entry->length);
+			src = tmp;
+
+			zpool_unmap_handle(pool, handle);
+		}
+
 		mutex_lock(acomp_ctx->mutex);
 		sg_init_one(&input, src, entry->length);
 		sg_init_table(&output, 1);
@@ -1033,7 +1048,11 @@ fail:
 	spin_unlock(&tree->lock);
 
 end:
-	zpool_unmap_handle(pool, handle);
+	if (zpool_can_sleep_mapped(pool))
+		zpool_unmap_handle(pool, handle);
+	else
+		kfree(tmp);
+
 	return ret;
 }
 
@@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	struct zswap_entry *entry;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
-	u8 *src, *dst;
+	u8 *src, *dst, *tmp;
 	unsigned int dlen;
 	int ret;
 
@@ -1253,15 +1272,33 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 		dst = kmap_atomic(page);
 		zswap_fill_page(dst, entry->value);
 		kunmap_atomic(dst);
+		ret = 0;
 		goto freeentry;
 	}
 
+	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+
+		tmp = kmalloc(entry->length, GFP_ATOMIC);
+		if (!tmp) {
+			ret = -ENOMEM;
+			goto freeentry;
+		}
+	}
+
 	/* decompress */
 	dlen = PAGE_SIZE;
 	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
 	if (zpool_evictable(entry->pool->zpool))
 		src += sizeof(struct zswap_header);
 
+	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+
+		memcpy(tmp, src, entry->length);
+		src = tmp;
+
+		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	}
+
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
 	mutex_lock(acomp_ctx->mutex);
 	sg_init_one(&input, src, entry->length);
@@ -1271,7 +1308,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
 	mutex_unlock(acomp_ctx->mutex);
 
-	zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	if (zpool_can_sleep_mapped(entry->pool->zpool))
+		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	else
+		kfree(tmp);
+
 	BUG_ON(ret);
 
 freeentry:
@@ -1279,7 +1320,7 @@ freeentry:
 	zswap_entry_put(tree, entry);
 	spin_unlock(&tree->lock);
 
-	return 0;
+	return ret;
 }
 
 /* frees an entry in zswap */
-- 
cgit v1.2.3


From 2395928158059b8f9858365fce7713ce7fef62e4 Mon Sep 17 00:00:00 2001
From: Rokudo Yan <wu-yan@tcl.com>
Date: Thu, 25 Feb 2021 17:18:31 -0800
Subject: zsmalloc: account the number of compacted pages correctly

There exists multiple path may do zram compaction concurrently.
1. auto-compaction triggered during memory reclaim
2. userspace utils write zram<id>/compaction node

So, multiple threads may call zs_shrinker_scan/zs_compact concurrently.
But pages_compacted is a per zsmalloc pool variable and modification
of the variable is not serialized(through under class->lock).
There are two issues here:
1. the pages_compacted may not equal to total number of pages
freed(due to concurrently add).
2. zs_shrinker_scan may not return the correct number of pages
freed(issued by current shrinker).

The fix is simple:
1. account the number of pages freed in zs_compact locally.
2. use actomic variable pages_compacted to accumulate total number.

Link: https://lkml.kernel.org/r/20210202122235.26885-1-wu-yan@tcl.com
Fixes: 860c707dca155a56 ("zsmalloc: account the number of compacted pages")
Signed-off-by: Rokudo Yan <wu-yan@tcl.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c |  2 +-
 include/linux/zsmalloc.h      |  2 +-
 mm/zsmalloc.c                 | 17 +++++++++++------
 3 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d7018543842e..a711a2e2a794 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1081,7 +1081,7 @@ static ssize_t mm_stat_show(struct device *dev,
 			zram->limit_pages << PAGE_SHIFT,
 			max_used << PAGE_SHIFT,
 			(u64)atomic64_read(&zram->stats.same_pages),
-			pool_stats.pages_compacted,
+			atomic_long_read(&pool_stats.pages_compacted),
 			(u64)atomic64_read(&zram->stats.huge_pages),
 			(u64)atomic64_read(&zram->stats.huge_pages_since));
 	up_read(&zram->init_lock);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 4807ca4d52e0..2a430e713ce5 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -35,7 +35,7 @@ enum zs_mapmode {
 
 struct zs_pool_stats {
 	/* How many pages were migrated (freed) */
-	unsigned long pages_compacted;
+	atomic_long_t pages_compacted;
 };
 
 struct zs_pool;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index cf0ed0e4e911..1518732f95c3 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2212,11 +2212,13 @@ static unsigned long zs_can_compact(struct size_class *class)
 	return obj_wasted * class->pages_per_zspage;
 }
 
-static void __zs_compact(struct zs_pool *pool, struct size_class *class)
+static unsigned long __zs_compact(struct zs_pool *pool,
+				  struct size_class *class)
 {
 	struct zs_compact_control cc;
 	struct zspage *src_zspage;
 	struct zspage *dst_zspage = NULL;
+	unsigned long pages_freed = 0;
 
 	spin_lock(&class->lock);
 	while ((src_zspage = isolate_zspage(class, true))) {
@@ -2246,7 +2248,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 		putback_zspage(class, dst_zspage);
 		if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
 			free_zspage(pool, class, src_zspage);
-			pool->stats.pages_compacted += class->pages_per_zspage;
+			pages_freed += class->pages_per_zspage;
 		}
 		spin_unlock(&class->lock);
 		cond_resched();
@@ -2257,12 +2259,15 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 		putback_zspage(class, src_zspage);
 
 	spin_unlock(&class->lock);
+
+	return pages_freed;
 }
 
 unsigned long zs_compact(struct zs_pool *pool)
 {
 	int i;
 	struct size_class *class;
+	unsigned long pages_freed = 0;
 
 	for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
 		class = pool->size_class[i];
@@ -2270,10 +2275,11 @@ unsigned long zs_compact(struct zs_pool *pool)
 			continue;
 		if (class->index != i)
 			continue;
-		__zs_compact(pool, class);
+		pages_freed += __zs_compact(pool, class);
 	}
+	atomic_long_add(pages_freed, &pool->stats.pages_compacted);
 
-	return pool->stats.pages_compacted;
+	return pages_freed;
 }
 EXPORT_SYMBOL_GPL(zs_compact);
 
@@ -2290,13 +2296,12 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
 	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
 			shrinker);
 
-	pages_freed = pool->stats.pages_compacted;
 	/*
 	 * Compact classes and calculate compaction delta.
 	 * Can run concurrently with a manually triggered
 	 * (by user) compaction.
 	 */
-	pages_freed = zs_compact(pool) - pages_freed;
+	pages_freed = zs_compact(pool);
 
 	return pages_freed ? pages_freed : SHRINK_STOP;
 }
-- 
cgit v1.2.3


From 4be408cec257d1156d35647db57726f5ef977630 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Thu, 25 Feb 2021 17:18:38 -0800
Subject: mm: page-flags.h: Typo fix (It -> If)

The "If" was wrongly spelled as "It".

Link: https://lkml.kernel.org/r/1608959036-91409-1-git-send-email-guoren@kernel.org
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Steven Price <steven.price@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index db914477057b..04a34c08e0a6 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -810,7 +810,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
 
 /*
  * Flags checked when a page is freed.  Pages being freed should not have
- * these flags set.  It they are, there is a problem.
+ * these flags set.  If they are, there is a problem.
  */
 #define PAGE_FLAGS_CHECK_AT_FREE				\
 	(1UL << PG_lru		| 1UL << PG_locked	|	\
@@ -821,7 +821,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
- * Pages being prepped should not have these flags set.  It they are set,
+ * Pages being prepped should not have these flags set.  If they are set,
  * there has been a kernel bug or struct page corruption.
  *
  * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
-- 
cgit v1.2.3


From 0ce20dd840897b12ae70869c69f1ba34d6d16965 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 25 Feb 2021 17:18:53 -0800
Subject: mm: add Kernel Electric-Fence infrastructure

Patch series "KFENCE: A low-overhead sampling-based memory safety error detector", v7.

This adds the Kernel Electric-Fence (KFENCE) infrastructure. KFENCE is a
low-overhead sampling-based memory safety error detector of heap
use-after-free, invalid-free, and out-of-bounds access errors.  This
series enables KFENCE for the x86 and arm64 architectures, and adds
KFENCE hooks to the SLAB and SLUB allocators.

KFENCE is designed to be enabled in production kernels, and has near
zero performance overhead. Compared to KASAN, KFENCE trades performance
for precision. The main motivation behind KFENCE's design, is that with
enough total uptime KFENCE will detect bugs in code paths not typically
exercised by non-production test workloads. One way to quickly achieve a
large enough total uptime is when the tool is deployed across a large
fleet of machines.

KFENCE objects each reside on a dedicated page, at either the left or
right page boundaries. The pages to the left and right of the object
page are "guard pages", whose attributes are changed to a protected
state, and cause page faults on any attempted access to them. Such page
faults are then intercepted by KFENCE, which handles the fault
gracefully by reporting a memory access error.

Guarded allocations are set up based on a sample interval (can be set
via kfence.sample_interval). After expiration of the sample interval,
the next allocation through the main allocator (SLAB or SLUB) returns a
guarded allocation from the KFENCE object pool. At this point, the timer
is reset, and the next allocation is set up after the expiration of the
interval.

To enable/disable a KFENCE allocation through the main allocator's
fast-path without overhead, KFENCE relies on static branches via the
static keys infrastructure. The static branch is toggled to redirect the
allocation to KFENCE.

The KFENCE memory pool is of fixed size, and if the pool is exhausted no
further KFENCE allocations occur. The default config is conservative
with only 255 objects, resulting in a pool size of 2 MiB (with 4 KiB
pages).

We have verified by running synthetic benchmarks (sysbench I/O,
hackbench) and production server-workload benchmarks that a kernel with
KFENCE (using sample intervals 100-500ms) is performance-neutral
compared to a non-KFENCE baseline kernel.

KFENCE is inspired by GWP-ASan [1], a userspace tool with similar
properties. The name "KFENCE" is a homage to the Electric Fence Malloc
Debugger [2].

For more details, see Documentation/dev-tools/kfence.rst added in the
series -- also viewable here:

	https://raw.githubusercontent.com/google/kasan/kfence/Documentation/dev-tools/kfence.rst

[1] http://llvm.org/docs/GwpAsan.html
[2] https://linux.die.net/man/3/efence

This patch (of 9):

This adds the Kernel Electric-Fence (KFENCE) infrastructure. KFENCE is a
low-overhead sampling-based memory safety error detector of heap
use-after-free, invalid-free, and out-of-bounds access errors.

KFENCE is designed to be enabled in production kernels, and has near
zero performance overhead. Compared to KASAN, KFENCE trades performance
for precision. The main motivation behind KFENCE's design, is that with
enough total uptime KFENCE will detect bugs in code paths not typically
exercised by non-production test workloads. One way to quickly achieve a
large enough total uptime is when the tool is deployed across a large
fleet of machines.

KFENCE objects each reside on a dedicated page, at either the left or
right page boundaries. The pages to the left and right of the object
page are "guard pages", whose attributes are changed to a protected
state, and cause page faults on any attempted access to them. Such page
faults are then intercepted by KFENCE, which handles the fault
gracefully by reporting a memory access error. To detect out-of-bounds
writes to memory within the object's page itself, KFENCE also uses
pattern-based redzones. The following figure illustrates the page
layout:

  ---+-----------+-----------+-----------+-----------+-----------+---
     | xxxxxxxxx | O :       | xxxxxxxxx |       : O | xxxxxxxxx |
     | xxxxxxxxx | B :       | xxxxxxxxx |       : B | xxxxxxxxx |
     | x GUARD x | J : RED-  | x GUARD x | RED-  : J | x GUARD x |
     | xxxxxxxxx | E :  ZONE | xxxxxxxxx |  ZONE : E | xxxxxxxxx |
     | xxxxxxxxx | C :       | xxxxxxxxx |       : C | xxxxxxxxx |
     | xxxxxxxxx | T :       | xxxxxxxxx |       : T | xxxxxxxxx |
  ---+-----------+-----------+-----------+-----------+-----------+---

Guarded allocations are set up based on a sample interval (can be set
via kfence.sample_interval). After expiration of the sample interval, a
guarded allocation from the KFENCE object pool is returned to the main
allocator (SLAB or SLUB). At this point, the timer is reset, and the
next allocation is set up after the expiration of the interval.

To enable/disable a KFENCE allocation through the main allocator's
fast-path without overhead, KFENCE relies on static branches via the
static keys infrastructure. The static branch is toggled to redirect the
allocation to KFENCE. To date, we have verified by running synthetic
benchmarks (sysbench I/O, hackbench) that a kernel compiled with KFENCE
is performance-neutral compared to the non-KFENCE baseline.

For more details, see Documentation/dev-tools/kfence.rst (added later in
the series).

[elver@google.com: fix parameter description for kfence_object_start()]
  Link: https://lkml.kernel.org/r/20201106092149.GA2851373@elver.google.com
[elver@google.com: avoid stalling work queue task without allocations]
  Link: https://lkml.kernel.org/r/CADYN=9J0DQhizAGB0-jz4HOBBh+05kMBXb4c0cXMS7Qi5NAJiw@mail.gmail.com
  Link: https://lkml.kernel.org/r/20201110135320.3309507-1-elver@google.com
[elver@google.com: fix potential deadlock due to wake_up()]
  Link: https://lkml.kernel.org/r/000000000000c0645805b7f982e4@google.com
  Link: https://lkml.kernel.org/r/20210104130749.1768991-1-elver@google.com
[elver@google.com: add option to use KFENCE without static keys]
  Link: https://lkml.kernel.org/r/20210111091544.3287013-1-elver@google.com
[elver@google.com: add missing copyright and description headers]
  Link: https://lkml.kernel.org/r/20210118092159.145934-1-elver@google.com

Link: https://lkml.kernel.org/r/20201103175841.3495947-2-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: SeongJae Park <sjpark@amazon.de>
Co-developed-by: Marco Elver <elver@google.com>
Reviewed-by: Jann Horn <jannh@google.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Joern Engel <joern@purestorage.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kfence.h | 216 +++++++++++++
 init/main.c            |   3 +
 lib/Kconfig.debug      |   1 +
 lib/Kconfig.kfence     |  67 ++++
 mm/Makefile            |   1 +
 mm/kfence/Makefile     |   3 +
 mm/kfence/core.c       | 840 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/kfence/kfence.h     | 113 +++++++
 mm/kfence/report.c     | 240 ++++++++++++++
 9 files changed, 1484 insertions(+)
 create mode 100644 include/linux/kfence.h
 create mode 100644 lib/Kconfig.kfence
 create mode 100644 mm/kfence/Makefile
 create mode 100644 mm/kfence/core.c
 create mode 100644 mm/kfence/kfence.h
 create mode 100644 mm/kfence/report.c

(limited to 'include/linux')

diff --git a/include/linux/kfence.h b/include/linux/kfence.h
new file mode 100644
index 000000000000..81f3911cb298
--- /dev/null
+++ b/include/linux/kfence.h
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Kernel Electric-Fence (KFENCE). Public interface for allocator and fault
+ * handler integration. For more info see Documentation/dev-tools/kfence.rst.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef _LINUX_KFENCE_H
+#define _LINUX_KFENCE_H
+
+#include <linux/mm.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_KFENCE
+
+/*
+ * We allocate an even number of pages, as it simplifies calculations to map
+ * address to metadata indices; effectively, the very first page serves as an
+ * extended guard page, but otherwise has no special purpose.
+ */
+#define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE)
+extern char *__kfence_pool;
+
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+#include <linux/static_key.h>
+DECLARE_STATIC_KEY_FALSE(kfence_allocation_key);
+#else
+#include <linux/atomic.h>
+extern atomic_t kfence_allocation_gate;
+#endif
+
+/**
+ * is_kfence_address() - check if an address belongs to KFENCE pool
+ * @addr: address to check
+ *
+ * Return: true or false depending on whether the address is within the KFENCE
+ * object range.
+ *
+ * KFENCE objects live in a separate page range and are not to be intermixed
+ * with regular heap objects (e.g. KFENCE objects must never be added to the
+ * allocator freelists). Failing to do so may and will result in heap
+ * corruptions, therefore is_kfence_address() must be used to check whether
+ * an object requires specific handling.
+ *
+ * Note: This function may be used in fast-paths, and is performance critical.
+ * Future changes should take this into account; for instance, we want to avoid
+ * introducing another load and therefore need to keep KFENCE_POOL_SIZE a
+ * constant (until immediate patching support is added to the kernel).
+ */
+static __always_inline bool is_kfence_address(const void *addr)
+{
+	/*
+	 * The non-NULL check is required in case the __kfence_pool pointer was
+	 * never initialized; keep it in the slow-path after the range-check.
+	 */
+	return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr);
+}
+
+/**
+ * kfence_alloc_pool() - allocate the KFENCE pool via memblock
+ */
+void __init kfence_alloc_pool(void);
+
+/**
+ * kfence_init() - perform KFENCE initialization at boot time
+ *
+ * Requires that kfence_alloc_pool() was called before. This sets up the
+ * allocation gate timer, and requires that workqueues are available.
+ */
+void __init kfence_init(void);
+
+/**
+ * kfence_shutdown_cache() - handle shutdown_cache() for KFENCE objects
+ * @s: cache being shut down
+ *
+ * Before shutting down a cache, one must ensure there are no remaining objects
+ * allocated from it. Because KFENCE objects are not referenced from the cache
+ * directly, we need to check them here.
+ *
+ * Note that shutdown_cache() is internal to SL*B, and kmem_cache_destroy() does
+ * not return if allocated objects still exist: it prints an error message and
+ * simply aborts destruction of a cache, leaking memory.
+ *
+ * If the only such objects are KFENCE objects, we will not leak the entire
+ * cache, but instead try to provide more useful debug info by making allocated
+ * objects "zombie allocations". Objects may then still be used or freed (which
+ * is handled gracefully), but usage will result in showing KFENCE error reports
+ * which include stack traces to the user of the object, the original allocation
+ * site, and caller to shutdown_cache().
+ */
+void kfence_shutdown_cache(struct kmem_cache *s);
+
+/*
+ * Allocate a KFENCE object. Allocators must not call this function directly,
+ * use kfence_alloc() instead.
+ */
+void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags);
+
+/**
+ * kfence_alloc() - allocate a KFENCE object with a low probability
+ * @s:     struct kmem_cache with object requirements
+ * @size:  exact size of the object to allocate (can be less than @s->size
+ *         e.g. for kmalloc caches)
+ * @flags: GFP flags
+ *
+ * Return:
+ * * NULL     - must proceed with allocating as usual,
+ * * non-NULL - pointer to a KFENCE object.
+ *
+ * kfence_alloc() should be inserted into the heap allocation fast path,
+ * allowing it to transparently return KFENCE-allocated objects with a low
+ * probability using a static branch (the probability is controlled by the
+ * kfence.sample_interval boot parameter).
+ */
+static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
+{
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+	if (static_branch_unlikely(&kfence_allocation_key))
+#else
+	if (unlikely(!atomic_read(&kfence_allocation_gate)))
+#endif
+		return __kfence_alloc(s, size, flags);
+	return NULL;
+}
+
+/**
+ * kfence_ksize() - get actual amount of memory allocated for a KFENCE object
+ * @addr: pointer to a heap object
+ *
+ * Return:
+ * * 0     - not a KFENCE object, must call __ksize() instead,
+ * * non-0 - this many bytes can be accessed without causing a memory error.
+ *
+ * kfence_ksize() returns the number of bytes requested for a KFENCE object at
+ * allocation time. This number may be less than the object size of the
+ * corresponding struct kmem_cache.
+ */
+size_t kfence_ksize(const void *addr);
+
+/**
+ * kfence_object_start() - find the beginning of a KFENCE object
+ * @addr: address within a KFENCE-allocated object
+ *
+ * Return: address of the beginning of the object.
+ *
+ * SL[AU]B-allocated objects are laid out within a page one by one, so it is
+ * easy to calculate the beginning of an object given a pointer inside it and
+ * the object size. The same is not true for KFENCE, which places a single
+ * object at either end of the page. This helper function is used to find the
+ * beginning of a KFENCE-allocated object.
+ */
+void *kfence_object_start(const void *addr);
+
+/**
+ * __kfence_free() - release a KFENCE heap object to KFENCE pool
+ * @addr: object to be freed
+ *
+ * Requires: is_kfence_address(addr)
+ *
+ * Release a KFENCE object and mark it as freed.
+ */
+void __kfence_free(void *addr);
+
+/**
+ * kfence_free() - try to release an arbitrary heap object to KFENCE pool
+ * @addr: object to be freed
+ *
+ * Return:
+ * * false - object doesn't belong to KFENCE pool and was ignored,
+ * * true  - object was released to KFENCE pool.
+ *
+ * Release a KFENCE object and mark it as freed. May be called on any object,
+ * even non-KFENCE objects, to simplify integration of the hooks into the
+ * allocator's free codepath. The allocator must check the return value to
+ * determine if it was a KFENCE object or not.
+ */
+static __always_inline __must_check bool kfence_free(void *addr)
+{
+	if (!is_kfence_address(addr))
+		return false;
+	__kfence_free(addr);
+	return true;
+}
+
+/**
+ * kfence_handle_page_fault() - perform page fault handling for KFENCE pages
+ * @addr: faulting address
+ *
+ * Return:
+ * * false - address outside KFENCE pool,
+ * * true  - page fault handled by KFENCE, no additional handling required.
+ *
+ * A page fault inside KFENCE pool indicates a memory error, such as an
+ * out-of-bounds access, a use-after-free or an invalid memory access. In these
+ * cases KFENCE prints an error message and marks the offending page as
+ * present, so that the kernel can proceed.
+ */
+bool __must_check kfence_handle_page_fault(unsigned long addr);
+
+#else /* CONFIG_KFENCE */
+
+static inline bool is_kfence_address(const void *addr) { return false; }
+static inline void kfence_alloc_pool(void) { }
+static inline void kfence_init(void) { }
+static inline void kfence_shutdown_cache(struct kmem_cache *s) { }
+static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; }
+static inline size_t kfence_ksize(const void *addr) { return 0; }
+static inline void *kfence_object_start(const void *addr) { return NULL; }
+static inline void __kfence_free(void *addr) { }
+static inline bool __must_check kfence_free(void *addr) { return false; }
+static inline bool __must_check kfence_handle_page_fault(unsigned long addr) { return false; }
+
+#endif
+
+#endif /* _LINUX_KFENCE_H */
diff --git a/init/main.c b/init/main.c
index e9933cbf60d4..261051070e3c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -40,6 +40,7 @@
 #include <linux/security.h>
 #include <linux/smp.h>
 #include <linux/profile.h>
+#include <linux/kfence.h>
 #include <linux/rcupdate.h>
 #include <linux/moduleparam.h>
 #include <linux/kallsyms.h>
@@ -824,6 +825,7 @@ static void __init mm_init(void)
 	 */
 	page_ext_init_flatmem();
 	init_mem_debugging_and_hardening();
+	kfence_alloc_pool();
 	report_meminit();
 	mem_init();
 	/* page_owner must be initialized after buddy is ready */
@@ -955,6 +957,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
 	hrtimers_init();
 	softirq_init();
 	timekeeping_init();
+	kfence_init();
 
 	/*
 	 * For best initial stack canary entropy, prepare it after:
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f9febffffc21..2779c29d9981 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -938,6 +938,7 @@ config DEBUG_STACKOVERFLOW
 	  If in doubt, say "N".
 
 source "lib/Kconfig.kasan"
+source "lib/Kconfig.kfence"
 
 endmenu # "Memory Debugging"
 
diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence
new file mode 100644
index 000000000000..b88ac9d6b2e6
--- /dev/null
+++ b/lib/Kconfig.kfence
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config HAVE_ARCH_KFENCE
+	bool
+
+menuconfig KFENCE
+	bool "KFENCE: low-overhead sampling-based memory safety error detector"
+	depends on HAVE_ARCH_KFENCE && !KASAN && (SLAB || SLUB)
+	select STACKTRACE
+	help
+	  KFENCE is a low-overhead sampling-based detector of heap out-of-bounds
+	  access, use-after-free, and invalid-free errors. KFENCE is designed
+	  to have negligible cost to permit enabling it in production
+	  environments.
+
+	  Note that, KFENCE is not a substitute for explicit testing with tools
+	  such as KASAN. KFENCE can detect a subset of bugs that KASAN can
+	  detect, albeit at very different performance profiles. If you can
+	  afford to use KASAN, continue using KASAN, for example in test
+	  environments. If your kernel targets production use, and cannot
+	  enable KASAN due to its cost, consider using KFENCE.
+
+if KFENCE
+
+config KFENCE_STATIC_KEYS
+	bool "Use static keys to set up allocations"
+	default y
+	depends on JUMP_LABEL # To ensure performance, require jump labels
+	help
+	  Use static keys (static branches) to set up KFENCE allocations. Using
+	  static keys is normally recommended, because it avoids a dynamic
+	  branch in the allocator's fast path. However, with very low sample
+	  intervals, or on systems that do not support jump labels, a dynamic
+	  branch may still be an acceptable performance trade-off.
+
+config KFENCE_SAMPLE_INTERVAL
+	int "Default sample interval in milliseconds"
+	default 100
+	help
+	  The KFENCE sample interval determines the frequency with which heap
+	  allocations will be guarded by KFENCE. May be overridden via boot
+	  parameter "kfence.sample_interval".
+
+	  Set this to 0 to disable KFENCE by default, in which case only
+	  setting "kfence.sample_interval" to a non-zero value enables KFENCE.
+
+config KFENCE_NUM_OBJECTS
+	int "Number of guarded objects available"
+	range 1 65535
+	default 255
+	help
+	  The number of guarded objects available. For each KFENCE object, 2
+	  pages are required; with one containing the object and two adjacent
+	  ones used as guard pages.
+
+config KFENCE_STRESS_TEST_FAULTS
+	int "Stress testing of fault handling and error reporting" if EXPERT
+	default 0
+	help
+	  The inverse probability with which to randomly protect KFENCE object
+	  pages, resulting in spurious use-after-frees. The main purpose of
+	  this option is to stress test KFENCE with concurrent error reports
+	  and allocations/frees. A value of 0 disables stress testing logic.
+
+	  Only for KFENCE testing; set to 0 if you are not a KFENCE developer.
+
+endif # KFENCE
diff --git a/mm/Makefile b/mm/Makefile
index 135bbb65511a..72227b24a616 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_PAGE_POISONING) += page_poison.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_KASAN)	+= kasan/
+obj-$(CONFIG_KFENCE) += kfence/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_MEMTEST)		+= memtest.o
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
new file mode 100644
index 000000000000..d991e9a349f0
--- /dev/null
+++ b/mm/kfence/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_KFENCE) := core.o report.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
new file mode 100644
index 000000000000..d6a32c13336b
--- /dev/null
+++ b/mm/kfence/core.c
@@ -0,0 +1,840 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KFENCE guarded object allocator and fault handling.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kfence: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kfence.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/memblock.h>
+#include <linux/moduleparam.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/* Disables KFENCE on the first warning assuming an irrecoverable error. */
+#define KFENCE_WARN_ON(cond)                                                   \
+	({                                                                     \
+		const bool __cond = WARN_ON(cond);                             \
+		if (unlikely(__cond))                                          \
+			WRITE_ONCE(kfence_enabled, false);                     \
+		__cond;                                                        \
+	})
+
+/* === Data ================================================================= */
+
+static bool kfence_enabled __read_mostly;
+
+static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kfence."
+
+static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
+{
+	unsigned long num;
+	int ret = kstrtoul(val, 0, &num);
+
+	if (ret < 0)
+		return ret;
+
+	if (!num) /* Using 0 to indicate KFENCE is disabled. */
+		WRITE_ONCE(kfence_enabled, false);
+	else if (!READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
+		return -EINVAL; /* Cannot (re-)enable KFENCE on-the-fly. */
+
+	*((unsigned long *)kp->arg) = num;
+	return 0;
+}
+
+static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
+{
+	if (!READ_ONCE(kfence_enabled))
+		return sprintf(buffer, "0\n");
+
+	return param_get_ulong(buffer, kp);
+}
+
+static const struct kernel_param_ops sample_interval_param_ops = {
+	.set = param_set_sample_interval,
+	.get = param_get_sample_interval,
+};
+module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
+
+/* The pool of pages used for guard pages and objects. */
+char *__kfence_pool __ro_after_init;
+EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
+
+/*
+ * Per-object metadata, with one-to-one mapping of object metadata to
+ * backing pages (in __kfence_pool).
+ */
+static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
+struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+
+/* Freelist with available objects. */
+static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
+static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
+
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* The static key to set up a KFENCE allocation. */
+DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
+#endif
+
+/* Gates the allocation, ensuring only one succeeds in a given period. */
+atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
+
+/* Statistics counters for debugfs. */
+enum kfence_counter_id {
+	KFENCE_COUNTER_ALLOCATED,
+	KFENCE_COUNTER_ALLOCS,
+	KFENCE_COUNTER_FREES,
+	KFENCE_COUNTER_ZOMBIES,
+	KFENCE_COUNTER_BUGS,
+	KFENCE_COUNTER_COUNT,
+};
+static atomic_long_t counters[KFENCE_COUNTER_COUNT];
+static const char *const counter_names[] = {
+	[KFENCE_COUNTER_ALLOCATED]	= "currently allocated",
+	[KFENCE_COUNTER_ALLOCS]		= "total allocations",
+	[KFENCE_COUNTER_FREES]		= "total frees",
+	[KFENCE_COUNTER_ZOMBIES]	= "zombie allocations",
+	[KFENCE_COUNTER_BUGS]		= "total bugs",
+};
+static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
+
+/* === Internals ============================================================ */
+
+static bool kfence_protect(unsigned long addr)
+{
+	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
+}
+
+static bool kfence_unprotect(unsigned long addr)
+{
+	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
+}
+
+static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
+{
+	long index;
+
+	/* The checks do not affect performance; only called from slow-paths. */
+
+	if (!is_kfence_address((void *)addr))
+		return NULL;
+
+	/*
+	 * May be an invalid index if called with an address at the edge of
+	 * __kfence_pool, in which case we would report an "invalid access"
+	 * error.
+	 */
+	index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1;
+	if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS)
+		return NULL;
+
+	return &kfence_metadata[index];
+}
+
+static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
+{
+	unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
+	unsigned long pageaddr = (unsigned long)&__kfence_pool[offset];
+
+	/* The checks do not affect performance; only called from slow-paths. */
+
+	/* Only call with a pointer into kfence_metadata. */
+	if (KFENCE_WARN_ON(meta < kfence_metadata ||
+			   meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS))
+		return 0;
+
+	/*
+	 * This metadata object only ever maps to 1 page; verify that the stored
+	 * address is in the expected range.
+	 */
+	if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr))
+		return 0;
+
+	return pageaddr;
+}
+
+/*
+ * Update the object's metadata state, including updating the alloc/free stacks
+ * depending on the state transition.
+ */
+static noinline void metadata_update_state(struct kfence_metadata *meta,
+					   enum kfence_object_state next)
+{
+	struct kfence_track *track =
+		next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
+
+	lockdep_assert_held(&meta->lock);
+
+	/*
+	 * Skip over 1 (this) functions; noinline ensures we do not accidentally
+	 * skip over the caller by never inlining.
+	 */
+	track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+	track->pid = task_pid_nr(current);
+
+	/*
+	 * Pairs with READ_ONCE() in
+	 *	kfence_shutdown_cache(),
+	 *	kfence_handle_page_fault().
+	 */
+	WRITE_ONCE(meta->state, next);
+}
+
+/* Write canary byte to @addr. */
+static inline bool set_canary_byte(u8 *addr)
+{
+	*addr = KFENCE_CANARY_PATTERN(addr);
+	return true;
+}
+
+/* Check canary byte at @addr. */
+static inline bool check_canary_byte(u8 *addr)
+{
+	if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
+		return true;
+
+	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+	kfence_report_error((unsigned long)addr, addr_to_metadata((unsigned long)addr),
+			    KFENCE_ERROR_CORRUPTION);
+	return false;
+}
+
+/* __always_inline this to ensure we won't do an indirect call to fn. */
+static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *))
+{
+	const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
+	unsigned long addr;
+
+	lockdep_assert_held(&meta->lock);
+
+	/*
+	 * We'll iterate over each canary byte per-side until fn() returns
+	 * false. However, we'll still iterate over the canary bytes to the
+	 * right of the object even if there was an error in the canary bytes to
+	 * the left of the object. Specifically, if check_canary_byte()
+	 * generates an error, showing both sides might give more clues as to
+	 * what the error is about when displaying which bytes were corrupted.
+	 */
+
+	/* Apply to left of object. */
+	for (addr = pageaddr; addr < meta->addr; addr++) {
+		if (!fn((u8 *)addr))
+			break;
+	}
+
+	/* Apply to right of object. */
+	for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) {
+		if (!fn((u8 *)addr))
+			break;
+	}
+}
+
+static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+{
+	struct kfence_metadata *meta = NULL;
+	unsigned long flags;
+	struct page *page;
+	void *addr;
+
+	/* Try to obtain a free object. */
+	raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+	if (!list_empty(&kfence_freelist)) {
+		meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
+		list_del_init(&meta->list);
+	}
+	raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+	if (!meta)
+		return NULL;
+
+	if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
+		/*
+		 * This is extremely unlikely -- we are reporting on a
+		 * use-after-free, which locked meta->lock, and the reporting
+		 * code via printk calls kmalloc() which ends up in
+		 * kfence_alloc() and tries to grab the same object that we're
+		 * reporting on. While it has never been observed, lockdep does
+		 * report that there is a possibility of deadlock. Fix it by
+		 * using trylock and bailing out gracefully.
+		 */
+		raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+		/* Put the object back on the freelist. */
+		list_add_tail(&meta->list, &kfence_freelist);
+		raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+
+		return NULL;
+	}
+
+	meta->addr = metadata_to_pageaddr(meta);
+	/* Unprotect if we're reusing this page. */
+	if (meta->state == KFENCE_OBJECT_FREED)
+		kfence_unprotect(meta->addr);
+
+	/*
+	 * Note: for allocations made before RNG initialization, will always
+	 * return zero. We still benefit from enabling KFENCE as early as
+	 * possible, even when the RNG is not yet available, as this will allow
+	 * KFENCE to detect bugs due to earlier allocations. The only downside
+	 * is that the out-of-bounds accesses detected are deterministic for
+	 * such allocations.
+	 */
+	if (prandom_u32_max(2)) {
+		/* Allocate on the "right" side, re-calculate address. */
+		meta->addr += PAGE_SIZE - size;
+		meta->addr = ALIGN_DOWN(meta->addr, cache->align);
+	}
+
+	addr = (void *)meta->addr;
+
+	/* Update remaining metadata. */
+	metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
+	/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
+	WRITE_ONCE(meta->cache, cache);
+	meta->size = size;
+	for_each_canary(meta, set_canary_byte);
+
+	/* Set required struct page fields. */
+	page = virt_to_page(meta->addr);
+	page->slab_cache = cache;
+
+	raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+	/* Memory initialization. */
+
+	/*
+	 * We check slab_want_init_on_alloc() ourselves, rather than letting
+	 * SL*B do the initialization, as otherwise we might overwrite KFENCE's
+	 * redzone.
+	 */
+	if (unlikely(slab_want_init_on_alloc(gfp, cache)))
+		memzero_explicit(addr, size);
+	if (cache->ctor)
+		cache->ctor(addr);
+
+	if (CONFIG_KFENCE_STRESS_TEST_FAULTS && !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS))
+		kfence_protect(meta->addr); /* Random "faults" by protecting the object. */
+
+	atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
+	atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
+
+	return addr;
+}
+
+static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
+{
+	struct kcsan_scoped_access assert_page_exclusive;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&meta->lock, flags);
+
+	if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
+		/* Invalid or double-free, bail out. */
+		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+		kfence_report_error((unsigned long)addr, meta, KFENCE_ERROR_INVALID_FREE);
+		raw_spin_unlock_irqrestore(&meta->lock, flags);
+		return;
+	}
+
+	/* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
+	kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
+				  KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
+				  &assert_page_exclusive);
+
+	if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
+		kfence_unprotect((unsigned long)addr); /* To check canary bytes. */
+
+	/* Restore page protection if there was an OOB access. */
+	if (meta->unprotected_page) {
+		kfence_protect(meta->unprotected_page);
+		meta->unprotected_page = 0;
+	}
+
+	/* Check canary bytes for memory corruption. */
+	for_each_canary(meta, check_canary_byte);
+
+	/*
+	 * Clear memory if init-on-free is set. While we protect the page, the
+	 * data is still there, and after a use-after-free is detected, we
+	 * unprotect the page, so the data is still accessible.
+	 */
+	if (!zombie && unlikely(slab_want_init_on_free(meta->cache)))
+		memzero_explicit(addr, meta->size);
+
+	/* Mark the object as freed. */
+	metadata_update_state(meta, KFENCE_OBJECT_FREED);
+
+	raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+	/* Protect to detect use-after-frees. */
+	kfence_protect((unsigned long)addr);
+
+	kcsan_end_scoped_access(&assert_page_exclusive);
+	if (!zombie) {
+		/* Add it to the tail of the freelist for reuse. */
+		raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+		KFENCE_WARN_ON(!list_empty(&meta->list));
+		list_add_tail(&meta->list, &kfence_freelist);
+		raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+
+		atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
+		atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
+	} else {
+		/* See kfence_shutdown_cache(). */
+		atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
+	}
+}
+
+static void rcu_guarded_free(struct rcu_head *h)
+{
+	struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head);
+
+	kfence_guarded_free((void *)meta->addr, meta, false);
+}
+
+static bool __init kfence_init_pool(void)
+{
+	unsigned long addr = (unsigned long)__kfence_pool;
+	struct page *pages;
+	int i;
+
+	if (!__kfence_pool)
+		return false;
+
+	if (!arch_kfence_init_pool())
+		goto err;
+
+	pages = virt_to_page(addr);
+
+	/*
+	 * Set up object pages: they must have PG_slab set, to avoid freeing
+	 * these as real pages.
+	 *
+	 * We also want to avoid inserting kfence_free() in the kfree()
+	 * fast-path in SLUB, and therefore need to ensure kfree() correctly
+	 * enters __slab_free() slow-path.
+	 */
+	for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+		if (!i || (i % 2))
+			continue;
+
+		/* Verify we do not have a compound head page. */
+		if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
+			goto err;
+
+		__SetPageSlab(&pages[i]);
+	}
+
+	/*
+	 * Protect the first 2 pages. The first page is mostly unnecessary, and
+	 * merely serves as an extended guard page. However, adding one
+	 * additional page in the beginning gives us an even number of pages,
+	 * which simplifies the mapping of address to metadata index.
+	 */
+	for (i = 0; i < 2; i++) {
+		if (unlikely(!kfence_protect(addr)))
+			goto err;
+
+		addr += PAGE_SIZE;
+	}
+
+	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+		struct kfence_metadata *meta = &kfence_metadata[i];
+
+		/* Initialize metadata. */
+		INIT_LIST_HEAD(&meta->list);
+		raw_spin_lock_init(&meta->lock);
+		meta->state = KFENCE_OBJECT_UNUSED;
+		meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
+		list_add_tail(&meta->list, &kfence_freelist);
+
+		/* Protect the right redzone. */
+		if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
+			goto err;
+
+		addr += 2 * PAGE_SIZE;
+	}
+
+	return true;
+
+err:
+	/*
+	 * Only release unprotected pages, and do not try to go back and change
+	 * page attributes due to risk of failing to do so as well. If changing
+	 * page attributes for some pages fails, it is very likely that it also
+	 * fails for the first page, and therefore expect addr==__kfence_pool in
+	 * most failure cases.
+	 */
+	memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
+	__kfence_pool = NULL;
+	return false;
+}
+
+/* === DebugFS Interface ==================================================== */
+
+static int stats_show(struct seq_file *seq, void *v)
+{
+	int i;
+
+	seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
+	for (i = 0; i < KFENCE_COUNTER_COUNT; i++)
+		seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i]));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(stats);
+
+/*
+ * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
+ * start_object() and next_object() return the object index + 1, because NULL is used
+ * to stop iteration.
+ */
+static void *start_object(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
+		return (void *)((long)*pos + 1);
+	return NULL;
+}
+
+static void stop_object(struct seq_file *seq, void *v)
+{
+}
+
+static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
+		return (void *)((long)*pos + 1);
+	return NULL;
+}
+
+static int show_object(struct seq_file *seq, void *v)
+{
+	struct kfence_metadata *meta = &kfence_metadata[(long)v - 1];
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&meta->lock, flags);
+	kfence_print_object(seq, meta);
+	raw_spin_unlock_irqrestore(&meta->lock, flags);
+	seq_puts(seq, "---------------------------------\n");
+
+	return 0;
+}
+
+static const struct seq_operations object_seqops = {
+	.start = start_object,
+	.next = next_object,
+	.stop = stop_object,
+	.show = show_object,
+};
+
+static int open_objects(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &object_seqops);
+}
+
+static const struct file_operations objects_fops = {
+	.open = open_objects,
+	.read = seq_read,
+	.llseek = seq_lseek,
+};
+
+static int __init kfence_debugfs_init(void)
+{
+	struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL);
+
+	debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
+	debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
+	return 0;
+}
+
+late_initcall(kfence_debugfs_init);
+
+/* === Allocation Gate Timer ================================================ */
+
+/*
+ * Set up delayed work, which will enable and disable the static key. We need to
+ * use a work queue (rather than a simple timer), since enabling and disabling a
+ * static key cannot be done from an interrupt.
+ *
+ * Note: Toggling a static branch currently causes IPIs, and here we'll end up
+ * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
+ * more aggressive sampling intervals), we could get away with a variant that
+ * avoids IPIs, at the cost of not immediately capturing allocations if the
+ * instructions remain cached.
+ */
+static struct delayed_work kfence_timer;
+static void toggle_allocation_gate(struct work_struct *work)
+{
+	if (!READ_ONCE(kfence_enabled))
+		return;
+
+	/* Enable static key, and await allocation to happen. */
+	atomic_set(&kfence_allocation_gate, 0);
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+	static_branch_enable(&kfence_allocation_key);
+	/*
+	 * Await an allocation. Timeout after 1 second, in case the kernel stops
+	 * doing allocations, to avoid stalling this worker task for too long.
+	 */
+	{
+		unsigned long end_wait = jiffies + HZ;
+
+		do {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (atomic_read(&kfence_allocation_gate) != 0)
+				break;
+			schedule_timeout(1);
+		} while (time_before(jiffies, end_wait));
+		__set_current_state(TASK_RUNNING);
+	}
+	/* Disable static key and reset timer. */
+	static_branch_disable(&kfence_allocation_key);
+#endif
+	schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval));
+}
+static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
+
+/* === Public interface ===================================================== */
+
+void __init kfence_alloc_pool(void)
+{
+	if (!kfence_sample_interval)
+		return;
+
+	__kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+
+	if (!__kfence_pool)
+		pr_err("failed to allocate pool\n");
+}
+
+void __init kfence_init(void)
+{
+	/* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
+	if (!kfence_sample_interval)
+		return;
+
+	if (!kfence_init_pool()) {
+		pr_err("%s failed\n", __func__);
+		return;
+	}
+
+	WRITE_ONCE(kfence_enabled, true);
+	schedule_delayed_work(&kfence_timer, 0);
+	pr_info("initialized - using %lu bytes for %d objects", KFENCE_POOL_SIZE,
+		CONFIG_KFENCE_NUM_OBJECTS);
+	if (IS_ENABLED(CONFIG_DEBUG_KERNEL))
+		pr_cont(" at 0x%px-0x%px\n", (void *)__kfence_pool,
+			(void *)(__kfence_pool + KFENCE_POOL_SIZE));
+	else
+		pr_cont("\n");
+}
+
+void kfence_shutdown_cache(struct kmem_cache *s)
+{
+	unsigned long flags;
+	struct kfence_metadata *meta;
+	int i;
+
+	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+		bool in_use;
+
+		meta = &kfence_metadata[i];
+
+		/*
+		 * If we observe some inconsistent cache and state pair where we
+		 * should have returned false here, cache destruction is racing
+		 * with either kmem_cache_alloc() or kmem_cache_free(). Taking
+		 * the lock will not help, as different critical section
+		 * serialization will have the same outcome.
+		 */
+		if (READ_ONCE(meta->cache) != s ||
+		    READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
+			continue;
+
+		raw_spin_lock_irqsave(&meta->lock, flags);
+		in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
+		raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+		if (in_use) {
+			/*
+			 * This cache still has allocations, and we should not
+			 * release them back into the freelist so they can still
+			 * safely be used and retain the kernel's default
+			 * behaviour of keeping the allocations alive (leak the
+			 * cache); however, they effectively become "zombie
+			 * allocations" as the KFENCE objects are the only ones
+			 * still in use and the owning cache is being destroyed.
+			 *
+			 * We mark them freed, so that any subsequent use shows
+			 * more useful error messages that will include stack
+			 * traces of the user of the object, the original
+			 * allocation, and caller to shutdown_cache().
+			 */
+			kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
+		}
+	}
+
+	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+		meta = &kfence_metadata[i];
+
+		/* See above. */
+		if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
+			continue;
+
+		raw_spin_lock_irqsave(&meta->lock, flags);
+		if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
+			meta->cache = NULL;
+		raw_spin_unlock_irqrestore(&meta->lock, flags);
+	}
+}
+
+void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
+{
+	/*
+	 * allocation_gate only needs to become non-zero, so it doesn't make
+	 * sense to continue writing to it and pay the associated contention
+	 * cost, in case we have a large number of concurrent allocations.
+	 */
+	if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
+		return NULL;
+
+	if (!READ_ONCE(kfence_enabled))
+		return NULL;
+
+	if (size > PAGE_SIZE)
+		return NULL;
+
+	return kfence_guarded_alloc(s, size, flags);
+}
+
+size_t kfence_ksize(const void *addr)
+{
+	const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+	/*
+	 * Read locklessly -- if there is a race with __kfence_alloc(), this is
+	 * either a use-after-free or invalid access.
+	 */
+	return meta ? meta->size : 0;
+}
+
+void *kfence_object_start(const void *addr)
+{
+	const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+	/*
+	 * Read locklessly -- if there is a race with __kfence_alloc(), this is
+	 * either a use-after-free or invalid access.
+	 */
+	return meta ? (void *)meta->addr : NULL;
+}
+
+void __kfence_free(void *addr)
+{
+	struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+	/*
+	 * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
+	 * the object, as the object page may be recycled for other-typed
+	 * objects once it has been freed. meta->cache may be NULL if the cache
+	 * was destroyed.
+	 */
+	if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU)))
+		call_rcu(&meta->rcu_head, rcu_guarded_free);
+	else
+		kfence_guarded_free(addr, meta, false);
+}
+
+bool kfence_handle_page_fault(unsigned long addr)
+{
+	const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
+	struct kfence_metadata *to_report = NULL;
+	enum kfence_error_type error_type;
+	unsigned long flags;
+
+	if (!is_kfence_address((void *)addr))
+		return false;
+
+	if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
+		return kfence_unprotect(addr); /* ... unprotect and proceed. */
+
+	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+
+	if (page_index % 2) {
+		/* This is a redzone, report a buffer overflow. */
+		struct kfence_metadata *meta;
+		int distance = 0;
+
+		meta = addr_to_metadata(addr - PAGE_SIZE);
+		if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+			to_report = meta;
+			/* Data race ok; distance calculation approximate. */
+			distance = addr - data_race(meta->addr + meta->size);
+		}
+
+		meta = addr_to_metadata(addr + PAGE_SIZE);
+		if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+			/* Data race ok; distance calculation approximate. */
+			if (!to_report || distance > data_race(meta->addr) - addr)
+				to_report = meta;
+		}
+
+		if (!to_report)
+			goto out;
+
+		raw_spin_lock_irqsave(&to_report->lock, flags);
+		to_report->unprotected_page = addr;
+		error_type = KFENCE_ERROR_OOB;
+
+		/*
+		 * If the object was freed before we took the look we can still
+		 * report this as an OOB -- the report will simply show the
+		 * stacktrace of the free as well.
+		 */
+	} else {
+		to_report = addr_to_metadata(addr);
+		if (!to_report)
+			goto out;
+
+		raw_spin_lock_irqsave(&to_report->lock, flags);
+		error_type = KFENCE_ERROR_UAF;
+		/*
+		 * We may race with __kfence_alloc(), and it is possible that a
+		 * freed object may be reallocated. We simply report this as a
+		 * use-after-free, with the stack trace showing the place where
+		 * the object was re-allocated.
+		 */
+	}
+
+out:
+	if (to_report) {
+		kfence_report_error(addr, to_report, error_type);
+		raw_spin_unlock_irqrestore(&to_report->lock, flags);
+	} else {
+		/* This may be a UAF or OOB access, but we can't be sure. */
+		kfence_report_error(addr, NULL, KFENCE_ERROR_INVALID);
+	}
+
+	return kfence_unprotect(addr); /* Unprotect and let access proceed. */
+}
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
new file mode 100644
index 000000000000..1014060f9707
--- /dev/null
+++ b/mm/kfence/kfence.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Kernel Electric-Fence (KFENCE). For more info please see
+ * Documentation/dev-tools/kfence.rst.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef MM_KFENCE_KFENCE_H
+#define MM_KFENCE_KFENCE_H
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "../slab.h" /* for struct kmem_cache */
+
+/* For non-debug builds, avoid leaking kernel pointers into dmesg. */
+#ifdef CONFIG_DEBUG_KERNEL
+#define PTR_FMT "%px"
+#else
+#define PTR_FMT "%p"
+#endif
+
+/*
+ * Get the canary byte pattern for @addr. Use a pattern that varies based on the
+ * lower 3 bits of the address, to detect memory corruptions with higher
+ * probability, where similar constants are used.
+ */
+#define KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
+
+/* Maximum stack depth for reports. */
+#define KFENCE_STACK_DEPTH 64
+
+/* KFENCE object states. */
+enum kfence_object_state {
+	KFENCE_OBJECT_UNUSED,		/* Object is unused. */
+	KFENCE_OBJECT_ALLOCATED,	/* Object is currently allocated. */
+	KFENCE_OBJECT_FREED,		/* Object was allocated, and then freed. */
+};
+
+/* Alloc/free tracking information. */
+struct kfence_track {
+	pid_t pid;
+	int num_stack_entries;
+	unsigned long stack_entries[KFENCE_STACK_DEPTH];
+};
+
+/* KFENCE metadata per guarded allocation. */
+struct kfence_metadata {
+	struct list_head list;		/* Freelist node; access under kfence_freelist_lock. */
+	struct rcu_head rcu_head;	/* For delayed freeing. */
+
+	/*
+	 * Lock protecting below data; to ensure consistency of the below data,
+	 * since the following may execute concurrently: __kfence_alloc(),
+	 * __kfence_free(), kfence_handle_page_fault(). However, note that we
+	 * cannot grab the same metadata off the freelist twice, and multiple
+	 * __kfence_alloc() cannot run concurrently on the same metadata.
+	 */
+	raw_spinlock_t lock;
+
+	/* The current state of the object; see above. */
+	enum kfence_object_state state;
+
+	/*
+	 * Allocated object address; cannot be calculated from size, because of
+	 * alignment requirements.
+	 *
+	 * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant.
+	 */
+	unsigned long addr;
+
+	/*
+	 * The size of the original allocation.
+	 */
+	size_t size;
+
+	/*
+	 * The kmem_cache cache of the last allocation; NULL if never allocated
+	 * or the cache has already been destroyed.
+	 */
+	struct kmem_cache *cache;
+
+	/*
+	 * In case of an invalid access, the page that was unprotected; we
+	 * optimistically only store one address.
+	 */
+	unsigned long unprotected_page;
+
+	/* Allocation and free stack information. */
+	struct kfence_track alloc_track;
+	struct kfence_track free_track;
+};
+
+extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+
+/* KFENCE error types for report generation. */
+enum kfence_error_type {
+	KFENCE_ERROR_OOB,		/* Detected a out-of-bounds access. */
+	KFENCE_ERROR_UAF,		/* Detected a use-after-free access. */
+	KFENCE_ERROR_CORRUPTION,	/* Detected a memory corruption on free. */
+	KFENCE_ERROR_INVALID,		/* Invalid access of unknown type. */
+	KFENCE_ERROR_INVALID_FREE,	/* Invalid free. */
+};
+
+void kfence_report_error(unsigned long address, const struct kfence_metadata *meta,
+			 enum kfence_error_type type);
+
+void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);
+
+#endif /* MM_KFENCE_KFENCE_H */
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
new file mode 100644
index 000000000000..64f27c8d46a3
--- /dev/null
+++ b/mm/kfence/report.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KFENCE reporting.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#include <stdarg.h>
+
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/printk.h>
+#include <linux/seq_file.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/* Helper function to either print to a seq_file or to console. */
+__printf(2, 3)
+static void seq_con_printf(struct seq_file *seq, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	if (seq)
+		seq_vprintf(seq, fmt, args);
+	else
+		vprintk(fmt, args);
+	va_end(args);
+}
+
+/*
+ * Get the number of stack entries to skip to get out of MM internals. @type is
+ * optional, and if set to NULL, assumes an allocation or free stack.
+ */
+static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries,
+			    const enum kfence_error_type *type)
+{
+	char buf[64];
+	int skipnr, fallback = 0;
+	bool is_access_fault = false;
+
+	if (type) {
+		/* Depending on error type, find different stack entries. */
+		switch (*type) {
+		case KFENCE_ERROR_UAF:
+		case KFENCE_ERROR_OOB:
+		case KFENCE_ERROR_INVALID:
+			is_access_fault = true;
+			break;
+		case KFENCE_ERROR_CORRUPTION:
+		case KFENCE_ERROR_INVALID_FREE:
+			break;
+		}
+	}
+
+	for (skipnr = 0; skipnr < num_entries; skipnr++) {
+		int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);
+
+		if (is_access_fault) {
+			if (!strncmp(buf, KFENCE_SKIP_ARCH_FAULT_HANDLER, len))
+				goto found;
+		} else {
+			if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") ||
+			    !strncmp(buf, "__slab_free", len)) {
+				/*
+				 * In case of tail calls from any of the below
+				 * to any of the above.
+				 */
+				fallback = skipnr + 1;
+			}
+
+			/* Also the *_bulk() variants by only checking prefixes. */
+			if (str_has_prefix(buf, "kfree") ||
+			    str_has_prefix(buf, "kmem_cache_free") ||
+			    str_has_prefix(buf, "__kmalloc") ||
+			    str_has_prefix(buf, "kmem_cache_alloc"))
+				goto found;
+		}
+	}
+	if (fallback < num_entries)
+		return fallback;
+found:
+	skipnr++;
+	return skipnr < num_entries ? skipnr : 0;
+}
+
+static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadata *meta,
+			       bool show_alloc)
+{
+	const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track;
+
+	if (track->num_stack_entries) {
+		/* Skip allocation/free internals stack. */
+		int i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL);
+
+		/* stack_trace_seq_print() does not exist; open code our own. */
+		for (; i < track->num_stack_entries; i++)
+			seq_con_printf(seq, " %pS\n", (void *)track->stack_entries[i]);
+	} else {
+		seq_con_printf(seq, " no %s stack\n", show_alloc ? "allocation" : "deallocation");
+	}
+}
+
+void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta)
+{
+	const int size = abs(meta->size);
+	const unsigned long start = meta->addr;
+	const struct kmem_cache *const cache = meta->cache;
+
+	lockdep_assert_held(&meta->lock);
+
+	if (meta->state == KFENCE_OBJECT_UNUSED) {
+		seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata);
+		return;
+	}
+
+	seq_con_printf(seq,
+		       "kfence-#%zd [0x" PTR_FMT "-0x" PTR_FMT
+		       ", size=%d, cache=%s] allocated by task %d:\n",
+		       meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size,
+		       (cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid);
+	kfence_print_stack(seq, meta, true);
+
+	if (meta->state == KFENCE_OBJECT_FREED) {
+		seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid);
+		kfence_print_stack(seq, meta, false);
+	}
+}
+
+/*
+ * Show bytes at @addr that are different from the expected canary values, up to
+ * @max_bytes.
+ */
+static void print_diff_canary(unsigned long address, size_t bytes_to_show,
+			      const struct kfence_metadata *meta)
+{
+	const unsigned long show_until_addr = address + bytes_to_show;
+	const u8 *cur, *end;
+
+	/* Do not show contents of object nor read into following guard page. */
+	end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr)
+						: min(show_until_addr, PAGE_ALIGN(address)));
+
+	pr_cont("[");
+	for (cur = (const u8 *)address; cur < end; cur++) {
+		if (*cur == KFENCE_CANARY_PATTERN(cur))
+			pr_cont(" .");
+		else if (IS_ENABLED(CONFIG_DEBUG_KERNEL))
+			pr_cont(" 0x%02x", *cur);
+		else /* Do not leak kernel memory in non-debug builds. */
+			pr_cont(" !");
+	}
+	pr_cont(" ]");
+}
+
+void kfence_report_error(unsigned long address, const struct kfence_metadata *meta,
+			 enum kfence_error_type type)
+{
+	unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
+	int num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1);
+	int skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type);
+	const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1;
+
+	/* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */
+	if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta))
+		return;
+
+	if (meta)
+		lockdep_assert_held(&meta->lock);
+	/*
+	 * Because we may generate reports in printk-unfriendly parts of the
+	 * kernel, such as scheduler code, the use of printk() could deadlock.
+	 * Until such time that all printing code here is safe in all parts of
+	 * the kernel, accept the risk, and just get our message out (given the
+	 * system might already behave unpredictably due to the memory error).
+	 * As such, also disable lockdep to hide warnings, and avoid disabling
+	 * lockdep for the rest of the kernel.
+	 */
+	lockdep_off();
+
+	pr_err("==================================================================\n");
+	/* Print report header. */
+	switch (type) {
+	case KFENCE_ERROR_OOB: {
+		const bool left_of_object = address < meta->addr;
+
+		pr_err("BUG: KFENCE: out-of-bounds in %pS\n\n", (void *)stack_entries[skipnr]);
+		pr_err("Out-of-bounds access at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n",
+		       (void *)address,
+		       left_of_object ? meta->addr - address : address - meta->addr,
+		       left_of_object ? "left" : "right", object_index);
+		break;
+	}
+	case KFENCE_ERROR_UAF:
+		pr_err("BUG: KFENCE: use-after-free in %pS\n\n", (void *)stack_entries[skipnr]);
+		pr_err("Use-after-free access at 0x" PTR_FMT " (in kfence-#%zd):\n",
+		       (void *)address, object_index);
+		break;
+	case KFENCE_ERROR_CORRUPTION:
+		pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
+		pr_err("Corrupted memory at 0x" PTR_FMT " ", (void *)address);
+		print_diff_canary(address, 16, meta);
+		pr_cont(" (in kfence-#%zd):\n", object_index);
+		break;
+	case KFENCE_ERROR_INVALID:
+		pr_err("BUG: KFENCE: invalid access in %pS\n\n", (void *)stack_entries[skipnr]);
+		pr_err("Invalid access at 0x" PTR_FMT ":\n", (void *)address);
+		break;
+	case KFENCE_ERROR_INVALID_FREE:
+		pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
+		pr_err("Invalid free of 0x" PTR_FMT " (in kfence-#%zd):\n", (void *)address,
+		       object_index);
+		break;
+	}
+
+	/* Print stack trace and object info. */
+	stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0);
+
+	if (meta) {
+		pr_err("\n");
+		kfence_print_object(NULL, meta);
+	}
+
+	/* Print report footer. */
+	pr_err("\n");
+	dump_stack_print_info(KERN_ERR);
+	pr_err("==================================================================\n");
+
+	lockdep_on();
+
+	if (panic_on_warn)
+		panic("panic_on_warn set ...\n");
+
+	/* We encountered a memory unsafety error, taint the kernel! */
+	add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
+}
-- 
cgit v1.2.3


From d438fabce7860df3cb9337776be6f90b59ced8ed Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 25 Feb 2021 17:19:08 -0800
Subject: kfence: use pt_regs to generate stack trace on faults

Instead of removing the fault handling portion of the stack trace based on
the fault handler's name, just use struct pt_regs directly.

Change kfence_handle_page_fault() to take a struct pt_regs, and plumb it
through to kfence_report_error() for out-of-bounds, use-after-free, or
invalid access errors, where pt_regs is used to generate the stack trace.

If the kernel is a DEBUG_KERNEL, also show registers for more information.

Link: https://lkml.kernel.org/r/20201105092133.2075331-1-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Suggested-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/kfence.h |  2 --
 arch/arm64/mm/fault.c           |  2 +-
 arch/x86/include/asm/kfence.h   |  6 ----
 arch/x86/mm/fault.c             |  2 +-
 include/linux/kfence.h          |  5 ++--
 mm/kfence/core.c                | 10 +++----
 mm/kfence/kfence.h              |  4 +--
 mm/kfence/report.c              | 63 +++++++++++++++++++++++------------------
 8 files changed, 48 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h
index 42a06f83850a..d061176d57ea 100644
--- a/arch/arm64/include/asm/kfence.h
+++ b/arch/arm64/include/asm/kfence.h
@@ -10,8 +10,6 @@
 
 #include <asm/cacheflush.h>
 
-#define KFENCE_SKIP_ARCH_FAULT_HANDLER "el1_sync"
-
 static inline bool arch_kfence_init_pool(void) { return true; }
 
 static inline bool kfence_protect_page(unsigned long addr, bool protect)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 42515900ab2e..56d9423ca59c 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -390,7 +390,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (kfence_handle_page_fault(addr))
+		if (kfence_handle_page_fault(addr, regs))
 			return;
 
 		msg = "paging request";
diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
index a0659dbd93ea..97bbb4a9083a 100644
--- a/arch/x86/include/asm/kfence.h
+++ b/arch/x86/include/asm/kfence.h
@@ -16,12 +16,6 @@
 #include <asm/set_memory.h>
 #include <asm/tlbflush.h>
 
-/*
- * The page fault handler entry function, up to which the stack trace is
- * truncated in reports.
- */
-#define KFENCE_SKIP_ARCH_FAULT_HANDLER "asm_exc_page_fault"
-
 /* Force 4K pages for __kfence_pool. */
 static inline bool arch_kfence_init_pool(void)
 {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 99fe6d3e690d..38868b4ce8b0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -682,7 +682,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		efi_crash_gracefully_on_page_fault(address);
 
 	/* Only not-present faults should be handled by KFENCE. */
-	if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address))
+	if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, regs))
 		return;
 
 oops:
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 81f3911cb298..5a56bcf5606c 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -186,6 +186,7 @@ static __always_inline __must_check bool kfence_free(void *addr)
 /**
  * kfence_handle_page_fault() - perform page fault handling for KFENCE pages
  * @addr: faulting address
+ * @regs: current struct pt_regs (can be NULL, but shows full stack trace)
  *
  * Return:
  * * false - address outside KFENCE pool,
@@ -196,7 +197,7 @@ static __always_inline __must_check bool kfence_free(void *addr)
  * cases KFENCE prints an error message and marks the offending page as
  * present, so that the kernel can proceed.
  */
-bool __must_check kfence_handle_page_fault(unsigned long addr);
+bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs);
 
 #else /* CONFIG_KFENCE */
 
@@ -209,7 +210,7 @@ static inline size_t kfence_ksize(const void *addr) { return 0; }
 static inline void *kfence_object_start(const void *addr) { return NULL; }
 static inline void __kfence_free(void *addr) { }
 static inline bool __must_check kfence_free(void *addr) { return false; }
-static inline bool __must_check kfence_handle_page_fault(unsigned long addr) { return false; }
+static inline bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { return false; }
 
 #endif
 
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index d6a32c13336b..61c76670a7a9 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -216,7 +216,7 @@ static inline bool check_canary_byte(u8 *addr)
 		return true;
 
 	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
-	kfence_report_error((unsigned long)addr, addr_to_metadata((unsigned long)addr),
+	kfence_report_error((unsigned long)addr, NULL, addr_to_metadata((unsigned long)addr),
 			    KFENCE_ERROR_CORRUPTION);
 	return false;
 }
@@ -351,7 +351,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 	if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
 		/* Invalid or double-free, bail out. */
 		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
-		kfence_report_error((unsigned long)addr, meta, KFENCE_ERROR_INVALID_FREE);
+		kfence_report_error((unsigned long)addr, NULL, meta, KFENCE_ERROR_INVALID_FREE);
 		raw_spin_unlock_irqrestore(&meta->lock, flags);
 		return;
 	}
@@ -766,7 +766,7 @@ void __kfence_free(void *addr)
 		kfence_guarded_free(addr, meta, false);
 }
 
-bool kfence_handle_page_fault(unsigned long addr)
+bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs)
 {
 	const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
 	struct kfence_metadata *to_report = NULL;
@@ -829,11 +829,11 @@ bool kfence_handle_page_fault(unsigned long addr)
 
 out:
 	if (to_report) {
-		kfence_report_error(addr, to_report, error_type);
+		kfence_report_error(addr, regs, to_report, error_type);
 		raw_spin_unlock_irqrestore(&to_report->lock, flags);
 	} else {
 		/* This may be a UAF or OOB access, but we can't be sure. */
-		kfence_report_error(addr, NULL, KFENCE_ERROR_INVALID);
+		kfence_report_error(addr, regs, NULL, KFENCE_ERROR_INVALID);
 	}
 
 	return kfence_unprotect(addr); /* Unprotect and let access proceed. */
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 1014060f9707..0d83e628a97d 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -105,8 +105,8 @@ enum kfence_error_type {
 	KFENCE_ERROR_INVALID_FREE,	/* Invalid free. */
 };
 
-void kfence_report_error(unsigned long address, const struct kfence_metadata *meta,
-			 enum kfence_error_type type);
+void kfence_report_error(unsigned long address, struct pt_regs *regs,
+			 const struct kfence_metadata *meta, enum kfence_error_type type);
 
 void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);
 
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 64f27c8d46a3..4dbfa9a382e4 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/lockdep.h>
 #include <linux/printk.h>
+#include <linux/sched/debug.h>
 #include <linux/seq_file.h>
 #include <linux/stacktrace.h>
 #include <linux/string.h>
@@ -41,7 +42,6 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
 {
 	char buf[64];
 	int skipnr, fallback = 0;
-	bool is_access_fault = false;
 
 	if (type) {
 		/* Depending on error type, find different stack entries. */
@@ -49,8 +49,12 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
 		case KFENCE_ERROR_UAF:
 		case KFENCE_ERROR_OOB:
 		case KFENCE_ERROR_INVALID:
-			is_access_fault = true;
-			break;
+			/*
+			 * kfence_handle_page_fault() may be called with pt_regs
+			 * set to NULL; in that case we'll simply show the full
+			 * stack trace.
+			 */
+			return 0;
 		case KFENCE_ERROR_CORRUPTION:
 		case KFENCE_ERROR_INVALID_FREE:
 			break;
@@ -60,26 +64,21 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
 	for (skipnr = 0; skipnr < num_entries; skipnr++) {
 		int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);
 
-		if (is_access_fault) {
-			if (!strncmp(buf, KFENCE_SKIP_ARCH_FAULT_HANDLER, len))
-				goto found;
-		} else {
-			if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") ||
-			    !strncmp(buf, "__slab_free", len)) {
-				/*
-				 * In case of tail calls from any of the below
-				 * to any of the above.
-				 */
-				fallback = skipnr + 1;
-			}
-
-			/* Also the *_bulk() variants by only checking prefixes. */
-			if (str_has_prefix(buf, "kfree") ||
-			    str_has_prefix(buf, "kmem_cache_free") ||
-			    str_has_prefix(buf, "__kmalloc") ||
-			    str_has_prefix(buf, "kmem_cache_alloc"))
-				goto found;
+		if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") ||
+		    !strncmp(buf, "__slab_free", len)) {
+			/*
+			 * In case of tail calls from any of the below
+			 * to any of the above.
+			 */
+			fallback = skipnr + 1;
 		}
+
+		/* Also the *_bulk() variants by only checking prefixes. */
+		if (str_has_prefix(buf, "kfree") ||
+		    str_has_prefix(buf, "kmem_cache_free") ||
+		    str_has_prefix(buf, "__kmalloc") ||
+		    str_has_prefix(buf, "kmem_cache_alloc"))
+			goto found;
 	}
 	if (fallback < num_entries)
 		return fallback;
@@ -157,13 +156,20 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show,
 	pr_cont(" ]");
 }
 
-void kfence_report_error(unsigned long address, const struct kfence_metadata *meta,
-			 enum kfence_error_type type)
+void kfence_report_error(unsigned long address, struct pt_regs *regs,
+			 const struct kfence_metadata *meta, enum kfence_error_type type)
 {
 	unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
-	int num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1);
-	int skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type);
 	const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1;
+	int num_stack_entries;
+	int skipnr = 0;
+
+	if (regs) {
+		num_stack_entries = stack_trace_save_regs(regs, stack_entries, KFENCE_STACK_DEPTH, 0);
+	} else {
+		num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1);
+		skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type);
+	}
 
 	/* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */
 	if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta))
@@ -227,7 +233,10 @@ void kfence_report_error(unsigned long address, const struct kfence_metadata *me
 
 	/* Print report footer. */
 	pr_err("\n");
-	dump_stack_print_info(KERN_ERR);
+	if (IS_ENABLED(CONFIG_DEBUG_KERNEL) && regs)
+		show_regs(regs);
+	else
+		dump_stack_print_info(KERN_ERR);
 	pr_err("==================================================================\n");
 
 	lockdep_on();
-- 
cgit v1.2.3


From d3fb45f370d927224af35d22d34ea465884afec8 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 25 Feb 2021 17:19:11 -0800
Subject: mm, kfence: insert KFENCE hooks for SLAB

Inserts KFENCE hooks into the SLAB allocator.

To pass the originally requested size to KFENCE, add an argument
'orig_size' to slab_alloc*(). The additional argument is required to
preserve the requested original size for kmalloc() allocations, which
uses size classes (e.g. an allocation of 272 bytes will return an object
of size 512). Therefore, kmem_cache::size does not represent the
kmalloc-caller's requested size, and we must introduce the argument
'orig_size' to propagate the originally requested size to KFENCE.

Without the originally requested size, we would not be able to detect
out-of-bounds accesses for objects placed at the end of a KFENCE object
page if that object is not equal to the kmalloc-size class it was
bucketed into.

When KFENCE is disabled, there is no additional overhead, since
slab_alloc*() functions are __always_inline.

Link: https://lkml.kernel.org/r/20201103175841.3495947-5-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Co-developed-by: Marco Elver <elver@google.com>

Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Joern Engel <joern@purestorage.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: SeongJae Park <sjpark@amazon.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab_def.h |  3 +++
 mm/kfence/core.c         |  2 ++
 mm/slab.c                | 38 +++++++++++++++++++++++++++++---------
 mm/slab_common.c         |  5 ++++-
 4 files changed, 38 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 9eb430c163c2..3aa5e1e73ab6 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_SLAB_DEF_H
 #define	_LINUX_SLAB_DEF_H
 
+#include <linux/kfence.h>
 #include <linux/reciprocal_div.h>
 
 /*
@@ -114,6 +115,8 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
 static inline int objs_per_slab_page(const struct kmem_cache *cache,
 				     const struct page *page)
 {
+	if (is_kfence_address(page_address(page)))
+		return 1;
 	return cache->num;
 }
 
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 61c76670a7a9..05c18aa11851 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -317,6 +317,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	/* Set required struct page fields. */
 	page = virt_to_page(meta->addr);
 	page->slab_cache = cache;
+	if (IS_ENABLED(CONFIG_SLAB))
+		page->s_mem = addr;
 
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
diff --git a/mm/slab.c b/mm/slab.c
index 35c68d99d460..51fd424e0d6d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -100,6 +100,7 @@
 #include	<linux/seq_file.h>
 #include	<linux/notifier.h>
 #include	<linux/kallsyms.h>
+#include	<linux/kfence.h>
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
@@ -3208,7 +3209,7 @@ must_grow:
 }
 
 static __always_inline void *
-slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
 		   unsigned long caller)
 {
 	unsigned long save_flags;
@@ -3221,6 +3222,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	if (unlikely(!cachep))
 		return NULL;
 
+	ptr = kfence_alloc(cachep, orig_size, flags);
+	if (unlikely(ptr))
+		goto out_hooks;
+
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
 
@@ -3253,6 +3258,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
 		memset(ptr, 0, cachep->object_size);
 
+out_hooks:
 	slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
 	return ptr;
 }
@@ -3290,7 +3296,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 #endif /* CONFIG_NUMA */
 
 static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
 {
 	unsigned long save_flags;
 	void *objp;
@@ -3301,6 +3307,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 	if (unlikely(!cachep))
 		return NULL;
 
+	objp = kfence_alloc(cachep, orig_size, flags);
+	if (unlikely(objp))
+		goto out;
+
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
 	objp = __do_cache_alloc(cachep, flags);
@@ -3311,6 +3321,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
 		memset(objp, 0, cachep->object_size);
 
+out:
 	slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
 	return objp;
 }
@@ -3416,6 +3427,12 @@ free_done:
 static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
 					 unsigned long caller)
 {
+	if (is_kfence_address(objp)) {
+		kmemleak_free_recursive(objp, cachep->flags);
+		__kfence_free(objp);
+		return;
+	}
+
 	if (unlikely(slab_want_init_on_free(cachep)))
 		memset(objp, 0, cachep->object_size);
 
@@ -3482,7 +3499,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
  */
 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
-	void *ret = slab_alloc(cachep, flags, _RET_IP_);
+	void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_);
 
 	trace_kmem_cache_alloc(_RET_IP_, ret,
 			       cachep->object_size, cachep->size, flags);
@@ -3515,7 +3532,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 
 	local_irq_disable();
 	for (i = 0; i < size; i++) {
-		void *objp = __do_cache_alloc(s, flags);
+		void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);
 
 		if (unlikely(!objp))
 			goto error;
@@ -3548,7 +3565,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
 {
 	void *ret;
 
-	ret = slab_alloc(cachep, flags, _RET_IP_);
+	ret = slab_alloc(cachep, flags, size, _RET_IP_);
 
 	ret = kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc(_RET_IP_, ret,
@@ -3574,7 +3591,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
  */
 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
-	void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+	void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
 
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    cachep->object_size, cachep->size,
@@ -3592,7 +3609,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
 {
 	void *ret;
 
-	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+	ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
 
 	ret = kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc_node(_RET_IP_, ret,
@@ -3673,7 +3690,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 	cachep = kmalloc_slab(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
-	ret = slab_alloc(cachep, flags, caller);
+	ret = slab_alloc(cachep, flags, size, caller);
 
 	ret = kasan_kmalloc(cachep, ret, size, flags);
 	trace_kmalloc(caller, ret,
@@ -4172,7 +4189,10 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 	BUG_ON(objnr >= cachep->num);
 
 	/* Find offset within object. */
-	offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+	if (is_kfence_address(ptr))
+		offset = ptr - kfence_object_start(ptr);
+	else
+		offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
 
 	/* Allow address range falling entirely within usercopy region. */
 	if (offset >= cachep->useroffset &&
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 7c8298c17145..284954ef1da5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -12,6 +12,7 @@
 #include <linux/memory.h>
 #include <linux/cache.h>
 #include <linux/compiler.h>
+#include <linux/kfence.h>
 #include <linux/module.h>
 #include <linux/cpu.h>
 #include <linux/uaccess.h>
@@ -430,6 +431,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
 	rcu_barrier();
 
 	list_for_each_entry_safe(s, s2, &to_destroy, list) {
+		kfence_shutdown_cache(s);
 #ifdef SLAB_SUPPORTS_SYSFS
 		sysfs_slab_release(s);
 #else
@@ -455,6 +457,7 @@ static int shutdown_cache(struct kmem_cache *s)
 		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
 		schedule_work(&slab_caches_to_rcu_destroy_work);
 	} else {
+		kfence_shutdown_cache(s);
 #ifdef SLAB_SUPPORTS_SYSFS
 		sysfs_slab_unlink(s);
 		sysfs_slab_release(s);
@@ -1235,7 +1238,7 @@ size_t ksize(const void *objp)
 	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
 		return 0;
 
-	size = __ksize(objp);
+	size = kfence_ksize(objp) ?: __ksize(objp);
 	/*
 	 * We assume that ksize callers could use whole allocated area,
 	 * so we need to unpoison this area.
-- 
cgit v1.2.3


From b89fb5ef0ce611b5db8eb9d3a5a7fcaab2cbe9e4 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 25 Feb 2021 17:19:16 -0800
Subject: mm, kfence: insert KFENCE hooks for SLUB

Inserts KFENCE hooks into the SLUB allocator.

To pass the originally requested size to KFENCE, add an argument
'orig_size' to slab_alloc*(). The additional argument is required to
preserve the requested original size for kmalloc() allocations, which
uses size classes (e.g. an allocation of 272 bytes will return an object
of size 512). Therefore, kmem_cache::size does not represent the
kmalloc-caller's requested size, and we must introduce the argument
'orig_size' to propagate the originally requested size to KFENCE.

Without the originally requested size, we would not be able to detect
out-of-bounds accesses for objects placed at the end of a KFENCE object
page if that object is not equal to the kmalloc-size class it was
bucketed into.

When KFENCE is disabled, there is no additional overhead, since
slab_alloc*() functions are __always_inline.

Link: https://lkml.kernel.org/r/20201103175841.3495947-6-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Jann Horn <jannh@google.com>
Co-developed-by: Marco Elver <elver@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joern Engel <joern@purestorage.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: SeongJae Park <sjpark@amazon.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slub_def.h |  3 +++
 mm/kfence/core.c         |  2 ++
 mm/slub.c                | 60 +++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 51 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 1be0ed5befa1..dcde82a4434c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -7,6 +7,7 @@
  *
  * (C) 2007 SGI, Christoph Lameter
  */
+#include <linux/kfence.h>
 #include <linux/kobject.h>
 #include <linux/reciprocal_div.h>
 
@@ -185,6 +186,8 @@ static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
 					const struct page *page, void *obj)
 {
+	if (is_kfence_address(obj))
+		return 0;
 	return __obj_to_index(cache, page_address(page), obj);
 }
 
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 05c18aa11851..7692af715fdb 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -317,6 +317,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	/* Set required struct page fields. */
 	page = virt_to_page(meta->addr);
 	page->slab_cache = cache;
+	if (IS_ENABLED(CONFIG_SLUB))
+		page->objects = 1;
 	if (IS_ENABLED(CONFIG_SLAB))
 		page->s_mem = addr;
 
diff --git a/mm/slub.c b/mm/slub.c
index b2833ce85c92..383616af28c4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -27,6 +27,7 @@
 #include <linux/ctype.h>
 #include <linux/debugobjects.h>
 #include <linux/kallsyms.h>
+#include <linux/kfence.h>
 #include <linux/memory.h>
 #include <linux/math64.h>
 #include <linux/fault-inject.h>
@@ -1570,6 +1571,11 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
 	void *old_tail = *tail ? *tail : *head;
 	int rsize;
 
+	if (is_kfence_address(next)) {
+		slab_free_hook(s, next);
+		return true;
+	}
+
 	/* Head and tail of the reconstructed freelist */
 	*head = NULL;
 	*tail = NULL;
@@ -2809,7 +2815,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
  * Otherwise we can simply pick the next object from the lockless free list.
  */
 static __always_inline void *slab_alloc_node(struct kmem_cache *s,
-		gfp_t gfpflags, int node, unsigned long addr)
+		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
 {
 	void *object;
 	struct kmem_cache_cpu *c;
@@ -2820,6 +2826,11 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 	s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
 	if (!s)
 		return NULL;
+
+	object = kfence_alloc(s, orig_size, gfpflags);
+	if (unlikely(object))
+		goto out;
+
 redo:
 	/*
 	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -2892,20 +2903,21 @@ redo:
 	if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
 		memset(kasan_reset_tag(object), 0, s->object_size);
 
+out:
 	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
 
 	return object;
 }
 
 static __always_inline void *slab_alloc(struct kmem_cache *s,
-		gfp_t gfpflags, unsigned long addr)
+		gfp_t gfpflags, unsigned long addr, size_t orig_size)
 {
-	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);
 }
 
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
 {
-	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+	void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size);
 
 	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
 				s->size, gfpflags);
@@ -2917,7 +2929,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
 #ifdef CONFIG_TRACING
 void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 {
-	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+	void *ret = slab_alloc(s, gfpflags, _RET_IP_, size);
 	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
 	ret = kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
@@ -2928,7 +2940,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
 {
-	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size);
 
 	trace_kmem_cache_alloc_node(_RET_IP_, ret,
 				    s->object_size, s->size, gfpflags, node);
@@ -2942,7 +2954,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 				    gfp_t gfpflags,
 				    int node, size_t size)
 {
-	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+	void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size);
 
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, s->size, gfpflags, node);
@@ -2976,6 +2988,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 
 	stat(s, FREE_SLOWPATH);
 
+	if (kfence_free(head))
+		return;
+
 	if (kmem_cache_debug(s) &&
 	    !free_debug_processing(s, page, head, tail, cnt, addr))
 		return;
@@ -3220,6 +3235,13 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
 		df->s = cache_from_obj(s, object); /* Support for memcg */
 	}
 
+	if (is_kfence_address(object)) {
+		slab_free_hook(df->s, object);
+		__kfence_free(object);
+		p[size] = NULL; /* mark object processed */
+		return size;
+	}
+
 	/* Start new detached freelist */
 	df->page = page;
 	set_freepointer(df->s, object, NULL);
@@ -3295,8 +3317,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	c = this_cpu_ptr(s->cpu_slab);
 
 	for (i = 0; i < size; i++) {
-		void *object = c->freelist;
+		void *object = kfence_alloc(s, s->object_size, flags);
 
+		if (unlikely(object)) {
+			p[i] = object;
+			continue;
+		}
+
+		object = c->freelist;
 		if (unlikely(!object)) {
 			/*
 			 * We may have removed an object from c->freelist using
@@ -4021,7 +4049,7 @@ void *__kmalloc(size_t size, gfp_t flags)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc(s, flags, _RET_IP_);
+	ret = slab_alloc(s, flags, _RET_IP_, size);
 
 	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
 
@@ -4069,7 +4097,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc_node(s, flags, node, _RET_IP_);
+	ret = slab_alloc_node(s, flags, node, _RET_IP_, size);
 
 	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
 
@@ -4095,6 +4123,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 	struct kmem_cache *s;
 	unsigned int offset;
 	size_t object_size;
+	bool is_kfence = is_kfence_address(ptr);
 
 	ptr = kasan_reset_tag(ptr);
 
@@ -4107,10 +4136,13 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 			       to_user, 0, n);
 
 	/* Find offset within object. */
-	offset = (ptr - page_address(page)) % s->size;
+	if (is_kfence)
+		offset = ptr - kfence_object_start(ptr);
+	else
+		offset = (ptr - page_address(page)) % s->size;
 
 	/* Adjust for redzone and reject if within the redzone. */
-	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
+	if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
 		if (offset < s->red_left_pad)
 			usercopy_abort("SLUB object in left red zone",
 				       s->name, to_user, offset, n);
@@ -4527,7 +4559,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc(s, gfpflags, caller);
+	ret = slab_alloc(s, gfpflags, caller, size);
 
 	/* Honor the call site pointer we received. */
 	trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4558,7 +4590,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	ret = slab_alloc_node(s, gfpflags, node, caller);
+	ret = slab_alloc_node(s, gfpflags, node, caller, size);
 
 	/* Honor the call site pointer we received. */
 	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
-- 
cgit v1.2.3


From bc8fbc5f305aecf63423da91e5faf4c0ce40bf38 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 25 Feb 2021 17:19:31 -0800
Subject: kfence: add test suite

Add KFENCE test suite, testing various error detection scenarios. Makes
use of KUnit for test organization. Since KFENCE's interface to obtain
error reports is via the console, the test verifies that KFENCE outputs
expected reports to the console.

[elver@google.com: fix typo in test]
  Link: https://lkml.kernel.org/r/X9lHQExmHGvETxY4@elver.google.com
[elver@google.com: show access type in report]
  Link: https://lkml.kernel.org/r/20210111091544.3287013-2-elver@google.com

Link: https://lkml.kernel.org/r/20201103175841.3495947-9-elver@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Co-developed-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Jann Horn <jannh@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joern Engel <joern@purestorage.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: SeongJae Park <sjpark@amazon.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/dev-tools/kfence.rst |  12 +-
 arch/arm64/mm/fault.c              |   2 +-
 arch/x86/mm/fault.c                |   3 +-
 include/linux/kfence.h             |   9 +-
 lib/Kconfig.kfence                 |  13 +
 mm/kfence/Makefile                 |   3 +
 mm/kfence/core.c                   |  11 +-
 mm/kfence/kfence.h                 |   2 +-
 mm/kfence/kfence_test.c            | 858 +++++++++++++++++++++++++++++++++++++
 mm/kfence/report.c                 |  27 +-
 10 files changed, 915 insertions(+), 25 deletions(-)
 create mode 100644 mm/kfence/kfence_test.c

(limited to 'include/linux')

diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
index 0e2fb6ef3016..58a0a5fa1ddc 100644
--- a/Documentation/dev-tools/kfence.rst
+++ b/Documentation/dev-tools/kfence.rst
@@ -65,9 +65,9 @@ Error reports
 A typical out-of-bounds access looks like this::
 
     ==================================================================
-    BUG: KFENCE: out-of-bounds in test_out_of_bounds_read+0xa3/0x22b
+    BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa3/0x22b
 
-    Out-of-bounds access at 0xffffffffb672efff (1B left of kfence-#17):
+    Out-of-bounds read at 0xffffffffb672efff (1B left of kfence-#17):
      test_out_of_bounds_read+0xa3/0x22b
      kunit_try_run_case+0x51/0x85
      kunit_generic_run_threadfn_adapter+0x16/0x30
@@ -94,9 +94,9 @@ its origin. Note that, real kernel addresses are only shown for
 Use-after-free accesses are reported as::
 
     ==================================================================
-    BUG: KFENCE: use-after-free in test_use_after_free_read+0xb3/0x143
+    BUG: KFENCE: use-after-free read in test_use_after_free_read+0xb3/0x143
 
-    Use-after-free access at 0xffffffffb673dfe0 (in kfence-#24):
+    Use-after-free read at 0xffffffffb673dfe0 (in kfence-#24):
      test_use_after_free_read+0xb3/0x143
      kunit_try_run_case+0x51/0x85
      kunit_generic_run_threadfn_adapter+0x16/0x30
@@ -193,9 +193,9 @@ where it was not possible to determine an associated object, e.g. if adjacent
 object pages had not yet been allocated::
 
     ==================================================================
-    BUG: KFENCE: invalid access in test_invalid_access+0x26/0xe0
+    BUG: KFENCE: invalid read in test_invalid_access+0x26/0xe0
 
-    Invalid access at 0xffffffffb670b00a:
+    Invalid read at 0xffffffffb670b00a:
      test_invalid_access+0x26/0xe0
      kunit_try_run_case+0x51/0x85
      kunit_generic_run_threadfn_adapter+0x16/0x30
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 56d9423ca59c..f37d4e3830b7 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -390,7 +390,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (kfence_handle_page_fault(addr, regs))
+		if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
 			return;
 
 		msg = "paging request";
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 38868b4ce8b0..a73347e2cdfc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -682,7 +682,8 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		efi_crash_gracefully_on_page_fault(address);
 
 	/* Only not-present faults should be handled by KFENCE. */
-	if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, regs))
+	if (!(error_code & X86_PF_PROT) &&
+	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
 		return;
 
 oops:
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 5a56bcf5606c..a70d1ea03532 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -186,6 +186,7 @@ static __always_inline __must_check bool kfence_free(void *addr)
 /**
  * kfence_handle_page_fault() - perform page fault handling for KFENCE pages
  * @addr: faulting address
+ * @is_write: is access a write
  * @regs: current struct pt_regs (can be NULL, but shows full stack trace)
  *
  * Return:
@@ -197,7 +198,7 @@ static __always_inline __must_check bool kfence_free(void *addr)
  * cases KFENCE prints an error message and marks the offending page as
  * present, so that the kernel can proceed.
  */
-bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs);
+bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs);
 
 #else /* CONFIG_KFENCE */
 
@@ -210,7 +211,11 @@ static inline size_t kfence_ksize(const void *addr) { return 0; }
 static inline void *kfence_object_start(const void *addr) { return NULL; }
 static inline void __kfence_free(void *addr) { }
 static inline bool __must_check kfence_free(void *addr) { return false; }
-static inline bool __must_check kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs) { return false; }
+static inline bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write,
+							 struct pt_regs *regs)
+{
+	return false;
+}
 
 #endif
 
diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence
index 605125ac2ae0..78f50ccb3b45 100644
--- a/lib/Kconfig.kfence
+++ b/lib/Kconfig.kfence
@@ -66,4 +66,17 @@ config KFENCE_STRESS_TEST_FAULTS
 
 	  Only for KFENCE testing; set to 0 if you are not a KFENCE developer.
 
+config KFENCE_KUNIT_TEST
+	tristate "KFENCE integration test suite" if !KUNIT_ALL_TESTS
+	default KUNIT_ALL_TESTS
+	depends on TRACEPOINTS && KUNIT
+	help
+	  Test suite for KFENCE, testing various error detection scenarios with
+	  various allocation types, and checking that reports are correctly
+	  output to console.
+
+	  Say Y here if you want the test to be built into the kernel and run
+	  during boot; say M if you want the test to build as a module; say N
+	  if you are unsure.
+
 endif # KFENCE
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
index d991e9a349f0..6872cd5e5390 100644
--- a/mm/kfence/Makefile
+++ b/mm/kfence/Makefile
@@ -1,3 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_KFENCE) := core.o report.o
+
+CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls
+obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 7692af715fdb..cfe3d32ac5b7 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -216,7 +216,7 @@ static inline bool check_canary_byte(u8 *addr)
 		return true;
 
 	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
-	kfence_report_error((unsigned long)addr, NULL, addr_to_metadata((unsigned long)addr),
+	kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr),
 			    KFENCE_ERROR_CORRUPTION);
 	return false;
 }
@@ -355,7 +355,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 	if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
 		/* Invalid or double-free, bail out. */
 		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
-		kfence_report_error((unsigned long)addr, NULL, meta, KFENCE_ERROR_INVALID_FREE);
+		kfence_report_error((unsigned long)addr, false, NULL, meta,
+				    KFENCE_ERROR_INVALID_FREE);
 		raw_spin_unlock_irqrestore(&meta->lock, flags);
 		return;
 	}
@@ -770,7 +771,7 @@ void __kfence_free(void *addr)
 		kfence_guarded_free(addr, meta, false);
 }
 
-bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs)
+bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
 {
 	const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
 	struct kfence_metadata *to_report = NULL;
@@ -833,11 +834,11 @@ bool kfence_handle_page_fault(unsigned long addr, struct pt_regs *regs)
 
 out:
 	if (to_report) {
-		kfence_report_error(addr, regs, to_report, error_type);
+		kfence_report_error(addr, is_write, regs, to_report, error_type);
 		raw_spin_unlock_irqrestore(&to_report->lock, flags);
 	} else {
 		/* This may be a UAF or OOB access, but we can't be sure. */
-		kfence_report_error(addr, regs, NULL, KFENCE_ERROR_INVALID);
+		kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
 	}
 
 	return kfence_unprotect(addr); /* Unprotect and let access proceed. */
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 0d83e628a97d..1accc840dbbe 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -105,7 +105,7 @@ enum kfence_error_type {
 	KFENCE_ERROR_INVALID_FREE,	/* Invalid free. */
 };
 
-void kfence_report_error(unsigned long address, struct pt_regs *regs,
+void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
 			 const struct kfence_metadata *meta, enum kfence_error_type type);
 
 void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
new file mode 100644
index 000000000000..db1bb596acaf
--- /dev/null
+++ b/mm/kfence/kfence_test.c
@@ -0,0 +1,858 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test cases for KFENCE memory safety error detector. Since the interface with
+ * which KFENCE's reports are obtained is via the console, this is the output we
+ * should verify. For each test case checks the presence (or absence) of
+ * generated reports. Relies on 'console' tracepoint to capture reports as they
+ * appear in the kernel log.
+ *
+ * Copyright (C) 2020, Google LLC.
+ * Author: Alexander Potapenko <glider@google.com>
+ *         Marco Elver <elver@google.com>
+ */
+
+#include <kunit/test.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kfence.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tracepoint.h>
+#include <trace/events/printk.h>
+
+#include "kfence.h"
+
+/* Report as observed from console. */
+static struct {
+	spinlock_t lock;
+	int nlines;
+	char lines[2][256];
+} observed = {
+	.lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Probe for console output: obtains observed lines of interest. */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+	unsigned long flags;
+	int nlines;
+
+	spin_lock_irqsave(&observed.lock, flags);
+	nlines = observed.nlines;
+
+	if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) {
+		/*
+		 * KFENCE report and related to the test.
+		 *
+		 * The provided @buf is not NUL-terminated; copy no more than
+		 * @len bytes and let strscpy() add the missing NUL-terminator.
+		 */
+		strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
+		nlines = 1;
+	} else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) {
+		strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
+	}
+
+	WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
+	spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+static bool report_available(void)
+{
+	return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines);
+}
+
+/* Information we expect in a report. */
+struct expect_report {
+	enum kfence_error_type type; /* The type or error. */
+	void *fn; /* Function pointer to expected function where access occurred. */
+	char *addr; /* Address at which the bad access occurred. */
+	bool is_write; /* Is access a write. */
+};
+
+static const char *get_access_type(const struct expect_report *r)
+{
+	return r->is_write ? "write" : "read";
+}
+
+/* Check observed report matches information in @r. */
+static bool report_matches(const struct expect_report *r)
+{
+	bool ret = false;
+	unsigned long flags;
+	typeof(observed.lines) expect;
+	const char *end;
+	char *cur;
+
+	/* Doubled-checked locking. */
+	if (!report_available())
+		return false;
+
+	/* Generate expected report contents. */
+
+	/* Title */
+	cur = expect[0];
+	end = &expect[0][sizeof(expect[0]) - 1];
+	switch (r->type) {
+	case KFENCE_ERROR_OOB:
+		cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s",
+				 get_access_type(r));
+		break;
+	case KFENCE_ERROR_UAF:
+		cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s",
+				 get_access_type(r));
+		break;
+	case KFENCE_ERROR_CORRUPTION:
+		cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption");
+		break;
+	case KFENCE_ERROR_INVALID:
+		cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s",
+				 get_access_type(r));
+		break;
+	case KFENCE_ERROR_INVALID_FREE:
+		cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free");
+		break;
+	}
+
+	scnprintf(cur, end - cur, " in %pS", r->fn);
+	/* The exact offset won't match, remove it; also strip module name. */
+	cur = strchr(expect[0], '+');
+	if (cur)
+		*cur = '\0';
+
+	/* Access information */
+	cur = expect[1];
+	end = &expect[1][sizeof(expect[1]) - 1];
+
+	switch (r->type) {
+	case KFENCE_ERROR_OOB:
+		cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r));
+		break;
+	case KFENCE_ERROR_UAF:
+		cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r));
+		break;
+	case KFENCE_ERROR_CORRUPTION:
+		cur += scnprintf(cur, end - cur, "Corrupted memory at");
+		break;
+	case KFENCE_ERROR_INVALID:
+		cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r));
+		break;
+	case KFENCE_ERROR_INVALID_FREE:
+		cur += scnprintf(cur, end - cur, "Invalid free of");
+		break;
+	}
+
+	cur += scnprintf(cur, end - cur, " 0x" PTR_FMT, (void *)r->addr);
+
+	spin_lock_irqsave(&observed.lock, flags);
+	if (!report_available())
+		goto out; /* A new report is being captured. */
+
+	/* Finally match expected output to what we actually observed. */
+	ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]);
+out:
+	spin_unlock_irqrestore(&observed.lock, flags);
+	return ret;
+}
+
+/* ===== Test cases ===== */
+
+#define TEST_PRIV_WANT_MEMCACHE ((void *)1)
+
+/* Cache used by tests; if NULL, allocate from kmalloc instead. */
+static struct kmem_cache *test_cache;
+
+static size_t setup_test_cache(struct kunit *test, size_t size, slab_flags_t flags,
+			       void (*ctor)(void *))
+{
+	if (test->priv != TEST_PRIV_WANT_MEMCACHE)
+		return size;
+
+	kunit_info(test, "%s: size=%zu, ctor=%ps\n", __func__, size, ctor);
+
+	/*
+	 * Use SLAB_NOLEAKTRACE to prevent merging with existing caches. Any
+	 * other flag in SLAB_NEVER_MERGE also works. Use SLAB_ACCOUNT to
+	 * allocate via memcg, if enabled.
+	 */
+	flags |= SLAB_NOLEAKTRACE | SLAB_ACCOUNT;
+	test_cache = kmem_cache_create("test", size, 1, flags, ctor);
+	KUNIT_ASSERT_TRUE_MSG(test, test_cache, "could not create cache");
+
+	return size;
+}
+
+static void test_cache_destroy(void)
+{
+	if (!test_cache)
+		return;
+
+	kmem_cache_destroy(test_cache);
+	test_cache = NULL;
+}
+
+static inline size_t kmalloc_cache_alignment(size_t size)
+{
+	return kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]->align;
+}
+
+/* Must always inline to match stack trace against caller. */
+static __always_inline void test_free(void *ptr)
+{
+	if (test_cache)
+		kmem_cache_free(test_cache, ptr);
+	else
+		kfree(ptr);
+}
+
+/*
+ * If this should be a KFENCE allocation, and on which side the allocation and
+ * the closest guard page should be.
+ */
+enum allocation_policy {
+	ALLOCATE_ANY, /* KFENCE, any side. */
+	ALLOCATE_LEFT, /* KFENCE, left side of page. */
+	ALLOCATE_RIGHT, /* KFENCE, right side of page. */
+	ALLOCATE_NONE, /* No KFENCE allocation. */
+};
+
+/*
+ * Try to get a guarded allocation from KFENCE. Uses either kmalloc() or the
+ * current test_cache if set up.
+ */
+static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocation_policy policy)
+{
+	void *alloc;
+	unsigned long timeout, resched_after;
+	const char *policy_name;
+
+	switch (policy) {
+	case ALLOCATE_ANY:
+		policy_name = "any";
+		break;
+	case ALLOCATE_LEFT:
+		policy_name = "left";
+		break;
+	case ALLOCATE_RIGHT:
+		policy_name = "right";
+		break;
+	case ALLOCATE_NONE:
+		policy_name = "none";
+		break;
+	}
+
+	kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp,
+		   policy_name, !!test_cache);
+
+	/*
+	 * 100x the sample interval should be more than enough to ensure we get
+	 * a KFENCE allocation eventually.
+	 */
+	timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+	/*
+	 * Especially for non-preemption kernels, ensure the allocation-gate
+	 * timer can catch up: after @resched_after, every failed allocation
+	 * attempt yields, to ensure the allocation-gate timer is scheduled.
+	 */
+	resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL);
+	do {
+		if (test_cache)
+			alloc = kmem_cache_alloc(test_cache, gfp);
+		else
+			alloc = kmalloc(size, gfp);
+
+		if (is_kfence_address(alloc)) {
+			struct page *page = virt_to_head_page(alloc);
+			struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)];
+
+			/*
+			 * Verify that various helpers return the right values
+			 * even for KFENCE objects; these are required so that
+			 * memcg accounting works correctly.
+			 */
+			KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U);
+			KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1);
+
+			if (policy == ALLOCATE_ANY)
+				return alloc;
+			if (policy == ALLOCATE_LEFT && IS_ALIGNED((unsigned long)alloc, PAGE_SIZE))
+				return alloc;
+			if (policy == ALLOCATE_RIGHT &&
+			    !IS_ALIGNED((unsigned long)alloc, PAGE_SIZE))
+				return alloc;
+		} else if (policy == ALLOCATE_NONE)
+			return alloc;
+
+		test_free(alloc);
+
+		if (time_after(jiffies, resched_after))
+			cond_resched();
+	} while (time_before(jiffies, timeout));
+
+	KUNIT_ASSERT_TRUE_MSG(test, false, "failed to allocate from KFENCE");
+	return NULL; /* Unreachable. */
+}
+
+static void test_out_of_bounds_read(struct kunit *test)
+{
+	size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_OOB,
+		.fn = test_out_of_bounds_read,
+		.is_write = false,
+	};
+	char *buf;
+
+	setup_test_cache(test, size, 0, NULL);
+
+	/*
+	 * If we don't have our own cache, adjust based on alignment, so that we
+	 * actually access guard pages on either side.
+	 */
+	if (!test_cache)
+		size = kmalloc_cache_alignment(size);
+
+	/* Test both sides. */
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+	expect.addr = buf - 1;
+	READ_ONCE(*expect.addr);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+	test_free(buf);
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+	expect.addr = buf + size;
+	READ_ONCE(*expect.addr);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+	test_free(buf);
+}
+
+static void test_out_of_bounds_write(struct kunit *test)
+{
+	size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_OOB,
+		.fn = test_out_of_bounds_write,
+		.is_write = true,
+	};
+	char *buf;
+
+	setup_test_cache(test, size, 0, NULL);
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+	expect.addr = buf - 1;
+	WRITE_ONCE(*expect.addr, 42);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+	test_free(buf);
+}
+
+static void test_use_after_free_read(struct kunit *test)
+{
+	const size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_UAF,
+		.fn = test_use_after_free_read,
+		.is_write = false,
+	};
+
+	setup_test_cache(test, size, 0, NULL);
+	expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	test_free(expect.addr);
+	READ_ONCE(*expect.addr);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_double_free(struct kunit *test)
+{
+	const size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_INVALID_FREE,
+		.fn = test_double_free,
+	};
+
+	setup_test_cache(test, size, 0, NULL);
+	expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	test_free(expect.addr);
+	test_free(expect.addr); /* Double-free. */
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_invalid_addr_free(struct kunit *test)
+{
+	const size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_INVALID_FREE,
+		.fn = test_invalid_addr_free,
+	};
+	char *buf;
+
+	setup_test_cache(test, size, 0, NULL);
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	expect.addr = buf + 1; /* Free on invalid address. */
+	test_free(expect.addr); /* Invalid address free. */
+	test_free(buf); /* No error. */
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static void test_corruption(struct kunit *test)
+{
+	size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_CORRUPTION,
+		.fn = test_corruption,
+	};
+	char *buf;
+
+	setup_test_cache(test, size, 0, NULL);
+
+	/* Test both sides. */
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+	expect.addr = buf + size;
+	WRITE_ONCE(*expect.addr, 42);
+	test_free(buf);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+	expect.addr = buf - 1;
+	WRITE_ONCE(*expect.addr, 42);
+	test_free(buf);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * KFENCE is unable to detect an OOB if the allocation's alignment requirements
+ * leave a gap between the object and the guard page. Specifically, an
+ * allocation of e.g. 73 bytes is aligned on 8 and 128 bytes for SLUB or SLAB
+ * respectively. Therefore it is impossible for the allocated object to
+ * contiguously line up with the right guard page.
+ *
+ * However, we test that an access to memory beyond the gap results in KFENCE
+ * detecting an OOB access.
+ */
+static void test_kmalloc_aligned_oob_read(struct kunit *test)
+{
+	const size_t size = 73;
+	const size_t align = kmalloc_cache_alignment(size);
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_OOB,
+		.fn = test_kmalloc_aligned_oob_read,
+		.is_write = false,
+	};
+	char *buf;
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+
+	/*
+	 * The object is offset to the right, so there won't be an OOB to the
+	 * left of it.
+	 */
+	READ_ONCE(*(buf - 1));
+	KUNIT_EXPECT_FALSE(test, report_available());
+
+	/*
+	 * @buf must be aligned on @align, therefore buf + size belongs to the
+	 * same page -> no OOB.
+	 */
+	READ_ONCE(*(buf + size));
+	KUNIT_EXPECT_FALSE(test, report_available());
+
+	/* Overflowing by @align bytes will result in an OOB. */
+	expect.addr = buf + size + align;
+	READ_ONCE(*expect.addr);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+
+	test_free(buf);
+}
+
+static void test_kmalloc_aligned_oob_write(struct kunit *test)
+{
+	const size_t size = 73;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_CORRUPTION,
+		.fn = test_kmalloc_aligned_oob_write,
+	};
+	char *buf;
+
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+	/*
+	 * The object is offset to the right, so we won't get a page
+	 * fault immediately after it.
+	 */
+	expect.addr = buf + size;
+	WRITE_ONCE(*expect.addr, READ_ONCE(*expect.addr) + 1);
+	KUNIT_EXPECT_FALSE(test, report_available());
+	test_free(buf);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test cache shrinking and destroying with KFENCE. */
+static void test_shrink_memcache(struct kunit *test)
+{
+	const size_t size = 32;
+	void *buf;
+
+	setup_test_cache(test, size, 0, NULL);
+	KUNIT_EXPECT_TRUE(test, test_cache);
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	kmem_cache_shrink(test_cache);
+	test_free(buf);
+
+	KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+static void ctor_set_x(void *obj)
+{
+	/* Every object has at least 8 bytes. */
+	memset(obj, 'x', 8);
+}
+
+/* Ensure that SL*B does not modify KFENCE objects on bulk free. */
+static void test_free_bulk(struct kunit *test)
+{
+	int iter;
+
+	for (iter = 0; iter < 5; iter++) {
+		const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0,
+						     (iter & 1) ? ctor_set_x : NULL);
+		void *objects[] = {
+			test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT),
+			test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+			test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT),
+			test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+			test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+		};
+
+		kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects);
+		KUNIT_ASSERT_FALSE(test, report_available());
+		test_cache_destroy();
+	}
+}
+
+/* Test init-on-free works. */
+static void test_init_on_free(struct kunit *test)
+{
+	const size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_UAF,
+		.fn = test_init_on_free,
+		.is_write = false,
+	};
+	int i;
+
+	if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON))
+		return;
+	/* Assume it hasn't been disabled on command line. */
+
+	setup_test_cache(test, size, 0, NULL);
+	expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	for (i = 0; i < size; i++)
+		expect.addr[i] = i + 1;
+	test_free(expect.addr);
+
+	for (i = 0; i < size; i++) {
+		/*
+		 * This may fail if the page was recycled by KFENCE and then
+		 * written to again -- this however, is near impossible with a
+		 * default config.
+		 */
+		KUNIT_EXPECT_EQ(test, expect.addr[i], (char)0);
+
+		if (!i) /* Only check first access to not fail test if page is ever re-protected. */
+			KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+	}
+}
+
+/* Ensure that constructors work properly. */
+static void test_memcache_ctor(struct kunit *test)
+{
+	const size_t size = 32;
+	char *buf;
+	int i;
+
+	setup_test_cache(test, size, 0, ctor_set_x);
+	buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+
+	for (i = 0; i < 8; i++)
+		KUNIT_EXPECT_EQ(test, buf[i], (char)'x');
+
+	test_free(buf);
+
+	KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/* Test that memory is zeroed if requested. */
+static void test_gfpzero(struct kunit *test)
+{
+	const size_t size = PAGE_SIZE; /* PAGE_SIZE so we can use ALLOCATE_ANY. */
+	char *buf1, *buf2;
+	int i;
+
+	if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) {
+		kunit_warn(test, "skipping ... would take too long\n");
+		return;
+	}
+
+	setup_test_cache(test, size, 0, NULL);
+	buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	for (i = 0; i < size; i++)
+		buf1[i] = i + 1;
+	test_free(buf1);
+
+	/* Try to get same address again -- this can take a while. */
+	for (i = 0;; i++) {
+		buf2 = test_alloc(test, size, GFP_KERNEL | __GFP_ZERO, ALLOCATE_ANY);
+		if (buf1 == buf2)
+			break;
+		test_free(buf2);
+
+		if (i == CONFIG_KFENCE_NUM_OBJECTS) {
+			kunit_warn(test, "giving up ... cannot get same object back\n");
+			return;
+		}
+	}
+
+	for (i = 0; i < size; i++)
+		KUNIT_EXPECT_EQ(test, buf2[i], (char)0);
+
+	test_free(buf2);
+
+	KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+static void test_invalid_access(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.type = KFENCE_ERROR_INVALID,
+		.fn = test_invalid_access,
+		.addr = &__kfence_pool[10],
+		.is_write = false,
+	};
+
+	READ_ONCE(__kfence_pool[10]);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test SLAB_TYPESAFE_BY_RCU works. */
+static void test_memcache_typesafe_by_rcu(struct kunit *test)
+{
+	const size_t size = 32;
+	struct expect_report expect = {
+		.type = KFENCE_ERROR_UAF,
+		.fn = test_memcache_typesafe_by_rcu,
+		.is_write = false,
+	};
+
+	setup_test_cache(test, size, SLAB_TYPESAFE_BY_RCU, NULL);
+	KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */
+
+	expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+	*expect.addr = 42;
+
+	rcu_read_lock();
+	test_free(expect.addr);
+	KUNIT_EXPECT_EQ(test, *expect.addr, (char)42);
+	/*
+	 * Up to this point, memory should not have been freed yet, and
+	 * therefore there should be no KFENCE report from the above access.
+	 */
+	rcu_read_unlock();
+
+	/* Above access to @expect.addr should not have generated a report! */
+	KUNIT_EXPECT_FALSE(test, report_available());
+
+	/* Only after rcu_barrier() is the memory guaranteed to be freed. */
+	rcu_barrier();
+
+	/* Expect use-after-free. */
+	KUNIT_EXPECT_EQ(test, *expect.addr, (char)42);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test krealloc(). */
+static void test_krealloc(struct kunit *test)
+{
+	const size_t size = 32;
+	const struct expect_report expect = {
+		.type = KFENCE_ERROR_UAF,
+		.fn = test_krealloc,
+		.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY),
+		.is_write = false,
+	};
+	char *buf = expect.addr;
+	int i;
+
+	KUNIT_EXPECT_FALSE(test, test_cache);
+	KUNIT_EXPECT_EQ(test, ksize(buf), size); /* Precise size match after KFENCE alloc. */
+	for (i = 0; i < size; i++)
+		buf[i] = i + 1;
+
+	/* Check that we successfully change the size. */
+	buf = krealloc(buf, size * 3, GFP_KERNEL); /* Grow. */
+	/* Note: Might no longer be a KFENCE alloc. */
+	KUNIT_EXPECT_GE(test, ksize(buf), size * 3);
+	for (i = 0; i < size; i++)
+		KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1));
+	for (; i < size * 3; i++) /* Fill to extra bytes. */
+		buf[i] = i + 1;
+
+	buf = krealloc(buf, size * 2, GFP_KERNEL); /* Shrink. */
+	KUNIT_EXPECT_GE(test, ksize(buf), size * 2);
+	for (i = 0; i < size * 2; i++)
+		KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1));
+
+	buf = krealloc(buf, 0, GFP_KERNEL); /* Free. */
+	KUNIT_EXPECT_EQ(test, (unsigned long)buf, (unsigned long)ZERO_SIZE_PTR);
+	KUNIT_ASSERT_FALSE(test, report_available()); /* No reports yet! */
+
+	READ_ONCE(*expect.addr); /* Ensure krealloc() actually freed earlier KFENCE object. */
+	KUNIT_ASSERT_TRUE(test, report_matches(&expect));
+}
+
+/* Test that some objects from a bulk allocation belong to KFENCE pool. */
+static void test_memcache_alloc_bulk(struct kunit *test)
+{
+	const size_t size = 32;
+	bool pass = false;
+	unsigned long timeout;
+
+	setup_test_cache(test, size, 0, NULL);
+	KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */
+	/*
+	 * 100x the sample interval should be more than enough to ensure we get
+	 * a KFENCE allocation eventually.
+	 */
+	timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+	do {
+		void *objects[100];
+		int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),
+						   objects);
+		if (!num)
+			continue;
+		for (i = 0; i < ARRAY_SIZE(objects); i++) {
+			if (is_kfence_address(objects[i])) {
+				pass = true;
+				break;
+			}
+		}
+		kmem_cache_free_bulk(test_cache, num, objects);
+		/*
+		 * kmem_cache_alloc_bulk() disables interrupts, and calling it
+		 * in a tight loop may not give KFENCE a chance to switch the
+		 * static branch. Call cond_resched() to let KFENCE chime in.
+		 */
+		cond_resched();
+	} while (!pass && time_before(jiffies, timeout));
+
+	KUNIT_EXPECT_TRUE(test, pass);
+	KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/*
+ * KUnit does not provide a way to provide arguments to tests, and we encode
+ * additional info in the name. Set up 2 tests per test case, one using the
+ * default allocator, and another using a custom memcache (suffix '-memcache').
+ */
+#define KFENCE_KUNIT_CASE(test_name)						\
+	{ .run_case = test_name, .name = #test_name },				\
+	{ .run_case = test_name, .name = #test_name "-memcache" }
+
+static struct kunit_case kfence_test_cases[] = {
+	KFENCE_KUNIT_CASE(test_out_of_bounds_read),
+	KFENCE_KUNIT_CASE(test_out_of_bounds_write),
+	KFENCE_KUNIT_CASE(test_use_after_free_read),
+	KFENCE_KUNIT_CASE(test_double_free),
+	KFENCE_KUNIT_CASE(test_invalid_addr_free),
+	KFENCE_KUNIT_CASE(test_corruption),
+	KFENCE_KUNIT_CASE(test_free_bulk),
+	KFENCE_KUNIT_CASE(test_init_on_free),
+	KUNIT_CASE(test_kmalloc_aligned_oob_read),
+	KUNIT_CASE(test_kmalloc_aligned_oob_write),
+	KUNIT_CASE(test_shrink_memcache),
+	KUNIT_CASE(test_memcache_ctor),
+	KUNIT_CASE(test_invalid_access),
+	KUNIT_CASE(test_gfpzero),
+	KUNIT_CASE(test_memcache_typesafe_by_rcu),
+	KUNIT_CASE(test_krealloc),
+	KUNIT_CASE(test_memcache_alloc_bulk),
+	{},
+};
+
+/* ===== End test cases ===== */
+
+static int test_init(struct kunit *test)
+{
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&observed.lock, flags);
+	for (i = 0; i < ARRAY_SIZE(observed.lines); i++)
+		observed.lines[i][0] = '\0';
+	observed.nlines = 0;
+	spin_unlock_irqrestore(&observed.lock, flags);
+
+	/* Any test with 'memcache' in its name will want a memcache. */
+	if (strstr(test->name, "memcache"))
+		test->priv = TEST_PRIV_WANT_MEMCACHE;
+	else
+		test->priv = NULL;
+
+	return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+	test_cache_destroy();
+}
+
+static struct kunit_suite kfence_test_suite = {
+	.name = "kfence",
+	.test_cases = kfence_test_cases,
+	.init = test_init,
+	.exit = test_exit,
+};
+static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL };
+
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+	check_trace_callback_type_console(probe_console);
+	if (!strcmp(tp->name, "console"))
+		WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+	if (!strcmp(tp->name, "console"))
+		tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+/*
+ * We only want to do tracepoints setup and teardown once, therefore we have to
+ * customize the init and exit functions and cannot rely on kunit_test_suite().
+ */
+static int __init kfence_test_init(void)
+{
+	/*
+	 * Because we want to be able to build the test as a module, we need to
+	 * iterate through all known tracepoints, since the static registration
+	 * won't work here.
+	 */
+	for_each_kernel_tracepoint(register_tracepoints, NULL);
+	return __kunit_test_suites_init(kfence_test_suites);
+}
+
+static void kfence_test_exit(void)
+{
+	__kunit_test_suites_exit(kfence_test_suites);
+	for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+	tracepoint_synchronize_unregister();
+}
+
+late_initcall(kfence_test_init);
+module_exit(kfence_test_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Alexander Potapenko <glider@google.com>, Marco Elver <elver@google.com>");
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 4dbfa9a382e4..901bd7ee83d8 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -156,7 +156,12 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show,
 	pr_cont(" ]");
 }
 
-void kfence_report_error(unsigned long address, struct pt_regs *regs,
+static const char *get_access_type(bool is_write)
+{
+	return is_write ? "write" : "read";
+}
+
+void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
 			 const struct kfence_metadata *meta, enum kfence_error_type type)
 {
 	unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
@@ -194,17 +199,19 @@ void kfence_report_error(unsigned long address, struct pt_regs *regs,
 	case KFENCE_ERROR_OOB: {
 		const bool left_of_object = address < meta->addr;
 
-		pr_err("BUG: KFENCE: out-of-bounds in %pS\n\n", (void *)stack_entries[skipnr]);
-		pr_err("Out-of-bounds access at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n",
-		       (void *)address,
+		pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write),
+		       (void *)stack_entries[skipnr]);
+		pr_err("Out-of-bounds %s at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n",
+		       get_access_type(is_write), (void *)address,
 		       left_of_object ? meta->addr - address : address - meta->addr,
 		       left_of_object ? "left" : "right", object_index);
 		break;
 	}
 	case KFENCE_ERROR_UAF:
-		pr_err("BUG: KFENCE: use-after-free in %pS\n\n", (void *)stack_entries[skipnr]);
-		pr_err("Use-after-free access at 0x" PTR_FMT " (in kfence-#%zd):\n",
-		       (void *)address, object_index);
+		pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write),
+		       (void *)stack_entries[skipnr]);
+		pr_err("Use-after-free %s at 0x" PTR_FMT " (in kfence-#%zd):\n",
+		       get_access_type(is_write), (void *)address, object_index);
 		break;
 	case KFENCE_ERROR_CORRUPTION:
 		pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
@@ -213,8 +220,10 @@ void kfence_report_error(unsigned long address, struct pt_regs *regs,
 		pr_cont(" (in kfence-#%zd):\n", object_index);
 		break;
 	case KFENCE_ERROR_INVALID:
-		pr_err("BUG: KFENCE: invalid access in %pS\n\n", (void *)stack_entries[skipnr]);
-		pr_err("Invalid access at 0x" PTR_FMT ":\n", (void *)address);
+		pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write),
+		       (void *)stack_entries[skipnr]);
+		pr_err("Invalid %s at 0x" PTR_FMT ":\n", get_access_type(is_write),
+		       (void *)address);
 		break;
 	case KFENCE_ERROR_INVALID_FREE:
 		pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
-- 
cgit v1.2.3


From 928501344fc645f80390afc12708c81b3595745d Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 25 Feb 2021 17:19:55 -0800
Subject: kasan, mm: don't save alloc stacks twice

Patch series "kasan: optimizations and fixes for HW_TAGS", v4.

This patchset makes the HW_TAGS mode more efficient, mostly by reworking
poisoning approaches and simplifying/inlining some internal helpers.

With this change, the overhead of HW_TAGS annotations excluding setting
and checking memory tags is ~3%.  The performance impact caused by tags
will be unknown until we have hardware that supports MTE.

As a side-effect, this patchset speeds up generic KASAN by ~15%.

This patch (of 13):

Currently KASAN saves allocation stacks in both kasan_slab_alloc() and
kasan_kmalloc() annotations.  This patch changes KASAN to save allocation
stacks for slab objects from kmalloc caches in kasan_kmalloc() only, and
stacks for other slab objects in kasan_slab_alloc() only.

This change requires ____kasan_kmalloc() knowing whether the object
belongs to a kmalloc cache.  This is implemented by adding a flag field to
the kasan_info structure.  That flag is only set for kmalloc caches via a
new kasan_cache_create_kmalloc() annotation.

Link: https://lkml.kernel.org/r/cover.1612546384.git.andreyknvl@google.com
Link: https://lkml.kernel.org/r/7c673ebca8d00f40a7ad6f04ab9a2bddeeae2097.1612546384.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h |  9 +++++++++
 mm/kasan/common.c     | 18 ++++++++++++++----
 mm/slab_common.c      |  1 +
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 7eaf2d9effb4..3fb31e8a353e 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -83,6 +83,7 @@ static inline void kasan_disable_current(void) {}
 struct kasan_cache {
 	int alloc_meta_offset;
 	int free_meta_offset;
+	bool is_kmalloc;
 };
 
 #ifdef CONFIG_KASAN_HW_TAGS
@@ -143,6 +144,13 @@ static __always_inline void kasan_cache_create(struct kmem_cache *cache,
 		__kasan_cache_create(cache, size, flags);
 }
 
+void __kasan_cache_create_kmalloc(struct kmem_cache *cache);
+static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache)
+{
+	if (kasan_enabled())
+		__kasan_cache_create_kmalloc(cache);
+}
+
 size_t __kasan_metadata_size(struct kmem_cache *cache);
 static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache)
 {
@@ -278,6 +286,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {}
 static inline void kasan_cache_create(struct kmem_cache *cache,
 				      unsigned int *size,
 				      slab_flags_t *flags) {}
+static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {}
 static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
 static inline void kasan_poison_slab(struct page *page) {}
 static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index af1768c4fee5..d8d83ca56fe2 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -210,6 +210,11 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
 		*size = optimal_size;
 }
 
+void __kasan_cache_create_kmalloc(struct kmem_cache *cache)
+{
+	cache->kasan_info.is_kmalloc = true;
+}
+
 size_t __kasan_metadata_size(struct kmem_cache *cache)
 {
 	if (!kasan_stack_collection_enabled())
@@ -394,17 +399,22 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
 	}
 }
 
-static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
+static void set_alloc_info(struct kmem_cache *cache, void *object,
+				gfp_t flags, bool is_kmalloc)
 {
 	struct kasan_alloc_meta *alloc_meta;
 
+	/* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */
+	if (cache->kasan_info.is_kmalloc && !is_kmalloc)
+		return;
+
 	alloc_meta = kasan_get_alloc_meta(cache, object);
 	if (alloc_meta)
 		kasan_set_track(&alloc_meta->alloc_track, flags);
 }
 
 static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
-				size_t size, gfp_t flags, bool keep_tag)
+				size_t size, gfp_t flags, bool is_kmalloc)
 {
 	unsigned long redzone_start;
 	unsigned long redzone_end;
@@ -423,7 +433,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
 				KASAN_GRANULE_SIZE);
 	redzone_end = round_up((unsigned long)object + cache->object_size,
 				KASAN_GRANULE_SIZE);
-	tag = assign_tag(cache, object, false, keep_tag);
+	tag = assign_tag(cache, object, false, is_kmalloc);
 
 	/* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */
 	kasan_unpoison(set_tag(object, tag), size);
@@ -431,7 +441,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
 			   KASAN_KMALLOC_REDZONE);
 
 	if (kasan_stack_collection_enabled())
-		set_alloc_info(cache, (void *)object, flags);
+		set_alloc_info(cache, (void *)object, flags, is_kmalloc);
 
 	return set_tag(object, tag);
 }
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 284954ef1da5..897c3a446b04 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -643,6 +643,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
 		panic("Out of memory when creating slab %s\n", name);
 
 	create_boot_cache(s, name, size, flags, useroffset, usersize);
+	kasan_cache_create_kmalloc(s);
 	list_add(&s->list, &slab_caches);
 	s->refcount = 1;
 	return s;
-- 
cgit v1.2.3


From 200072ce33b298cf14d3ed2a570f5eb27609677d Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 25 Feb 2021 17:20:11 -0800
Subject: kasan: unify large kfree checks

Unify checks in kasan_kfree_large() and in kasan_slab_free_mempool() for
large allocations as it's done for small kfree() allocations.

With this change, kasan_slab_free_mempool() starts checking that the first
byte of the memory that's being freed is accessible.

Link: https://lkml.kernel.org/r/14ffc4cd867e0b1ed58f7527e3b748a1b4ad08aa.1612546384.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 16 ++++++++--------
 mm/kasan/common.c     | 36 ++++++++++++++++++++++++++----------
 2 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 3fb31e8a353e..b91732bd05d7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -200,6 +200,13 @@ static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object)
 	return false;
 }
 
+void __kasan_kfree_large(void *ptr, unsigned long ip);
+static __always_inline void kasan_kfree_large(void *ptr)
+{
+	if (kasan_enabled())
+		__kasan_kfree_large(ptr, _RET_IP_);
+}
+
 void __kasan_slab_free_mempool(void *ptr, unsigned long ip);
 static __always_inline void kasan_slab_free_mempool(void *ptr)
 {
@@ -247,13 +254,6 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
 	return (void *)object;
 }
 
-void __kasan_kfree_large(void *ptr, unsigned long ip);
-static __always_inline void kasan_kfree_large(void *ptr)
-{
-	if (kasan_enabled())
-		__kasan_kfree_large(ptr, _RET_IP_);
-}
-
 /*
  * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for
  * the hardware tag-based mode that doesn't rely on compiler instrumentation.
@@ -302,6 +302,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
 {
 	return false;
 }
+static inline void kasan_kfree_large(void *ptr) {}
 static inline void kasan_slab_free_mempool(void *ptr) {}
 static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
 				   gfp_t flags)
@@ -322,7 +323,6 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
 {
 	return (void *)object;
 }
-static inline void kasan_kfree_large(void *ptr) {}
 static inline bool kasan_check_byte(const void *address)
 {
 	return true;
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 48d51daeda95..8a3d66393dc5 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -364,6 +364,31 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
 	return ____kasan_slab_free(cache, object, ip, true);
 }
 
+static bool ____kasan_kfree_large(void *ptr, unsigned long ip)
+{
+	if (ptr != page_address(virt_to_head_page(ptr))) {
+		kasan_report_invalid_free(ptr, ip);
+		return true;
+	}
+
+	if (!kasan_byte_accessible(ptr)) {
+		kasan_report_invalid_free(ptr, ip);
+		return true;
+	}
+
+	/*
+	 * The object will be poisoned by kasan_free_pages() or
+	 * kasan_slab_free_mempool().
+	 */
+
+	return false;
+}
+
+void __kasan_kfree_large(void *ptr, unsigned long ip)
+{
+	____kasan_kfree_large(ptr, ip);
+}
+
 void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
 {
 	struct page *page;
@@ -377,10 +402,8 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
 	 * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
 	 */
 	if (unlikely(!PageSlab(page))) {
-		if (ptr != page_address(page)) {
-			kasan_report_invalid_free(ptr, ip);
+		if (____kasan_kfree_large(ptr, ip))
 			return;
-		}
 		kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE);
 	} else {
 		____kasan_slab_free(page->slab_cache, ptr, ip, false);
@@ -539,13 +562,6 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
 		return ____kasan_kmalloc(page->slab_cache, object, size, flags);
 }
 
-void __kasan_kfree_large(void *ptr, unsigned long ip)
-{
-	if (ptr != page_address(virt_to_head_page(ptr)))
-		kasan_report_invalid_free(ptr, ip);
-	/* The object will be poisoned by kasan_free_pages(). */
-}
-
 bool __kasan_check_byte(const void *address, unsigned long ip)
 {
 	if (!kasan_byte_accessible(address)) {
-- 
cgit v1.2.3


From df54714f579a77662054132161612ce3da876b0d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 25 Feb 2021 17:20:56 -0800
Subject: include/linux: remove repeated words

Drop the doubled word "for" in a comment. {firewire-cdev.h}
Drop the doubled word "in" in a comment. {input.h}
Drop the doubled word "a" in a comment. {mdev.h}
Drop the doubled word "the" in a comment. {ptrace.h}

Link: https://lkml.kernel.org/r/20210126232444.22861-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Kirti Wankhede <kwankhede@nvidia.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mdev.h               | 2 +-
 include/linux/ptrace.h             | 2 +-
 include/uapi/linux/firewire-cdev.h | 2 +-
 include/uapi/linux/input.h         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 9004375c462e..27eb383cb95d 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -42,7 +42,7 @@ struct device *mdev_get_iommu_device(struct device *dev);
  *			@mdev: mdev_device structure on of mediated device
  *			      that is being created
  *			Returns integer: success (0) or error (< 0)
- * @remove:		Called to free resources in parent device's driver for a
+ * @remove:		Called to free resources in parent device's driver for
  *			a mediated device. It is mandatory to provide 'remove'
  *			ops.
  *			@mdev: mdev_device device structure which is being
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 2a9df80ea887..b5ebf6c01292 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -171,7 +171,7 @@ static inline void ptrace_event(int event, unsigned long message)
  *
  * Check whether @event is enabled and, if so, report @event and @pid
  * to the ptrace parent.  @pid is reported as the pid_t seen from the
- * the ptrace parent's pid namespace.
+ * ptrace parent's pid namespace.
  *
  * Called without locks.
  */
diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h
index 7e5b5c10a49c..5effa9832802 100644
--- a/include/uapi/linux/firewire-cdev.h
+++ b/include/uapi/linux/firewire-cdev.h
@@ -844,7 +844,7 @@ struct fw_cdev_queue_iso {
  * struct fw_cdev_start_iso - Start an isochronous transmission or reception
  * @cycle:	Cycle in which to start I/O.  If @cycle is greater than or
  *		equal to 0, the I/O will start on that cycle.
- * @sync:	Determines the value to wait for for receive packets that have
+ * @sync:	Determines the value to wait for receive packets that have
  *		the %FW_CDEV_ISO_SYNC bit set
  * @tags:	Tag filter bit mask.  Only valid for isochronous reception.
  *		Determines the tag values for which packets will be accepted.
diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index 9a61c28ed3ae..ee3127461ee0 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -84,7 +84,7 @@ struct input_id {
  * in units per radian.
  * When INPUT_PROP_ACCELEROMETER is set the resolution changes.
  * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in
- * in units per g (units/g) and in units per degree per second
+ * units per g (units/g) and in units per degree per second
  * (units/deg/s) for rotational axes (ABS_RX, ABS_RY, ABS_RZ).
  */
 struct input_absinfo {
-- 
cgit v1.2.3


From c131bd0b5448bb577b7a9ed48c4e528807e8d5af Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Thu, 25 Feb 2021 17:21:00 -0800
Subject: treewide: Miguel has moved

Update contact info.

Link: https://lkml.kernel.org/r/20210206162524.GA11520@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .mailmap                                            |  1 +
 CREDITS                                             |  9 +++------
 Documentation/admin-guide/auxdisplay/cfag12864b.rst |  2 +-
 Documentation/admin-guide/auxdisplay/ks0108.rst     |  2 +-
 MAINTAINERS                                         | 12 ++++++------
 drivers/auxdisplay/cfag12864b.c                     |  4 ++--
 drivers/auxdisplay/cfag12864bfb.c                   |  4 ++--
 drivers/auxdisplay/ks0108.c                         |  4 ++--
 include/linux/cfag12864b.h                          |  2 +-
 include/linux/ks0108.h                              |  2 +-
 samples/auxdisplay/cfag12864b-example.c             |  2 +-
 11 files changed, 21 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/.mailmap b/.mailmap
index 87a8bbdbf749..85b93cdefc87 100644
--- a/.mailmap
+++ b/.mailmap
@@ -237,6 +237,7 @@ Maxime Ripard <mripard@kernel.org> <maxime.ripard@free-electrons.com>
 Mayuresh Janorkar <mayur@ti.com>
 Michael Buesch <m@bues.ch>
 Michel Dänzer <michel@tungstengraphics.com>
+Miguel Ojeda <ojeda@kernel.org> <miguel.ojeda.sandonis@gmail.com>
 Mike Rapoport <rppt@kernel.org> <mike@compulab.co.il>
 Mike Rapoport <rppt@kernel.org> <mike.rapoport@gmail.com>
 Mike Rapoport <rppt@kernel.org> <rppt@linux.ibm.com>
diff --git a/CREDITS b/CREDITS
index be097156bd71..cef83b958cbe 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2841,14 +2841,11 @@ S: Subiaco, 6008
 S: Perth, Western Australia
 S: Australia
 
-N: Miguel Ojeda Sandonis
-E: miguel.ojeda.sandonis@gmail.com
-W: http://miguelojeda.es
-W: http://jair.lab.fi.uva.es/~migojed/
+N: Miguel Ojeda
+E: ojeda@kernel.org
+W: https://ojeda.dev
 D: Author of the ks0108, cfag12864b and cfag12864bfb auxiliary display drivers.
 D: Maintainer of the auxiliary display drivers tree (drivers/auxdisplay/*)
-S: C/ Mieses 20, 9-B
-S: Valladolid 47009
 S: Spain
 
 N: Peter Oruba
diff --git a/Documentation/admin-guide/auxdisplay/cfag12864b.rst b/Documentation/admin-guide/auxdisplay/cfag12864b.rst
index 18c2865bd322..da385d851acc 100644
--- a/Documentation/admin-guide/auxdisplay/cfag12864b.rst
+++ b/Documentation/admin-guide/auxdisplay/cfag12864b.rst
@@ -3,7 +3,7 @@ cfag12864b LCD Driver Documentation
 ===================================
 
 :License:		GPLv2
-:Author & Maintainer:	Miguel Ojeda Sandonis
+:Author & Maintainer:	Miguel Ojeda <ojeda@kernel.org>
 :Date:			2006-10-27
 
 
diff --git a/Documentation/admin-guide/auxdisplay/ks0108.rst b/Documentation/admin-guide/auxdisplay/ks0108.rst
index c0b7faf73136..a7d3fe509373 100644
--- a/Documentation/admin-guide/auxdisplay/ks0108.rst
+++ b/Documentation/admin-guide/auxdisplay/ks0108.rst
@@ -3,7 +3,7 @@ ks0108 LCD Controller Driver Documentation
 ==========================================
 
 :License:		GPLv2
-:Author & Maintainer:	Miguel Ojeda Sandonis
+:Author & Maintainer:	Miguel Ojeda <ojeda@kernel.org>
 :Date:			2006-10-27
 
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 40040db747fc..e42082eccf36 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2982,7 +2982,7 @@ F:	include/uapi/linux/audit.h
 F:	kernel/audit*
 
 AUXILIARY DISPLAY DRIVERS
-M:	Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	drivers/auxdisplay/
 F:	include/linux/cfag12864b.h
@@ -4128,13 +4128,13 @@ F:	scripts/extract-cert.c
 F:	scripts/sign-file.c
 
 CFAG12864B LCD DRIVER
-M:	Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	drivers/auxdisplay/cfag12864b.c
 F:	include/linux/cfag12864b.h
 
 CFAG12864BFB LCD FRAMEBUFFER DRIVER
-M:	Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	drivers/auxdisplay/cfag12864bfb.c
 F:	include/linux/cfag12864b.h
@@ -4304,7 +4304,7 @@ S:	Supported
 F:	drivers/infiniband/hw/usnic/
 
 CLANG-FORMAT FILE
-M:	Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	.clang-format
 
@@ -4444,7 +4444,7 @@ S:	Maintained
 F:	drivers/platform/x86/compal-laptop.c
 
 COMPILER ATTRIBUTES
-M:	Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	include/linux/compiler_attributes.h
 
@@ -9939,7 +9939,7 @@ F:	include/linux/kprobes.h
 F:	kernel/kprobes.c
 
 KS0108 LCD CONTROLLER DRIVER
-M:	Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
+M:	Miguel Ojeda <ojeda@kernel.org>
 S:	Maintained
 F:	Documentation/admin-guide/auxdisplay/ks0108.rst
 F:	drivers/auxdisplay/ks0108.c
diff --git a/drivers/auxdisplay/cfag12864b.c b/drivers/auxdisplay/cfag12864b.c
index 7eebae7e322c..fd430e6866a1 100644
--- a/drivers/auxdisplay/cfag12864b.c
+++ b/drivers/auxdisplay/cfag12864b.c
@@ -5,7 +5,7 @@
  * Description: cfag12864b LCD driver
  *     Depends: ks0108
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-31
  */
 
@@ -376,5 +376,5 @@ module_init(cfag12864b_init);
 module_exit(cfag12864b_exit);
 
 MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>");
+MODULE_AUTHOR("Miguel Ojeda <ojeda@kernel.org>");
 MODULE_DESCRIPTION("cfag12864b LCD driver");
diff --git a/drivers/auxdisplay/cfag12864bfb.c b/drivers/auxdisplay/cfag12864bfb.c
index 2002291ab338..d66821adf453 100644
--- a/drivers/auxdisplay/cfag12864bfb.c
+++ b/drivers/auxdisplay/cfag12864bfb.c
@@ -5,7 +5,7 @@
  * Description: cfag12864b LCD framebuffer driver
  *     Depends: cfag12864b
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-31
  */
 
@@ -171,5 +171,5 @@ module_init(cfag12864bfb_init);
 module_exit(cfag12864bfb_exit);
 
 MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>");
+MODULE_AUTHOR("Miguel Ojeda <ojeda@kernel.org>");
 MODULE_DESCRIPTION("cfag12864b LCD framebuffer driver");
diff --git a/drivers/auxdisplay/ks0108.c b/drivers/auxdisplay/ks0108.c
index abfe3fa9e6f4..03c95ad4216c 100644
--- a/drivers/auxdisplay/ks0108.c
+++ b/drivers/auxdisplay/ks0108.c
@@ -5,7 +5,7 @@
  * Description: ks0108 LCD Controller driver
  *     Depends: parport
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-31
  */
 
@@ -182,6 +182,6 @@ module_init(ks0108_init);
 module_exit(ks0108_exit);
 
 MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>");
+MODULE_AUTHOR("Miguel Ojeda <ojeda@kernel.org>");
 MODULE_DESCRIPTION("ks0108 LCD Controller driver");
 
diff --git a/include/linux/cfag12864b.h b/include/linux/cfag12864b.h
index 4060004968c8..6617d9c68d86 100644
--- a/include/linux/cfag12864b.h
+++ b/include/linux/cfag12864b.h
@@ -4,7 +4,7 @@
  *     Version: 0.1.0
  * Description: cfag12864b LCD driver header
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-12
  */
 
diff --git a/include/linux/ks0108.h b/include/linux/ks0108.h
index 0738389b42b6..1a37a664f915 100644
--- a/include/linux/ks0108.h
+++ b/include/linux/ks0108.h
@@ -4,7 +4,7 @@
  *     Version: 0.1.0
  * Description: ks0108 LCD Controller driver header
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-31
  */
 
diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c
index bfeab44f81d0..2e3bb7375c99 100644
--- a/samples/auxdisplay/cfag12864b-example.c
+++ b/samples/auxdisplay/cfag12864b-example.c
@@ -4,7 +4,7 @@
  *     Version: 0.1.0
  * Description: cfag12864b LCD userspace example program
  *
- *      Author: Copyright (C) Miguel Ojeda Sandonis
+ *      Author: Copyright (C) Miguel Ojeda <ojeda@kernel.org>
  *        Date: 2006-10-31
  */
 
-- 
cgit v1.2.3


From c1f26493ed7f363c63e0e9d91e50d4db26df6603 Mon Sep 17 00:00:00 2001
From: Hubert Jasudowicz <hubert.jasudowicz@gmail.com>
Date: Thu, 25 Feb 2021 17:21:03 -0800
Subject: groups: use flexible-array member in struct group_info

Replace zero-size array with flexible array member, as recommended by
the docs.

Link: https://lkml.kernel.org/r/155995eed35c3c1bdcc56e69d8997c8e4c46740a.1611620846.git.hubert.jasudowicz@gmail.com
Signed-off-by: Hubert Jasudowicz <hubert.jasudowicz@gmail.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Micah Morton <mortonm@chromium.org>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Michael Kelley <mikelley@microsoft.com>
Cc: Thomas Cedeno <thomascedeno@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cred.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 18639c069263..4c6350503697 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -25,7 +25,7 @@ struct inode;
 struct group_info {
 	atomic_t	usage;
 	int		ngroups;
-	kgid_t		gid[0];
+	kgid_t		gid[];
 } __randomize_layout;
 
 /**
-- 
cgit v1.2.3


From a28a6e860c6cf231cf3c5171c75c342adcd00406 Mon Sep 17 00:00:00 2001
From: Francis Laniel <laniel_francis@privacyrequired.com>
Date: Thu, 25 Feb 2021 17:21:20 -0800
Subject: string.h: move fortified functions definitions in a dedicated header.

This patch adds fortify-string.h to contain fortified functions
definitions.  Thus, the code is more separated and compile time is
approximately 1% faster for people who do not set CONFIG_FORTIFY_SOURCE.

Link: https://lkml.kernel.org/r/20210111092141.22946-1-laniel_francis@privacyrequired.com
Link: https://lkml.kernel.org/r/20210111092141.22946-2-laniel_francis@privacyrequired.com
Signed-off-by: Francis Laniel <laniel_francis@privacyrequired.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fortify-string.h | 302 +++++++++++++++++++++++++++++++++++++++++
 include/linux/string.h         | 282 +-------------------------------------
 2 files changed, 303 insertions(+), 281 deletions(-)
 create mode 100644 include/linux/fortify-string.h

(limited to 'include/linux')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
new file mode 100644
index 000000000000..c1be37437e77
--- /dev/null
+++ b/include/linux/fortify-string.h
@@ -0,0 +1,302 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FORTIFY_STRING_H_
+#define _LINUX_FORTIFY_STRING_H_
+
+
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
+extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
+extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
+extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
+extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
+extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
+extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
+extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
+extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
+extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);
+#else
+#define __underlying_memchr	__builtin_memchr
+#define __underlying_memcmp	__builtin_memcmp
+#define __underlying_memcpy	__builtin_memcpy
+#define __underlying_memmove	__builtin_memmove
+#define __underlying_memset	__builtin_memset
+#define __underlying_strcat	__builtin_strcat
+#define __underlying_strcpy	__builtin_strcpy
+#define __underlying_strlen	__builtin_strlen
+#define __underlying_strncat	__builtin_strncat
+#define __underlying_strncpy	__builtin_strncpy
+#endif
+
+__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 1);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__write_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __underlying_strncpy(p, q, size);
+}
+
+__FORTIFY_INLINE char *strcat(char *p, const char *q)
+{
+	size_t p_size = __builtin_object_size(p, 1);
+
+	if (p_size == (size_t)-1)
+		return __underlying_strcat(p, q);
+	if (strlcat(p, q, p_size) >= p_size)
+		fortify_panic(__func__);
+	return p;
+}
+
+__FORTIFY_INLINE __kernel_size_t strlen(const char *p)
+{
+	__kernel_size_t ret;
+	size_t p_size = __builtin_object_size(p, 1);
+
+	/* Work around gcc excess stack consumption issue */
+	if (p_size == (size_t)-1 ||
+		(__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0'))
+		return __underlying_strlen(p);
+	ret = strnlen(p, p_size);
+	if (p_size <= ret)
+		fortify_panic(__func__);
+	return ret;
+}
+
+extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
+__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen)
+{
+	size_t p_size = __builtin_object_size(p, 1);
+	__kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
+
+	if (p_size <= ret && maxlen != ret)
+		fortify_panic(__func__);
+	return ret;
+}
+
+/* defined after fortified strlen to reuse it */
+extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
+__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
+{
+	size_t ret;
+	size_t p_size = __builtin_object_size(p, 1);
+	size_t q_size = __builtin_object_size(q, 1);
+
+	if (p_size == (size_t)-1 && q_size == (size_t)-1)
+		return __real_strlcpy(p, q, size);
+	ret = strlen(q);
+	if (size) {
+		size_t len = (ret >= size) ? size - 1 : ret;
+
+		if (__builtin_constant_p(len) && len >= p_size)
+			__write_overflow();
+		if (len >= p_size)
+			fortify_panic(__func__);
+		__underlying_memcpy(p, q, len);
+		p[len] = '\0';
+	}
+	return ret;
+}
+
+/* defined after fortified strnlen to reuse it */
+extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy);
+__FORTIFY_INLINE ssize_t strscpy(char *p, const char *q, size_t size)
+{
+	size_t len;
+	/* Use string size rather than possible enclosing struct size. */
+	size_t p_size = __builtin_object_size(p, 1);
+	size_t q_size = __builtin_object_size(q, 1);
+
+	/* If we cannot get size of p and q default to call strscpy. */
+	if (p_size == (size_t) -1 && q_size == (size_t) -1)
+		return __real_strscpy(p, q, size);
+
+	/*
+	 * If size can be known at compile time and is greater than
+	 * p_size, generate a compile time write overflow error.
+	 */
+	if (__builtin_constant_p(size) && size > p_size)
+		__write_overflow();
+
+	/*
+	 * This call protects from read overflow, because len will default to q
+	 * length if it smaller than size.
+	 */
+	len = strnlen(q, size);
+	/*
+	 * If len equals size, we will copy only size bytes which leads to
+	 * -E2BIG being returned.
+	 * Otherwise we will copy len + 1 because of the final '\O'.
+	 */
+	len = len == size ? size : len + 1;
+
+	/*
+	 * Generate a runtime write overflow error if len is greater than
+	 * p_size.
+	 */
+	if (len > p_size)
+		fortify_panic(__func__);
+
+	/*
+	 * We can now safely call vanilla strscpy because we are protected from:
+	 * 1. Read overflow thanks to call to strnlen().
+	 * 2. Write overflow thanks to above ifs.
+	 */
+	return __real_strscpy(p, q, len);
+}
+
+/* defined after fortified strlen and strnlen to reuse them */
+__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count)
+{
+	size_t p_len, copy_len;
+	size_t p_size = __builtin_object_size(p, 1);
+	size_t q_size = __builtin_object_size(q, 1);
+
+	if (p_size == (size_t)-1 && q_size == (size_t)-1)
+		return __underlying_strncat(p, q, count);
+	p_len = strlen(p);
+	copy_len = strnlen(q, count);
+	if (p_size < p_len + copy_len + 1)
+		fortify_panic(__func__);
+	__underlying_memcpy(p + p_len, q, copy_len);
+	p[p_len + copy_len] = '\0';
+	return p;
+}
+
+__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__write_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __underlying_memset(p, c, size);
+}
+
+__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+	size_t q_size = __builtin_object_size(q, 0);
+
+	if (__builtin_constant_p(size)) {
+		if (p_size < size)
+			__write_overflow();
+		if (q_size < size)
+			__read_overflow2();
+	}
+	if (p_size < size || q_size < size)
+		fortify_panic(__func__);
+	return __underlying_memcpy(p, q, size);
+}
+
+__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+	size_t q_size = __builtin_object_size(q, 0);
+
+	if (__builtin_constant_p(size)) {
+		if (p_size < size)
+			__write_overflow();
+		if (q_size < size)
+			__read_overflow2();
+	}
+	if (p_size < size || q_size < size)
+		fortify_panic(__func__);
+	return __underlying_memmove(p, q, size);
+}
+
+extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
+__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__read_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __real_memscan(p, c, size);
+}
+
+__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+	size_t q_size = __builtin_object_size(q, 0);
+
+	if (__builtin_constant_p(size)) {
+		if (p_size < size)
+			__read_overflow();
+		if (q_size < size)
+			__read_overflow2();
+	}
+	if (p_size < size || q_size < size)
+		fortify_panic(__func__);
+	return __underlying_memcmp(p, q, size);
+}
+
+__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__read_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __underlying_memchr(p, c, size);
+}
+
+void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
+__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__read_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __real_memchr_inv(p, c, size);
+}
+
+extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup);
+__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp)
+{
+	size_t p_size = __builtin_object_size(p, 0);
+
+	if (__builtin_constant_p(size) && p_size < size)
+		__read_overflow();
+	if (p_size < size)
+		fortify_panic(__func__);
+	return __real_kmemdup(p, size, gfp);
+}
+
+/* defined after fortified strlen and memcpy to reuse them */
+__FORTIFY_INLINE char *strcpy(char *p, const char *q)
+{
+	size_t p_size = __builtin_object_size(p, 1);
+	size_t q_size = __builtin_object_size(q, 1);
+	size_t size;
+
+	if (p_size == (size_t)-1 && q_size == (size_t)-1)
+		return __underlying_strcpy(p, q);
+	size = strlen(q) + 1;
+	/* test here to use the more stringent object size */
+	if (p_size < size)
+		fortify_panic(__func__);
+	memcpy(p, q, size);
+	return p;
+}
+
+/* Don't use these outside the FORITFY_SOURCE implementation */
+#undef __underlying_memchr
+#undef __underlying_memcmp
+#undef __underlying_memcpy
+#undef __underlying_memmove
+#undef __underlying_memset
+#undef __underlying_strcat
+#undef __underlying_strcpy
+#undef __underlying_strlen
+#undef __underlying_strncat
+#undef __underlying_strncpy
+
+#endif /* _LINUX_FORTIFY_STRING_H_ */
diff --git a/include/linux/string.h b/include/linux/string.h
index 4fcfb56abcf5..9521d8cab18e 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -266,287 +266,7 @@ void __read_overflow3(void) __compiletime_error("detected read beyond size of ob
 void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter");
 
 #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
-
-#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
-extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
-extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
-extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
-extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
-extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
-extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
-extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
-extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
-extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
-extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);
-#else
-#define __underlying_memchr	__builtin_memchr
-#define __underlying_memcmp	__builtin_memcmp
-#define __underlying_memcpy	__builtin_memcpy
-#define __underlying_memmove	__builtin_memmove
-#define __underlying_memset	__builtin_memset
-#define __underlying_strcat	__builtin_strcat
-#define __underlying_strcpy	__builtin_strcpy
-#define __underlying_strlen	__builtin_strlen
-#define __underlying_strncat	__builtin_strncat
-#define __underlying_strncpy	__builtin_strncpy
-#endif
-
-__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 1);
-	if (__builtin_constant_p(size) && p_size < size)
-		__write_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __underlying_strncpy(p, q, size);
-}
-
-__FORTIFY_INLINE char *strcat(char *p, const char *q)
-{
-	size_t p_size = __builtin_object_size(p, 1);
-	if (p_size == (size_t)-1)
-		return __underlying_strcat(p, q);
-	if (strlcat(p, q, p_size) >= p_size)
-		fortify_panic(__func__);
-	return p;
-}
-
-__FORTIFY_INLINE __kernel_size_t strlen(const char *p)
-{
-	__kernel_size_t ret;
-	size_t p_size = __builtin_object_size(p, 1);
-
-	/* Work around gcc excess stack consumption issue */
-	if (p_size == (size_t)-1 ||
-	    (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0'))
-		return __underlying_strlen(p);
-	ret = strnlen(p, p_size);
-	if (p_size <= ret)
-		fortify_panic(__func__);
-	return ret;
-}
-
-extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
-__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen)
-{
-	size_t p_size = __builtin_object_size(p, 1);
-	__kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
-	if (p_size <= ret && maxlen != ret)
-		fortify_panic(__func__);
-	return ret;
-}
-
-/* defined after fortified strlen to reuse it */
-extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
-__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
-{
-	size_t ret;
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
-	if (p_size == (size_t)-1 && q_size == (size_t)-1)
-		return __real_strlcpy(p, q, size);
-	ret = strlen(q);
-	if (size) {
-		size_t len = (ret >= size) ? size - 1 : ret;
-		if (__builtin_constant_p(len) && len >= p_size)
-			__write_overflow();
-		if (len >= p_size)
-			fortify_panic(__func__);
-		__underlying_memcpy(p, q, len);
-		p[len] = '\0';
-	}
-	return ret;
-}
-
-/* defined after fortified strnlen to reuse it */
-extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy);
-__FORTIFY_INLINE ssize_t strscpy(char *p, const char *q, size_t size)
-{
-	size_t len;
-	/* Use string size rather than possible enclosing struct size. */
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
-
-	/* If we cannot get size of p and q default to call strscpy. */
-	if (p_size == (size_t) -1 && q_size == (size_t) -1)
-		return __real_strscpy(p, q, size);
-
-	/*
-	 * If size can be known at compile time and is greater than
-	 * p_size, generate a compile time write overflow error.
-	 */
-	if (__builtin_constant_p(size) && size > p_size)
-		__write_overflow();
-
-	/*
-	 * This call protects from read overflow, because len will default to q
-	 * length if it smaller than size.
-	 */
-	len = strnlen(q, size);
-	/*
-	 * If len equals size, we will copy only size bytes which leads to
-	 * -E2BIG being returned.
-	 * Otherwise we will copy len + 1 because of the final '\O'.
-	 */
-	len = len == size ? size : len + 1;
-
-	/*
-	 * Generate a runtime write overflow error if len is greater than
-	 * p_size.
-	 */
-	if (len > p_size)
-		fortify_panic(__func__);
-
-	/*
-	 * We can now safely call vanilla strscpy because we are protected from:
-	 * 1. Read overflow thanks to call to strnlen().
-	 * 2. Write overflow thanks to above ifs.
-	 */
-	return __real_strscpy(p, q, len);
-}
-
-/* defined after fortified strlen and strnlen to reuse them */
-__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count)
-{
-	size_t p_len, copy_len;
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
-	if (p_size == (size_t)-1 && q_size == (size_t)-1)
-		return __underlying_strncat(p, q, count);
-	p_len = strlen(p);
-	copy_len = strnlen(q, count);
-	if (p_size < p_len + copy_len + 1)
-		fortify_panic(__func__);
-	__underlying_memcpy(p + p_len, q, copy_len);
-	p[p_len + copy_len] = '\0';
-	return p;
-}
-
-__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	if (__builtin_constant_p(size) && p_size < size)
-		__write_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __underlying_memset(p, c, size);
-}
-
-__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	size_t q_size = __builtin_object_size(q, 0);
-	if (__builtin_constant_p(size)) {
-		if (p_size < size)
-			__write_overflow();
-		if (q_size < size)
-			__read_overflow2();
-	}
-	if (p_size < size || q_size < size)
-		fortify_panic(__func__);
-	return __underlying_memcpy(p, q, size);
-}
-
-__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	size_t q_size = __builtin_object_size(q, 0);
-	if (__builtin_constant_p(size)) {
-		if (p_size < size)
-			__write_overflow();
-		if (q_size < size)
-			__read_overflow2();
-	}
-	if (p_size < size || q_size < size)
-		fortify_panic(__func__);
-	return __underlying_memmove(p, q, size);
-}
-
-extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
-__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	if (__builtin_constant_p(size) && p_size < size)
-		__read_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __real_memscan(p, c, size);
-}
-
-__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	size_t q_size = __builtin_object_size(q, 0);
-	if (__builtin_constant_p(size)) {
-		if (p_size < size)
-			__read_overflow();
-		if (q_size < size)
-			__read_overflow2();
-	}
-	if (p_size < size || q_size < size)
-		fortify_panic(__func__);
-	return __underlying_memcmp(p, q, size);
-}
-
-__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	if (__builtin_constant_p(size) && p_size < size)
-		__read_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __underlying_memchr(p, c, size);
-}
-
-void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
-__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	if (__builtin_constant_p(size) && p_size < size)
-		__read_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __real_memchr_inv(p, c, size);
-}
-
-extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup);
-__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp)
-{
-	size_t p_size = __builtin_object_size(p, 0);
-	if (__builtin_constant_p(size) && p_size < size)
-		__read_overflow();
-	if (p_size < size)
-		fortify_panic(__func__);
-	return __real_kmemdup(p, size, gfp);
-}
-
-/* defined after fortified strlen and memcpy to reuse them */
-__FORTIFY_INLINE char *strcpy(char *p, const char *q)
-{
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
-	size_t size;
-	if (p_size == (size_t)-1 && q_size == (size_t)-1)
-		return __underlying_strcpy(p, q);
-	size = strlen(q) + 1;
-	/* test here to use the more stringent object size */
-	if (p_size < size)
-		fortify_panic(__func__);
-	memcpy(p, q, size);
-	return p;
-}
-
-/* Don't use these outside the FORITFY_SOURCE implementation */
-#undef __underlying_memchr
-#undef __underlying_memcmp
-#undef __underlying_memcpy
-#undef __underlying_memmove
-#undef __underlying_memset
-#undef __underlying_strcat
-#undef __underlying_strcpy
-#undef __underlying_strlen
-#undef __underlying_strncat
-#undef __underlying_strncpy
+#include <linux/fortify-string.h>
 #endif
 
 /**
-- 
cgit v1.2.3


From e1fdc403349c64fa58f4c163f4bf9b860b4db808 Mon Sep 17 00:00:00 2001
From: Vijayanand Jitta <vjitta@codeaurora.org>
Date: Thu, 25 Feb 2021 17:21:27 -0800
Subject: lib: stackdepot: add support to disable stack depot

Add a kernel parameter stack_depot_disable to disable stack depot.  So
that stack hash table doesn't consume any memory when stack depot is
disabled.

The use case is CONFIG_PAGE_OWNER without page_owner=on.  Without this
patch, stackdepot will consume the memory for the hashtable.  By default,
it's 8M which is never trivial.

With this option, in CONFIG_PAGE_OWNER configured system, page_owner=off,
stack_depot_disable in kernel command line, we could save the wasted
memory for the hashtable.

[akpm@linux-foundation.org: fix CONFIG_STACKDEPOT=n build]

Link: https://lkml.kernel.org/r/1611749198-24316-2-git-send-email-vjitta@codeaurora.org
Signed-off-by: Vinayak Menon <vinmenon@codeaurora.org>
Signed-off-by: Vijayanand Jitta <vjitta@codeaurora.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Yogesh Lal <ylal@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  6 +++++
 include/linux/stackdepot.h                      |  9 +++++++
 init/main.c                                     |  2 ++
 lib/stackdepot.c                                | 32 +++++++++++++++++++++----
 4 files changed, 45 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bab6a8b01202..04545725f187 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5182,6 +5182,12 @@
 			growing up) the main stack are reserved for no other
 			mapping. Default value is 256 pages.
 
+	stack_depot_disable= [KNL]
+			Setting this to true through kernel command line will
+			disable the stack depot thereby saving the static memory
+			consumed by the stack hash table. By default this is set
+			to false.
+
 	stacktrace	[FTRACE]
 			Enabled the stack tracer on boot up.
 
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 24d49c732341..6bb4bc1a5f54 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -21,4 +21,13 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
 
 unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
 
+#ifdef CONFIG_STACKDEPOT
+int stack_depot_init(void);
+#else
+static inline int stack_depot_init(void)
+{
+	return 0;
+}
+#endif	/* CONFIG_STACKDEPOT */
+
 #endif
diff --git a/init/main.c b/init/main.c
index 261051070e3c..3648c9f94882 100644
--- a/init/main.c
+++ b/init/main.c
@@ -97,6 +97,7 @@
 #include <linux/mem_encrypt.h>
 #include <linux/kcsan.h>
 #include <linux/init_syscalls.h>
+#include <linux/stackdepot.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -827,6 +828,7 @@ static void __init mm_init(void)
 	init_mem_debugging_and_hardening();
 	kfence_alloc_pool();
 	report_meminit();
+	stack_depot_init();
 	mem_init();
 	/* page_owner must be initialized after buddy is ready */
 	page_ext_init_flatmem_late();
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 4b9715470e87..cc21116512a7 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -31,6 +31,7 @@
 #include <linux/stackdepot.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/memblock.h>
 
 #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8)
 
@@ -145,9 +146,32 @@ static struct stack_record *depot_alloc_stack(unsigned long *entries, int size,
 #define STACK_HASH_MASK (STACK_HASH_SIZE - 1)
 #define STACK_HASH_SEED 0x9747b28c
 
-static struct stack_record *stack_table[STACK_HASH_SIZE] = {
-	[0 ...	STACK_HASH_SIZE - 1] = NULL
-};
+static bool stack_depot_disable;
+static struct stack_record **stack_table;
+
+static int __init is_stack_depot_disabled(char *str)
+{
+	kstrtobool(str, &stack_depot_disable);
+	if (stack_depot_disable) {
+		pr_info("Stack Depot is disabled\n");
+		stack_table = NULL;
+	}
+	return 0;
+}
+early_param("stack_depot_disable", is_stack_depot_disabled);
+
+int __init stack_depot_init(void)
+{
+	if (!stack_depot_disable) {
+		size_t size = (STACK_HASH_SIZE * sizeof(struct stack_record *));
+		int i;
+
+		stack_table = memblock_alloc(size, size);
+		for (i = 0; i < STACK_HASH_SIZE;  i++)
+			stack_table[i] = NULL;
+	}
+	return 0;
+}
 
 /* Calculate hash for a stack */
 static inline u32 hash_stack(unsigned long *entries, unsigned int size)
@@ -241,7 +265,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 	unsigned long flags;
 	u32 hash;
 
-	if (unlikely(nr_entries == 0))
+	if (unlikely(nr_entries == 0) || stack_depot_disable)
 		goto fast_exit;
 
 	hash = hash_stack(entries, nr_entries);
-- 
cgit v1.2.3


From 4945cca232ce8bc699b8743f2436af664c471b96 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 25 Feb 2021 17:21:37 -0800
Subject: include/linux/bitops.h: spelling s/synomyn/synonym/

Fix a misspelling of "synonym".

Link: https://lkml.kernel.org/r/20210108105305.2028120-1-geert+renesas@glider.be
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitops.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index a61f192c096b..a5a48303b0f1 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -214,7 +214,7 @@ static inline int get_count_order_long(unsigned long l)
  * __ffs64 - find first set bit in a 64 bit word
  * @word: The 64 bit word
  *
- * On 64 bit arches this is a synomyn for __ffs
+ * On 64 bit arches this is a synonym for __ffs
  * The result is not defined if no bits are set, so check that @word
  * is non-zero before calling this.
  */
-- 
cgit v1.2.3


From a5a673f7312253a842f3da8c60c980461cc269ec Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 25 Feb 2021 17:22:15 -0800
Subject: init: clean up early_param_on_off() macro

Use early_param() to define early_param_on_off().

Link: https://lkml.kernel.org/r/20210201041532.4025025-1-masahiroy@kernel.org
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Johan Hovold <johan@kernel.org>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Joe Perches <joe@perches.com>
Cc: Nick Desaulniers <ndesaulniers@gooogle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/init.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index a01f01c1a5c5..31f54de58429 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -338,14 +338,14 @@ struct obs_kernel_param {
 		var = 1;						\
 		return 0;						\
 	}								\
-	__setup_param(str_on, parse_##var##_on, parse_##var##_on, 1);	\
+	early_param(str_on, parse_##var##_on);				\
 									\
 	static int __init parse_##var##_off(char *arg)			\
 	{								\
 		var = 0;						\
 		return 0;						\
 	}								\
-	__setup_param(str_off, parse_##var##_off, parse_##var##_off, 1)
+	early_param(str_off, parse_##var##_off)
 
 /* Relies on boot_command_line being set */
 void __init parse_early_param(void);
-- 
cgit v1.2.3


From d54ce6158e354f5358a547b96299ecd7f3725393 Mon Sep 17 00:00:00 2001
From: Sumit Garg <sumit.garg@linaro.org>
Date: Thu, 25 Feb 2021 17:22:38 -0800
Subject: kgdb: fix to kill breakpoints on initmem after boot

Currently breakpoints in kernel .init.text section are not handled
correctly while allowing to remove them even after corresponding pages
have been freed.

Fix it via killing .init.text section breakpoints just prior to initmem
pages being freed.

Doug: "HW breakpoints aren't handled by this patch but it's probably
not such a big deal".

Link: https://lkml.kernel.org/r/20210224081652.587785-1-sumit.garg@linaro.org
Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
Suggested-by: Doug Anderson <dianders@chromium.org>
Acked-by: Doug Anderson <dianders@chromium.org>
Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
Tested-by: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kgdb.h      |  2 ++
 init/main.c               |  1 +
 kernel/debug/debug_core.c | 11 +++++++++++
 3 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 0444b44bd156..392a3670944c 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -359,9 +359,11 @@ extern atomic_t			kgdb_active;
 extern bool dbg_is_early;
 extern void __init dbg_late_init(void);
 extern void kgdb_panic(const char *msg);
+extern void kgdb_free_init_mem(void);
 #else /* ! CONFIG_KGDB */
 #define in_dbg_master() (0)
 #define dbg_late_init()
 static inline void kgdb_panic(const char *msg) {}
+static inline void kgdb_free_init_mem(void) { }
 #endif /* ! CONFIG_KGDB */
 #endif /* _KGDB_H_ */
diff --git a/init/main.c b/init/main.c
index 3648c9f94882..53b278845b88 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1426,6 +1426,7 @@ static int __ref kernel_init(void *unused)
 	async_synchronize_full();
 	kprobe_free_init_mem();
 	ftrace_free_init_mem();
+	kgdb_free_init_mem();
 	free_initmem();
 	mark_readonly();
 
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index b636d517c02c..4708aec492df 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -455,6 +455,17 @@ setundefined:
 	return 0;
 }
 
+void kgdb_free_init_mem(void)
+{
+	int i;
+
+	/* Clear init memory breakpoints. */
+	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+		if (init_section_contains((void *)kgdb_break[i].bpt_addr, 0))
+			kgdb_break[i].state = BP_UNDEFINED;
+	}
+}
+
 #ifdef CONFIG_KGDB_KDB
 void kdb_dump_stack_on_cpu(int cpu)
 {
-- 
cgit v1.2.3


From f685a533a7fab35c5d069dcd663f59c8e4171a75 Mon Sep 17 00:00:00 2001
From: Huang Pei <huangpei@loongson.cn>
Date: Thu, 25 Feb 2021 17:22:49 -0800
Subject: MIPS: make userspace mapping young by default

MIPS page fault path(except huge page) takes 3 exceptions (1 TLB Miss + 2
TLB Invalid), butthe second TLB Invalid exception is just triggered by
__update_tlb from do_page_fault writing tlb without _PAGE_VALID set.  With
this patch, user space mapping prot is made young by default (with both
_PAGE_VALID and _PAGE_YOUNG set), and it only take 1 TLB Miss + 1 TLB
Invalid exception

Remove pte_sw_mkyoung without polluting MM code and make page fault delay
of MIPS on par with other architecture

Link: https://lkml.kernel.org/r/20210204013942.8398-1-huangpei@loongson.cn
Signed-off-by: Huang Pei <huangpei@loongson.cn>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: <huangpei@loongson.cn>
Acked-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: <ambrosehua@gmail.com>
Cc: Bibo Mao <maobibo@loongson.cn>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: Paul Burton <paulburton@kernel.org>
Cc: Li Xuefeng <lixuefeng@loongson.cn>
Cc: Yang Tiezhu <yangtiezhu@loongson.cn>
Cc: Gao Juxin <gaojuxin@loongson.cn>
Cc: Fuxin Zhang <zhangfx@lemote.com>
Cc: Huacai Chen <chenhc@lemote.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/mm/cache.c    | 30 ++++++++++++++++--------------
 include/linux/pgtable.h |  8 --------
 mm/memory.c             |  4 ----
 3 files changed, 16 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index 1754498b0717..7719d632df8d 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -157,29 +157,31 @@ unsigned long _page_cachable_default;
 EXPORT_SYMBOL(_page_cachable_default);
 
 #define PM(p)	__pgprot(_page_cachable_default | (p))
+#define PVA(p)	PM(_PAGE_VALID | _PAGE_ACCESSED | (p))
 
 static inline void setup_protection_map(void)
 {
 	protection_map[0]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
-	protection_map[1]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC);
-	protection_map[2]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
-	protection_map[3]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC);
-	protection_map[4]  = PM(_PAGE_PRESENT);
-	protection_map[5]  = PM(_PAGE_PRESENT);
-	protection_map[6]  = PM(_PAGE_PRESENT);
-	protection_map[7]  = PM(_PAGE_PRESENT);
+	protection_map[1]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC);
+	protection_map[2]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
+	protection_map[3]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC);
+	protection_map[4]  = PVA(_PAGE_PRESENT);
+	protection_map[5]  = PVA(_PAGE_PRESENT);
+	protection_map[6]  = PVA(_PAGE_PRESENT);
+	protection_map[7]  = PVA(_PAGE_PRESENT);
 
 	protection_map[8]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ);
-	protection_map[9]  = PM(_PAGE_PRESENT | _PAGE_NO_EXEC);
-	protection_map[10] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE |
+	protection_map[9]  = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC);
+	protection_map[10] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE |
 				_PAGE_NO_READ);
-	protection_map[11] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE);
-	protection_map[12] = PM(_PAGE_PRESENT);
-	protection_map[13] = PM(_PAGE_PRESENT);
-	protection_map[14] = PM(_PAGE_PRESENT | _PAGE_WRITE);
-	protection_map[15] = PM(_PAGE_PRESENT | _PAGE_WRITE);
+	protection_map[11] = PVA(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE);
+	protection_map[12] = PVA(_PAGE_PRESENT);
+	protection_map[13] = PVA(_PAGE_PRESENT);
+	protection_map[14] = PVA(_PAGE_PRESENT);
+	protection_map[15] = PVA(_PAGE_PRESENT);
 }
 
+#undef _PVA
 #undef PM
 
 void cpu_cache_init(void)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 36eb748f3c97..cdfc4e9f253e 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -432,14 +432,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
  * To be differentiate with macro pte_mkyoung, this macro is used on platforms
  * where software maintains page access bit.
  */
-#ifndef pte_sw_mkyoung
-static inline pte_t pte_sw_mkyoung(pte_t pte)
-{
-	return pte;
-}
-#define pte_sw_mkyoung	pte_sw_mkyoung
-#endif
-
 #ifndef pte_savedwrite
 #define pte_savedwrite pte_write
 #endif
diff --git a/mm/memory.c b/mm/memory.c
index 784249f3307b..c8e357627318 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2902,7 +2902,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
-		entry = pte_sw_mkyoung(entry);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
 		/*
@@ -3560,7 +3559,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	__SetPageUptodate(page);
 
 	entry = mk_pte(page, vma->vm_page_prot);
-	entry = pte_sw_mkyoung(entry);
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
@@ -3745,8 +3743,6 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 
 	if (prefault && arch_wants_old_prefaulted_pte())
 		entry = pte_mkold(entry);
-	else
-		entry = pte_sw_mkyoung(entry);
 
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-- 
cgit v1.2.3


From 5f7136db82996089cdfb2939c7664b29e9da141d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 29 Jan 2021 04:38:57 +0000
Subject: block: Add bio_max_segs

It's often inconvenient to use BIO_MAX_PAGES due to min() requiring the
sign to be the same.  Introduce bio_max_segs() and change BIO_MAX_PAGES to
be unsigned to make it easier for the users.

Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-map.c                     |  4 +---
 drivers/block/xen-blkback/blkback.c |  4 +---
 drivers/md/dm-io.c                  |  4 ++--
 drivers/md/dm-log-writes.c          | 10 +++++-----
 drivers/nvme/target/io-cmd-bdev.c   |  8 ++++----
 drivers/nvme/target/passthru.c      |  4 ++--
 drivers/target/target_core_iblock.c |  9 +++------
 drivers/target/target_core_pscsi.c  |  2 +-
 fs/block_dev.c                      | 10 +++++-----
 fs/direct-io.c                      |  2 +-
 fs/erofs/data.c                     |  4 +---
 fs/ext4/readpage.c                  |  3 +--
 fs/f2fs/data.c                      |  3 +--
 fs/f2fs/node.c                      |  2 +-
 fs/iomap/buffered-io.c              |  4 ++--
 fs/mpage.c                          |  4 +---
 fs/nfs/blocklayout/blocklayout.c    |  6 +++---
 fs/xfs/xfs_bio_io.c                 |  2 +-
 fs/xfs/xfs_buf.c                    |  4 ++--
 include/linux/bio.h                 |  7 ++++++-
 20 files changed, 44 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-map.c b/block/blk-map.c
index 21630dccac62..369e204d14d0 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -150,9 +150,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
 	bmd->is_our_pages = !map_data;
 	bmd->is_null_mapped = (map_data && map_data->null_mapped);
 
-	nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
-	if (nr_pages > BIO_MAX_PAGES)
-		nr_pages = BIO_MAX_PAGES;
+	nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE));
 
 	ret = -ENOMEM;
 	bio = bio_kmalloc(gfp_mask, nr_pages);
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index da16121140ca..1cdf09ff67b6 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -1326,9 +1326,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 				     pages[i]->page,
 				     seg[i].nsec << 9,
 				     seg[i].offset) == 0)) {
-
-			int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
-			bio = bio_alloc(GFP_KERNEL, nr_iovecs);
+			bio = bio_alloc(GFP_KERNEL, bio_max_segs(nseg - i));
 			if (unlikely(bio == NULL))
 				goto fail_put_bio;
 
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 4312007d2d34..2d3cda0acacb 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -341,8 +341,8 @@ static void do_region(int op, int op_flags, unsigned region,
 			num_bvecs = 1;
 			break;
 		default:
-			num_bvecs = min_t(int, BIO_MAX_PAGES,
-					  dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+			num_bvecs = bio_max_segs(dm_sector_div_up(remaining,
+						(PAGE_SIZE >> SECTOR_SHIFT)));
 		}
 
 		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &io->client->bios);
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index e3d35c6c9f71..57882654ffee 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -264,15 +264,14 @@ static int write_inline_data(struct log_writes_c *lc, void *entry,
 			     size_t entrylen, void *data, size_t datalen,
 			     sector_t sector)
 {
-	int num_pages, bio_pages, pg_datalen, pg_sectorlen, i;
+	int bio_pages, pg_datalen, pg_sectorlen, i;
 	struct page *page;
 	struct bio *bio;
 	size_t ret;
 	void *ptr;
 
 	while (datalen) {
-		num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT;
-		bio_pages = min(num_pages, BIO_MAX_PAGES);
+		bio_pages = bio_max_segs(DIV_ROUND_UP(datalen, PAGE_SIZE));
 
 		atomic_inc(&lc->io_blocks);
 
@@ -364,7 +363,7 @@ static int log_one_block(struct log_writes_c *lc,
 		goto out;
 
 	atomic_inc(&lc->io_blocks);
-	bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
+	bio = bio_alloc(GFP_KERNEL, bio_max_segs(block->vec_cnt));
 	if (!bio) {
 		DMERR("Couldn't alloc log bio");
 		goto error;
@@ -386,7 +385,8 @@ static int log_one_block(struct log_writes_c *lc,
 		if (ret != block->vecs[i].bv_len) {
 			atomic_inc(&lc->io_blocks);
 			submit_bio(bio);
-			bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt - i, BIO_MAX_PAGES));
+			bio = bio_alloc(GFP_KERNEL,
+					bio_max_segs(block->vec_cnt - i));
 			if (!bio) {
 				DMERR("Couldn't alloc log bio");
 				goto error;
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 3d9a5d3ed9cd..9a8b3726a37c 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -185,7 +185,7 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
 	}
 
 	bip = bio_integrity_alloc(bio, GFP_NOIO,
-		min_t(unsigned int, req->metadata_sg_cnt, BIO_MAX_PAGES));
+					bio_max_segs(req->metadata_sg_cnt));
 	if (IS_ERR(bip)) {
 		pr_err("Unable to allocate bio_integrity_payload\n");
 		return PTR_ERR(bip);
@@ -225,7 +225,7 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
 
 static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 {
-	int sg_cnt = req->sg_cnt;
+	unsigned int sg_cnt = req->sg_cnt;
 	struct bio *bio;
 	struct scatterlist *sg;
 	struct blk_plug plug;
@@ -262,7 +262,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 		bio = &req->b.inline_bio;
 		bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
 	} else {
-		bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
+		bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt));
 	}
 	bio_set_dev(bio, req->ns->bdev);
 	bio->bi_iter.bi_sector = sector;
@@ -289,7 +289,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 				}
 			}
 
-			bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
+			bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt));
 			bio_set_dev(bio, req->ns->bdev);
 			bio->bi_iter.bi_sector = sector;
 			bio->bi_opf = op;
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index f50c7b2bf21c..26c587ccd152 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -26,7 +26,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
 	struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl;
 	u16 status = NVME_SC_SUCCESS;
 	struct nvme_id_ctrl *id;
-	int max_hw_sectors;
+	unsigned int max_hw_sectors;
 	int page_shift;
 
 	id = kzalloc(sizeof(*id), GFP_KERNEL);
@@ -198,7 +198,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
 		bio = &req->p.inline_bio;
 		bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
 	} else {
-		bio = bio_alloc(GFP_KERNEL, min(req->sg_cnt, BIO_MAX_PAGES));
+		bio = bio_alloc(GFP_KERNEL, bio_max_segs(req->sg_cnt));
 		bio->bi_end_io = bio_put;
 	}
 	bio->bi_opf = req_op(rq);
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 8ed93fd205c7..ee3d52061281 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -315,10 +315,8 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int op,
 	 * Only allocate as many vector entries as the bio code allows us to,
 	 * we'll loop later on until we have handled the whole request.
 	 */
-	if (sg_num > BIO_MAX_PAGES)
-		sg_num = BIO_MAX_PAGES;
-
-	bio = bio_alloc_bioset(GFP_NOIO, sg_num, &ib_dev->ibd_bio_set);
+	bio = bio_alloc_bioset(GFP_NOIO, bio_max_segs(sg_num),
+				&ib_dev->ibd_bio_set);
 	if (!bio) {
 		pr_err("Unable to allocate memory for bio\n");
 		return NULL;
@@ -638,8 +636,7 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio,
 		return -ENODEV;
 	}
 
-	bip = bio_integrity_alloc(bio, GFP_NOIO,
-			min_t(unsigned int, cmd->t_prot_nents, BIO_MAX_PAGES));
+	bip = bio_integrity_alloc(bio, GFP_NOIO, bio_max_segs(cmd->t_prot_nents));
 	if (IS_ERR(bip)) {
 		pr_err("Unable to allocate bio_integrity_payload\n");
 		return PTR_ERR(bip);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 33770e5808ce..3cbc074992bc 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -881,7 +881,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 
 			if (!bio) {
 new_bio:
-				nr_vecs = min_t(int, BIO_MAX_PAGES, nr_pages);
+				nr_vecs = bio_max_segs(nr_pages);
 				nr_pages -= nr_vecs;
 				/*
 				 * Calls bio_kmalloc() and sets bio->bi_end_io()
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ec26179c8062..0f95ff343d6b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -221,7 +221,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
 
 static ssize_t
 __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
-		int nr_pages)
+		unsigned int nr_pages)
 {
 	struct file *file = iocb->ki_filp;
 	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
@@ -355,8 +355,8 @@ static void blkdev_bio_end_io(struct bio *bio)
 	}
 }
 
-static ssize_t
-__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
+static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+		unsigned int nr_pages)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
@@ -486,7 +486,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 static ssize_t
 blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
-	int nr_pages;
+	unsigned int nr_pages;
 
 	if (!iov_iter_count(iter))
 		return 0;
@@ -495,7 +495,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
 		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
 
-	return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
+	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
 }
 
 static __init int blkdev_init(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index aa1083ecd623..c9639b4166c2 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -695,7 +695,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
 	if (ret)
 		goto out;
 	sector = start_sector << (sdio->blkbits - 9);
-	nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES);
+	nr_pages = bio_max_segs(sdio->pages_in_io);
 	BUG_ON(nr_pages <= 0);
 	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
 	sdio->boundary = 0;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index ea4f693bee22..f88851c5c250 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -215,10 +215,8 @@ submit_bio_retry:
 		/* max # of continuous pages */
 		if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
 			nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
-		if (nblocks > BIO_MAX_PAGES)
-			nblocks = BIO_MAX_PAGES;
 
-		bio = bio_alloc(GFP_NOIO, nblocks);
+		bio = bio_alloc(GFP_NOIO, bio_max_segs(nblocks));
 
 		bio->bi_end_io = erofs_readendio;
 		bio_set_dev(bio, sb->s_bdev);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index f014c5e473a9..3db923403505 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -371,8 +371,7 @@ int ext4_mpage_readpages(struct inode *inode,
 			 * bio_alloc will _always_ be able to allocate a bio if
 			 * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
 			 */
-			bio = bio_alloc(GFP_KERNEL,
-				min_t(int, nr_pages, BIO_MAX_PAGES));
+			bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages));
 			fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
 						  GFP_KERNEL);
 			ext4_set_bio_post_read_ctx(bio, inode, page->index);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b9721c8f116c..7c95818639a6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -969,8 +969,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 	unsigned int post_read_steps = 0;
 
 	bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
-			       min_t(int, nr_pages, BIO_MAX_PAGES),
-			       &f2fs_bioset);
+			       bio_max_segs(nr_pages), &f2fs_bioset);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index a8a0fb890e8d..4b0e2e3c2c88 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2747,7 +2747,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
 	sum_entry = &sum->entries[0];
 
 	for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
-		nrpages = min(last_offset - i, BIO_MAX_PAGES);
+		nrpages = bio_max_segs(last_offset - i);
 
 		/* readahead node pages */
 		f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 16a1e82e3aeb..0d9d1a6a947e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -278,14 +278,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (!is_contig || bio_full(ctx->bio, plen)) {
 		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
 		gfp_t orig_gfp = gfp;
-		int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
 
 		if (ctx->bio)
 			submit_bio(ctx->bio);
 
 		if (ctx->rac) /* same as readahead_gfp_mask */
 			gfp |= __GFP_NORETRY | __GFP_NOWARN;
-		ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+		ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs));
 		/*
 		 * If the bio_alloc fails, try it again for a single page to
 		 * avoid having to deal with partial page reads.  This emulates
diff --git a/fs/mpage.c b/fs/mpage.c
index 830e6cc2a9e7..961234d68779 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -304,9 +304,7 @@ alloc_new:
 				goto out;
 		}
 		args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
-					min_t(int, args->nr_pages,
-					      BIO_MAX_PAGES),
-					gfp);
+					bio_max_segs(args->nr_pages), gfp);
 		if (args->bio == NULL)
 			goto confused;
 	}
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 1a96ce28efb0..fe860c538747 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -115,13 +115,13 @@ bl_submit_bio(struct bio *bio)
 	return NULL;
 }
 
-static struct bio *
-bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+static struct bio *bl_alloc_init_bio(unsigned int npg,
+		struct block_device *bdev, sector_t disk_sector,
 		bio_end_io_t end_io, struct parallel_io *par)
 {
 	struct bio *bio;
 
-	npg = min(npg, BIO_MAX_PAGES);
+	npg = bio_max_segs(npg);
 	bio = bio_alloc(GFP_NOIO, npg);
 	if (bio) {
 		bio->bi_iter.bi_sector = disk_sector;
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index e2148f2d5d6b..17f36db2f792 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -6,7 +6,7 @@
 
 static inline unsigned int bio_max_vecs(unsigned int count)
 {
-	return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES);
+	return bio_max_segs(howmany(count, PAGE_SIZE));
 }
 
 int
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f6e5235df7c9..37a1d12762d8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1480,7 +1480,7 @@ xfs_buf_ioapply_map(
 	int		op)
 {
 	int		page_index;
-	int		total_nr_pages = bp->b_page_count;
+	unsigned int	total_nr_pages = bp->b_page_count;
 	int		nr_pages;
 	struct bio	*bio;
 	sector_t	sector =  bp->b_maps[map].bm_bn;
@@ -1505,7 +1505,7 @@ xfs_buf_ioapply_map(
 
 next_chunk:
 	atomic_inc(&bp->b_io_remaining);
-	nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
+	nr_pages = bio_max_segs(total_nr_pages);
 
 	bio = bio_alloc(GFP_NOIO, nr_pages);
 	bio_set_dev(bio, bp->b_target->bt_bdev);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5b468f2242ff..983ed2fe7c85 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -20,7 +20,12 @@
 #define BIO_BUG_ON
 #endif
 
-#define BIO_MAX_PAGES		256
+#define BIO_MAX_PAGES		256U
+
+static inline unsigned int bio_max_segs(unsigned int nr_segs)
+{
+	return min(nr_segs, BIO_MAX_PAGES);
+}
 
 #define bio_prio(bio)			(bio)->bi_ioprio
 #define bio_set_prio(bio, prio)		((bio)->bi_ioprio = prio)
-- 
cgit v1.2.3


From 5218e12e9f3a324f41c05da4874d76d7ea3677cb Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Mon, 1 Mar 2021 17:23:25 +0100
Subject: block: Drop leftover references to RQF_SORTED

Commit a1ce35fa49852db60fc6e268038530be533c5b15 ("block: remove dead
elevator code") removed all users of RQF_SORTED. However it is still
defined, and there is one reference left to it (which in effect is
dead code). Clear it all up.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 1 -
 block/blk-mq-sched.c   | 6 +-----
 include/linux/blkdev.h | 2 --
 3 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4de03da9a624..9ebb344e2585 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -292,7 +292,6 @@ static const char *const cmd_flag_name[] = {
 
 #define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name
 static const char *const rqf_name[] = {
-	RQF_NAME(SORTED),
 	RQF_NAME(STARTED),
 	RQF_NAME(SOFTBARRIER),
 	RQF_NAME(FLUSH_SEQ),
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ddb65e9e6fd9..e1e997af89a0 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -385,7 +385,6 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 
 static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
-				       bool has_sched,
 				       struct request *rq)
 {
 	/*
@@ -402,9 +401,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 	if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
 		return true;
 
-	if (has_sched)
-		rq->rq_flags |= RQF_SORTED;
-
 	return false;
 }
 
@@ -418,7 +414,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 
 	WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
 
-	if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
+	if (blk_mq_sched_bypass_insert(hctx, rq)) {
 		/*
 		 * Firstly normal IO request is inserted to scheduler queue or
 		 * sw queue, meantime we add flush request to dispatch queue(
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c032cfe133c7..bc6bc8383b43 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -65,8 +65,6 @@ typedef void (rq_end_io_fn)(struct request *, blk_status_t);
  * request flags */
 typedef __u32 __bitwise req_flags_t;
 
-/* elevator knows about this request */
-#define RQF_SORTED		((__force req_flags_t)(1 << 0))
 /* drive already may have started this one */
 #define RQF_STARTED		((__force req_flags_t)(1 << 1))
 /* may not be passed by ioscheduler */
-- 
cgit v1.2.3


From 08c2a4bc9f2acaefbd0158866db5cb3238a68674 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 1 Mar 2021 18:31:24 -0600
Subject: ALSA: hda: move Intel SoundWire ACPI scan to dedicated module

The ACPI scan capabilities is called from the intel-dspconfig as well
as the SOF/HDaudio drivers. This creates dependencies and randconfig issues
when HDaudio and SOF/SoundWire are not all configured as modules.

To simplify Kconfig dependencies between HDAudio, SoundWire, SOF and
intel-dspconfig, move the ACPI scan helpers to a dedicated
module. This follows the same idea as NHLT helpers which are already
handled as a dedicated module.

The only functional change is that the kernel parameter to filter
links is now handled by a different module, but that was only provided
for developers needing work-arounds for early BIOS releases.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Reviewed-by: Guennadi Liakhovetski <guennadi.liakhovetski@linux.intel.com>
Reviewed-by: Bard Liao <bard.liao@intel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20210302003125.1178419-7-pierre-louis.bossart@linux.intel.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 drivers/soundwire/intel.h           |   2 -
 drivers/soundwire/intel_init.c      | 158 --------------------------------
 include/linux/soundwire/sdw_intel.h |   2 +
 sound/hda/Kconfig                   |   4 +
 sound/hda/Makefile                  |   3 +
 sound/hda/intel-dsp-config.c        |   2 +-
 sound/hda/intel-sdw-acpi.c          | 174 ++++++++++++++++++++++++++++++++++++
 sound/soc/sof/intel/Kconfig         |   1 +
 sound/soc/sof/intel/hda.c           |   1 +
 9 files changed, 186 insertions(+), 161 deletions(-)
 create mode 100644 sound/hda/intel-sdw-acpi.c

(limited to 'include/linux')

diff --git a/drivers/soundwire/intel.h b/drivers/soundwire/intel.h
index 76820d0b9deb..06bac8ba14e9 100644
--- a/drivers/soundwire/intel.h
+++ b/drivers/soundwire/intel.h
@@ -48,8 +48,6 @@ struct sdw_intel {
 #endif
 };
 
-#define SDW_INTEL_QUIRK_MASK_BUS_DISABLE      BIT(1)
-
 int intel_master_startup(struct platform_device *pdev);
 int intel_master_process_wakeen_event(struct platform_device *pdev);
 
diff --git a/drivers/soundwire/intel_init.c b/drivers/soundwire/intel_init.c
index bc8520eb385e..05b726cdfebc 100644
--- a/drivers/soundwire/intel_init.c
+++ b/drivers/soundwire/intel_init.c
@@ -18,42 +18,12 @@
 #include "cadence_master.h"
 #include "intel.h"
 
-#define SDW_LINK_TYPE		4 /* from Intel ACPI documentation */
-#define SDW_MAX_LINKS		4
 #define SDW_SHIM_LCAP		0x0
 #define SDW_SHIM_BASE		0x2C000
 #define SDW_ALH_BASE		0x2C800
 #define SDW_LINK_BASE		0x30000
 #define SDW_LINK_SIZE		0x10000
 
-static int ctrl_link_mask;
-module_param_named(sdw_link_mask, ctrl_link_mask, int, 0444);
-MODULE_PARM_DESC(sdw_link_mask, "Intel link mask (one bit per link)");
-
-static bool is_link_enabled(struct fwnode_handle *fw_node, int i)
-{
-	struct fwnode_handle *link;
-	char name[32];
-	u32 quirk_mask = 0;
-
-	/* Find master handle */
-	snprintf(name, sizeof(name),
-		 "mipi-sdw-link-%d-subproperties", i);
-
-	link = fwnode_get_named_child_node(fw_node, name);
-	if (!link)
-		return false;
-
-	fwnode_property_read_u32(link,
-				 "intel-quirk-mask",
-				 &quirk_mask);
-
-	if (quirk_mask & SDW_INTEL_QUIRK_MASK_BUS_DISABLE)
-		return false;
-
-	return true;
-}
-
 static int sdw_intel_cleanup(struct sdw_intel_ctx *ctx)
 {
 	struct sdw_intel_link_res *link = ctx->links;
@@ -81,74 +51,6 @@ static int sdw_intel_cleanup(struct sdw_intel_ctx *ctx)
 	return 0;
 }
 
-static int
-sdw_intel_scan_controller(struct sdw_intel_acpi_info *info)
-{
-	struct acpi_device *adev;
-	int ret, i;
-	u8 count;
-
-	if (acpi_bus_get_device(info->handle, &adev))
-		return -EINVAL;
-
-	/* Found controller, find links supported */
-	count = 0;
-	ret = fwnode_property_read_u8_array(acpi_fwnode_handle(adev),
-					    "mipi-sdw-master-count", &count, 1);
-
-	/*
-	 * In theory we could check the number of links supported in
-	 * hardware, but in that step we cannot assume SoundWire IP is
-	 * powered.
-	 *
-	 * In addition, if the BIOS doesn't even provide this
-	 * 'master-count' property then all the inits based on link
-	 * masks will fail as well.
-	 *
-	 * We will check the hardware capabilities in the startup() step
-	 */
-
-	if (ret) {
-		dev_err(&adev->dev,
-			"Failed to read mipi-sdw-master-count: %d\n", ret);
-		return -EINVAL;
-	}
-
-	/* Check count is within bounds */
-	if (count > SDW_MAX_LINKS) {
-		dev_err(&adev->dev, "Link count %d exceeds max %d\n",
-			count, SDW_MAX_LINKS);
-		return -EINVAL;
-	}
-
-	if (!count) {
-		dev_warn(&adev->dev, "No SoundWire links detected\n");
-		return -EINVAL;
-	}
-	dev_dbg(&adev->dev, "ACPI reports %d SDW Link devices\n", count);
-
-	info->count = count;
-	info->link_mask = 0;
-
-	for (i = 0; i < count; i++) {
-		if (ctrl_link_mask && !(ctrl_link_mask & BIT(i))) {
-			dev_dbg(&adev->dev,
-				"Link %d masked, will not be enabled\n", i);
-			continue;
-		}
-
-		if (!is_link_enabled(acpi_fwnode_handle(adev), i)) {
-			dev_dbg(&adev->dev,
-				"Link %d not selected in firmware\n", i);
-			continue;
-		}
-
-		info->link_mask |= BIT(i);
-	}
-
-	return 0;
-}
-
 #define HDA_DSP_REG_ADSPIC2             (0x10)
 #define HDA_DSP_REG_ADSPIS2             (0x14)
 #define HDA_DSP_REG_ADSPIC2_SNDW        BIT(5)
@@ -357,66 +259,6 @@ sdw_intel_startup_controller(struct sdw_intel_ctx *ctx)
 	return 0;
 }
 
-static acpi_status sdw_intel_acpi_cb(acpi_handle handle, u32 level,
-				     void *cdata, void **return_value)
-{
-	struct sdw_intel_acpi_info *info = cdata;
-	struct acpi_device *adev;
-	acpi_status status;
-	u64 adr;
-
-	status = acpi_evaluate_integer(handle, METHOD_NAME__ADR, NULL, &adr);
-	if (ACPI_FAILURE(status))
-		return AE_OK; /* keep going */
-
-	if (acpi_bus_get_device(handle, &adev)) {
-		pr_err("%s: Couldn't find ACPI handle\n", __func__);
-		return AE_NOT_FOUND;
-	}
-
-	info->handle = handle;
-
-	/*
-	 * On some Intel platforms, multiple children of the HDAS
-	 * device can be found, but only one of them is the SoundWire
-	 * controller. The SNDW device is always exposed with
-	 * Name(_ADR, 0x40000000), with bits 31..28 representing the
-	 * SoundWire link so filter accordingly
-	 */
-	if (FIELD_GET(GENMASK(31, 28), adr) != SDW_LINK_TYPE)
-		return AE_OK; /* keep going */
-
-	/* device found, stop namespace walk */
-	return AE_CTRL_TERMINATE;
-}
-
-/**
- * sdw_intel_acpi_scan() - SoundWire Intel init routine
- * @parent_handle: ACPI parent handle
- * @info: description of what firmware/DSDT tables expose
- *
- * This scans the namespace and queries firmware to figure out which
- * links to enable. A follow-up use of sdw_intel_probe() and
- * sdw_intel_startup() is required for creation of devices and bus
- * startup
- */
-int sdw_intel_acpi_scan(acpi_handle *parent_handle,
-			struct sdw_intel_acpi_info *info)
-{
-	acpi_status status;
-
-	info->handle = NULL;
-	status = acpi_walk_namespace(ACPI_TYPE_DEVICE,
-				     parent_handle, 1,
-				     sdw_intel_acpi_cb,
-				     NULL, info, NULL);
-	if (ACPI_FAILURE(status) || info->handle == NULL)
-		return -ENODEV;
-
-	return sdw_intel_scan_controller(info);
-}
-EXPORT_SYMBOL_NS(sdw_intel_acpi_scan, SOUNDWIRE_INTEL_INIT);
-
 /**
  * sdw_intel_probe() - SoundWire Intel probe routine
  * @res: resource data
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 120ffddc03d2..3a5446ac014a 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -187,4 +187,6 @@ void sdw_intel_enable_irq(void __iomem *mmio_base, bool enable);
 
 irqreturn_t sdw_intel_thread(int irq, void *dev_id);
 
+#define SDW_INTEL_QUIRK_MASK_BUS_DISABLE      BIT(1)
+
 #endif
diff --git a/sound/hda/Kconfig b/sound/hda/Kconfig
index 9ed5cfa3c18c..57595f1552c9 100644
--- a/sound/hda/Kconfig
+++ b/sound/hda/Kconfig
@@ -44,9 +44,13 @@ config SND_INTEL_NHLT
 config SND_INTEL_DSP_CONFIG
 	tristate
 	select SND_INTEL_NHLT if ACPI
+	select SND_INTEL_SOUNDWIRE_ACPI if ACPI
 	# this config should be selected only for Intel DSP platforms.
 	# A fallback is provided so that the code compiles in all cases.
 
+config SND_INTEL_SOUNDWIRE_ACPI
+	tristate
+
 config SND_INTEL_BYT_PREFER_SOF
 	bool "Prefer SOF driver over SST on BY/CHT platforms"
 	depends on SND_SST_ATOM_HIFI2_PLATFORM_ACPI && SND_SOC_SOF_BAYTRAIL
diff --git a/sound/hda/Makefile b/sound/hda/Makefile
index 601e617918b8..78f487a635f8 100644
--- a/sound/hda/Makefile
+++ b/sound/hda/Makefile
@@ -17,3 +17,6 @@ obj-$(CONFIG_SND_HDA_EXT_CORE) += ext/
 snd-intel-dspcfg-objs := intel-dsp-config.o
 snd-intel-dspcfg-$(CONFIG_SND_INTEL_NHLT) += intel-nhlt.o
 obj-$(CONFIG_SND_INTEL_DSP_CONFIG) += snd-intel-dspcfg.o
+
+snd-intel-sdw-acpi-objs := intel-sdw-acpi.o
+obj-$(CONFIG_SND_INTEL_SOUNDWIRE_ACPI) += snd-intel-sdw-acpi.o
diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c
index d1eb9d34993a..ab5ff7867eb9 100644
--- a/sound/hda/intel-dsp-config.c
+++ b/sound/hda/intel-dsp-config.c
@@ -557,4 +557,4 @@ EXPORT_SYMBOL_GPL(snd_intel_acpi_dsp_driver_probe);
 
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("Intel DSP config driver");
-MODULE_IMPORT_NS(SOUNDWIRE_INTEL_INIT);
+MODULE_IMPORT_NS(SND_INTEL_SOUNDWIRE_ACPI);
diff --git a/sound/hda/intel-sdw-acpi.c b/sound/hda/intel-sdw-acpi.c
new file mode 100644
index 000000000000..6359936a1503
--- /dev/null
+++ b/sound/hda/intel-sdw-acpi.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2015-2021 Intel Corporation.
+
+/*
+ * SDW Intel ACPI scan helpers
+ */
+
+#include <linux/acpi.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/soundwire/sdw_intel.h>
+#include <linux/string.h>
+
+#define SDW_LINK_TYPE		4 /* from Intel ACPI documentation */
+#define SDW_MAX_LINKS		4
+
+static int ctrl_link_mask;
+module_param_named(sdw_link_mask, ctrl_link_mask, int, 0444);
+MODULE_PARM_DESC(sdw_link_mask, "Intel link mask (one bit per link)");
+
+static bool is_link_enabled(struct fwnode_handle *fw_node, int i)
+{
+	struct fwnode_handle *link;
+	char name[32];
+	u32 quirk_mask = 0;
+
+	/* Find master handle */
+	snprintf(name, sizeof(name),
+		 "mipi-sdw-link-%d-subproperties", i);
+
+	link = fwnode_get_named_child_node(fw_node, name);
+	if (!link)
+		return false;
+
+	fwnode_property_read_u32(link,
+				 "intel-quirk-mask",
+				 &quirk_mask);
+
+	if (quirk_mask & SDW_INTEL_QUIRK_MASK_BUS_DISABLE)
+		return false;
+
+	return true;
+}
+
+static int
+sdw_intel_scan_controller(struct sdw_intel_acpi_info *info)
+{
+	struct acpi_device *adev;
+	int ret, i;
+	u8 count;
+
+	if (acpi_bus_get_device(info->handle, &adev))
+		return -EINVAL;
+
+	/* Found controller, find links supported */
+	count = 0;
+	ret = fwnode_property_read_u8_array(acpi_fwnode_handle(adev),
+					    "mipi-sdw-master-count", &count, 1);
+
+	/*
+	 * In theory we could check the number of links supported in
+	 * hardware, but in that step we cannot assume SoundWire IP is
+	 * powered.
+	 *
+	 * In addition, if the BIOS doesn't even provide this
+	 * 'master-count' property then all the inits based on link
+	 * masks will fail as well.
+	 *
+	 * We will check the hardware capabilities in the startup() step
+	 */
+
+	if (ret) {
+		dev_err(&adev->dev,
+			"Failed to read mipi-sdw-master-count: %d\n", ret);
+		return -EINVAL;
+	}
+
+	/* Check count is within bounds */
+	if (count > SDW_MAX_LINKS) {
+		dev_err(&adev->dev, "Link count %d exceeds max %d\n",
+			count, SDW_MAX_LINKS);
+		return -EINVAL;
+	}
+
+	if (!count) {
+		dev_warn(&adev->dev, "No SoundWire links detected\n");
+		return -EINVAL;
+	}
+	dev_dbg(&adev->dev, "ACPI reports %d SDW Link devices\n", count);
+
+	info->count = count;
+	info->link_mask = 0;
+
+	for (i = 0; i < count; i++) {
+		if (ctrl_link_mask && !(ctrl_link_mask & BIT(i))) {
+			dev_dbg(&adev->dev,
+				"Link %d masked, will not be enabled\n", i);
+			continue;
+		}
+
+		if (!is_link_enabled(acpi_fwnode_handle(adev), i)) {
+			dev_dbg(&adev->dev,
+				"Link %d not selected in firmware\n", i);
+			continue;
+		}
+
+		info->link_mask |= BIT(i);
+	}
+
+	return 0;
+}
+
+static acpi_status sdw_intel_acpi_cb(acpi_handle handle, u32 level,
+				     void *cdata, void **return_value)
+{
+	struct sdw_intel_acpi_info *info = cdata;
+	struct acpi_device *adev;
+	acpi_status status;
+	u64 adr;
+
+	status = acpi_evaluate_integer(handle, METHOD_NAME__ADR, NULL, &adr);
+	if (ACPI_FAILURE(status))
+		return AE_OK; /* keep going */
+
+	if (acpi_bus_get_device(handle, &adev)) {
+		pr_err("%s: Couldn't find ACPI handle\n", __func__);
+		return AE_NOT_FOUND;
+	}
+
+	info->handle = handle;
+
+	/*
+	 * On some Intel platforms, multiple children of the HDAS
+	 * device can be found, but only one of them is the SoundWire
+	 * controller. The SNDW device is always exposed with
+	 * Name(_ADR, 0x40000000), with bits 31..28 representing the
+	 * SoundWire link so filter accordingly
+	 */
+	if (FIELD_GET(GENMASK(31, 28), adr) != SDW_LINK_TYPE)
+		return AE_OK; /* keep going */
+
+	/* device found, stop namespace walk */
+	return AE_CTRL_TERMINATE;
+}
+
+/**
+ * sdw_intel_acpi_scan() - SoundWire Intel init routine
+ * @parent_handle: ACPI parent handle
+ * @info: description of what firmware/DSDT tables expose
+ *
+ * This scans the namespace and queries firmware to figure out which
+ * links to enable. A follow-up use of sdw_intel_probe() and
+ * sdw_intel_startup() is required for creation of devices and bus
+ * startup
+ */
+int sdw_intel_acpi_scan(acpi_handle *parent_handle,
+			struct sdw_intel_acpi_info *info)
+{
+	acpi_status status;
+
+	info->handle = NULL;
+	status = acpi_walk_namespace(ACPI_TYPE_DEVICE,
+				     parent_handle, 1,
+				     sdw_intel_acpi_cb,
+				     NULL, info, NULL);
+	if (ACPI_FAILURE(status) || info->handle == NULL)
+		return -ENODEV;
+
+	return sdw_intel_scan_controller(info);
+}
+EXPORT_SYMBOL_NS(sdw_intel_acpi_scan, SND_INTEL_SOUNDWIRE_ACPI);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Intel Soundwire ACPI helpers");
diff --git a/sound/soc/sof/intel/Kconfig b/sound/soc/sof/intel/Kconfig
index 21e24a3c64fb..da1c396f529d 100644
--- a/sound/soc/sof/intel/Kconfig
+++ b/sound/soc/sof/intel/Kconfig
@@ -286,6 +286,7 @@ config SND_SOC_SOF_INTEL_SOUNDWIRE
 	depends on ACPI && SOUNDWIRE
 	depends on !(SOUNDWIRE=m && SND_SOC_SOF_INTEL_SOUNDWIRE_LINK_BASELINE=y)
 	select SOUNDWIRE_INTEL
+	select SND_INTEL_SOUNDWIRE_ACPI
 	help
 	  This adds support for SoundWire with Sound Open Firmware
 	  for Intel(R) platforms.
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index 995a8c427177..1d29b1fd6a94 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -1279,4 +1279,5 @@ MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV);
 MODULE_IMPORT_NS(SND_SOC_SOF_HDA_AUDIO_CODEC);
 MODULE_IMPORT_NS(SND_SOC_SOF_HDA_AUDIO_CODEC_I915);
 MODULE_IMPORT_NS(SND_SOC_SOF_XTENSA);
+MODULE_IMPORT_NS(SND_INTEL_SOUNDWIRE_ACPI);
 MODULE_IMPORT_NS(SOUNDWIRE_INTEL_INIT);
-- 
cgit v1.2.3


From caf6912f3f4af7232340d500a4a2008f81b93f14 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 2 Mar 2021 14:53:21 -0700
Subject: swap: fix swapfile read/write offset

We're not factoring in the start of the file for where to write and
read the swapfile, which leads to very unfortunate side effects of
writing where we should not be...

Fixes: 48d15436fde6 ("mm: remove get_swap_bio")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/swap.h |  1 +
 mm/page_io.c         |  5 -----
 mm/swapfile.c        | 13 +++++++++++++
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 32f665b1ee85..4cc6ec3bf0ab 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -485,6 +485,7 @@ struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
 extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
+sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
diff --git a/mm/page_io.c b/mm/page_io.c
index 485fa5cca4a2..c493ce9ebcf5 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -254,11 +254,6 @@ out:
 	return ret;
 }
 
-static sector_t swap_page_sector(struct page *page)
-{
-	return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
-}
-
 static inline void count_swpout_vm_event(struct page *page)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f039745989d2..084a5b9a18e5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -219,6 +219,19 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
 	BUG();
 }
 
+sector_t swap_page_sector(struct page *page)
+{
+	struct swap_info_struct *sis = page_swap_info(page);
+	struct swap_extent *se;
+	sector_t sector;
+	pgoff_t offset;
+
+	offset = __page_file_index(page);
+	se = offset_to_swap_extent(sis, offset);
+	sector = se->start_block + (offset - se->start_page);
+	return sector << (PAGE_SHIFT - 9);
+}
+
 /*
  * swap allocation tell device that a cluster of swap can now be discarded,
  * to allow the swap device to optimize its wear-levelling.
-- 
cgit v1.2.3


From ff70784ab9f89e78e67d5d172bf7644de673f61f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 2 Mar 2021 15:35:48 +0200
Subject: ACPI: bus: Constify is_acpi_node() and friends (part 2)

Commit 8b9d6802583a ("ACPI: Constify acpi_bus helper functions,
switch to macros") only changed functions for CONFIG_ACPI=y case.
This part adjusts the rest.

Fixes: 8b9d6802583a ("ACPI: Constify acpi_bus helper functions, switch to macros")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 3c5757d539ab..9f432411e988 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -746,12 +746,12 @@ acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv)
 
 static inline void acpi_dev_put(struct acpi_device *adev) {}
 
-static inline bool is_acpi_node(struct fwnode_handle *fwnode)
+static inline bool is_acpi_node(const struct fwnode_handle *fwnode)
 {
 	return false;
 }
 
-static inline bool is_acpi_device_node(struct fwnode_handle *fwnode)
+static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode)
 {
 	return false;
 }
@@ -761,7 +761,7 @@ static inline struct acpi_device *to_acpi_device_node(struct fwnode_handle *fwno
 	return NULL;
 }
 
-static inline bool is_acpi_data_node(struct fwnode_handle *fwnode)
+static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode)
 {
 	return false;
 }
-- 
cgit v1.2.3


From 8452d4a674b0e59bd53baef0b30b018690dde594 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 27 Feb 2021 11:16:46 +0000
Subject: io_uring: destroy io-wq on exec

Destroy current's io-wq backend and tctx on __io_uring_task_cancel(),
aka exec(). Looks it's not strictly necessary, because it will be done
at some point when the task dies and changes of creds/files/etc. are
handled, but better to do that earlier to free io-wq and not potentially
lock previous mm and other resources for the time being.

It's safe to do because we wait for all requests of the current task to
complete, so no request will use tctx afterwards. Note, that
io_uring_files_cancel() may leave some requests for later reaping, so it
leaves tctx intact, that's ok as the task is dying anyway.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c            | 19 ++++++++++---------
 include/linux/io_uring.h |  2 +-
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 796b6d1f72f9..73d1bd8db1bb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8835,13 +8835,17 @@ static void io_uring_del_task_file(struct file *file)
 		fput(file);
 }
 
-static void io_uring_remove_task_files(struct io_uring_task *tctx)
+static void io_uring_clean_tctx(struct io_uring_task *tctx)
 {
 	struct file *file;
 	unsigned long index;
 
 	xa_for_each(&tctx->xa, index, file)
 		io_uring_del_task_file(file);
+	if (tctx->io_wq) {
+		io_wq_put_and_exit(tctx->io_wq);
+		tctx->io_wq = NULL;
+	}
 }
 
 void __io_uring_files_cancel(struct files_struct *files)
@@ -8856,13 +8860,8 @@ void __io_uring_files_cancel(struct files_struct *files)
 		io_uring_cancel_task_requests(file->private_data, files);
 	atomic_dec(&tctx->in_idle);
 
-	if (files) {
-		io_uring_remove_task_files(tctx);
-		if (tctx->io_wq) {
-			io_wq_put_and_exit(tctx->io_wq);
-			tctx->io_wq = NULL;
-		}
-	}
+	if (files)
+		io_uring_clean_tctx(tctx);
 }
 
 static s64 tctx_inflight(struct io_uring_task *tctx)
@@ -8954,7 +8953,9 @@ void __io_uring_task_cancel(void)
 
 	atomic_dec(&tctx->in_idle);
 
-	io_uring_remove_task_files(tctx);
+	io_uring_clean_tctx(tctx);
+	/* all current's requests should be gone, we can kill tctx */
+	__io_uring_free(current);
 }
 
 static int io_uring_flush(struct file *file, void *data)
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 51ede771cd99..7cb7bd0e334c 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -38,7 +38,7 @@ void __io_uring_free(struct task_struct *tsk);
 
 static inline void io_uring_task_cancel(void)
 {
-	if (current->io_uring && !xa_empty(&current->io_uring->xa))
+	if (current->io_uring)
 		__io_uring_task_cancel();
 }
 static inline void io_uring_files_cancel(struct files_struct *files)
-- 
cgit v1.2.3


From f9f344479d8b40b3b001c913fb992d85d19261d0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 26 Feb 2021 14:09:15 -0500
Subject: tracing: Fix comment about the trace_event_call flags

In the declaration of the struct trace_event_call, the flags has the bits
defined in the comment above it. But these bits are also defined by the
TRACE_EVENT_FL_* enums just above the declaration of the struct. As the
comment about the flags in the struct has become stale and incorrect, just
replace it with a reference to the TRACE_EVENT_FL_* enum above.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 7077fec653bb..28e7af1406f2 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -349,15 +349,8 @@ struct trace_event_call {
 	struct event_filter	*filter;
 	void			*mod;
 	void			*data;
-	/*
-	 *   bit 0:		filter_active
-	 *   bit 1:		allow trace by non root (cap any)
-	 *   bit 2:		failed to apply filter
-	 *   bit 3:		trace internal event (do not enable)
-	 *   bit 4:		Event was enabled by module
-	 *   bit 5:		use call filter rather than file filter
-	 *   bit 6:		Event is a tracepoint
-	 */
+
+	/* See the TRACE_EVENT_FL_* flags above */
 	int			flags; /* static flags of different events */
 
 #ifdef CONFIG_PERF_EVENTS
-- 
cgit v1.2.3


From cc440e8738e5c875297ac0e90316745093be7e28 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 4 Mar 2021 12:21:05 -0700
Subject: kernel: provide create_io_thread() helper

Provide a generic helper for setting up an io_uring worker. Returns a
task_struct so that the caller can do whatever setup is needed, then call
wake_up_new_task() to kick it into gear.

Add a kernel_clone_args member, io_thread, which tells copy_process() to
mark the task with PF_IO_WORKER.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sched/task.h |  2 ++
 kernel/fork.c              | 30 ++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index c0f71f2e7160..ef02be869cf2 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -31,6 +31,7 @@ struct kernel_clone_args {
 	/* Number of elements in *set_tid */
 	size_t set_tid_size;
 	int cgroup;
+	int io_thread;
 	struct cgroup *cgrp;
 	struct css_set *cset;
 };
@@ -82,6 +83,7 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern pid_t kernel_clone(struct kernel_clone_args *kargs);
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/kernel/fork.c b/kernel/fork.c
index d66cd1014211..d3171e8e88e5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1940,6 +1940,8 @@ static __latent_entropy struct task_struct *copy_process(
 	p = dup_task_struct(current, node);
 	if (!p)
 		goto fork_out;
+	if (args->io_thread)
+		p->flags |= PF_IO_WORKER;
 
 	/*
 	 * This _must_ happen before we call free_task(), i.e. before we jump
@@ -2410,6 +2412,34 @@ struct mm_struct *copy_init_mm(void)
 	return dup_mm(NULL, &init_mm);
 }
 
+/*
+ * This is like kernel_clone(), but shaved down and tailored to just
+ * creating io_uring workers. It returns a created task, or an error pointer.
+ * The returned task is inactive, and the caller must fire it up through
+ * wake_up_new_task(p). All signals are blocked in the created task.
+ */
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
+{
+	unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+				CLONE_IO;
+	struct kernel_clone_args args = {
+		.flags		= ((lower_32_bits(flags) | CLONE_VM |
+				    CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
+		.stack		= (unsigned long)fn,
+		.stack_size	= (unsigned long)arg,
+		.io_thread	= 1,
+	};
+	struct task_struct *tsk;
+
+	tsk = copy_process(NULL, 0, node, &args);
+	if (!IS_ERR(tsk)) {
+		sigfillset(&tsk->blocked);
+		sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
+	}
+	return tsk;
+}
+
 /*
  *  Ok, this is the main fork-routine.
  *
-- 
cgit v1.2.3


From 69dd4503a7e6bae3389b8e028e5768008be8f2d7 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 16 Feb 2021 15:36:07 +0100
Subject: irqdomain: Remove debugfs_file from struct irq_domain

There's no need to keep around a dentry pointer to a simple file that
debugfs itself can look up when we need to remove it from the system.
So simplify the code by deleting the variable and cleaning up the logic
around the debugfs file.

Cc: Marc Zyngier <maz@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/YCvYV53ZdzQSWY6w@kroah.com
---
 include/linux/irqdomain.h | 4 ----
 kernel/irq/irqdomain.c    | 9 ++++-----
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 42d196805f58..33cacc8af26d 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -150,7 +150,6 @@ struct irq_domain_chip_generic;
  *      setting up one or more generic chips for interrupt controllers
  *      drivers using the generic chip library which uses this pointer.
  * @parent: Pointer to parent irq_domain to support hierarchy irq_domains
- * @debugfs_file: dentry for the domain debugfs file
  *
  * Revmap data, used internally by irq_domain
  * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that
@@ -174,9 +173,6 @@ struct irq_domain {
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 	struct irq_domain *parent;
 #endif
-#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
-	struct dentry		*debugfs_file;
-#endif
 
 	/* reverse map data. The linear map gets appended to the irq_domain */
 	irq_hw_number_t hwirq_max;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 288151393a06..d10ab1d689d5 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1898,16 +1898,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
 
 static void debugfs_add_domain_dir(struct irq_domain *d)
 {
-	if (!d->name || !domain_dir || d->debugfs_file)
+	if (!d->name || !domain_dir)
 		return;
-	d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
-					      &irq_domain_debug_fops);
+	debugfs_create_file(d->name, 0444, domain_dir, d,
+			    &irq_domain_debug_fops);
 }
 
 static void debugfs_remove_domain_dir(struct irq_domain *d)
 {
-	debugfs_remove(d->debugfs_file);
-	d->debugfs_file = NULL;
+	debugfs_remove(debugfs_lookup(d->name, domain_dir));
 }
 
 void __init irq_domain_debugfs_init(struct dentry *root)
-- 
cgit v1.2.3